Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- .ipynb_checkpoints/transformer-checkpoint.ipynb +489 -0
- data.txt +0 -0
- harry_potter_transformer.keras +3 -0
- transformer.ipynb +489 -0
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
36 |
harry_potter_transformer.keras filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1,489 @@
1 |
2 |
"cells": [
3 |
4 |
"cell_type": "code",
5 |
"execution_count": 2,
6 |
"id": "7c710f0a-59f2-445c-9464-d702fe44fe7a",
7 |
"metadata": {},
8 |
"outputs": [
9 |
10 |
"name": "stdout",
11 |
"output_type": "stream",
12 |
"text": [
13 |
"Num GPUs Available: 1\n"
14 |
15 |
16 |
17 |
"source": [
18 |
"import tensorflow as tf\n",
19 |
"print(\"Num GPUs Available:\", len(tf.config.list_physical_devices('GPU')))"
20 |
21 |
22 |
23 |
"cell_type": "code",
24 |
"execution_count": 3,
25 |
"id": "33d41ac0-0a70-4b7f-9c00-5b1bcbcd1c9d",
26 |
"metadata": {},
27 |
"outputs": [],
28 |
"source": [
29 |
"import numpy as np\n",
30 |
"import tensorflow as tf\n",
31 |
"from tensorflow.keras.preprocessing.text import Tokenizer"
32 |
33 |
34 |
35 |
"cell_type": "code",
36 |
"execution_count": 4,
37 |
"id": "2e794897-5d68-44e5-bc1a-111a6232ce26",
38 |
"metadata": {},
39 |
"outputs": [
40 |
41 |
"name": "stdout",
42 |
"output_type": "stream",
43 |
"text": [
44 |
45 |
46 |
47 |
48 |
"source": [
49 |
"import sys\n",
50 |
51 |
52 |
53 |
54 |
"cell_type": "code",
55 |
"execution_count": 5,
56 |
"id": "8c8b6b39-3b6a-4e85-b446-2c5acacbd3e0",
57 |
"metadata": {},
58 |
"outputs": [
59 |
60 |
"ename": "FileNotFoundError",
61 |
"evalue": "[Errno 2] No such file or directory: '1.txt'",
62 |
"output_type": "error",
63 |
"traceback": [
64 |
65 |
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
66 |
"Cell \u001b[0;32mIn[5], line 6\u001b[0m\n\u001b[1;32m 3\u001b[0m data \u001b[38;5;241m=\u001b[39m f\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m data\n\u001b[0;32m----> 6\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mload_data\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m1.txt\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mlower()\n",
67 |
"Cell \u001b[0;32mIn[5], line 2\u001b[0m, in \u001b[0;36mload_data\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mload_data\u001b[39m(file_path):\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 3\u001b[0m data \u001b[38;5;241m=\u001b[39m f\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m data\n",
68 |
"File \u001b[0;32m/opt/miniconda3/envs/tf-metal2/lib/python3.9/site-packages/IPython/core/interactiveshell.py:310\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 305\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 307\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 308\u001b[0m )\n\u001b[0;32m--> 310\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
69 |
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '1.txt'"
70 |
71 |
72 |
73 |
"source": [
74 |
"def load_data(file_path):\n",
75 |
" with open(file_path, 'r') as f:\n",
76 |
" data = f.read()\n",
77 |
" return data\n",
78 |
79 |
"data = load_data('data.txt').lower()"
80 |
81 |
82 |
83 |
"cell_type": "code",
84 |
"execution_count": null,
85 |
"id": "573b0963-aa70-44de-86ab-33ba19d5148a",
86 |
"metadata": {},
87 |
"outputs": [],
88 |
"source": [
89 |
"tokenizer = Tokenizer(oov_token='<OOV>')\n",
90 |
91 |
"total_words_in_dict = len(tokenizer.word_index) + 1\n",
92 |
93 |
94 |
95 |
96 |
"cell_type": "code",
97 |
"execution_count": null,
98 |
"id": "cda10ef1-d1c2-4025-b66f-7d2325526df9",
99 |
"metadata": {},
100 |
"outputs": [],
101 |
"source": [
102 |
"tokenizer.word_index['<OOV>'], tokenizer.word_index['harry']"
103 |
104 |
105 |
106 |
"cell_type": "code",
107 |
"execution_count": null,
108 |
"id": "8d52769c-58d8-4ea2-a4e0-9664d5a2da9d",
109 |
"metadata": {},
110 |
"outputs": [],
111 |
"source": [
112 |
"# tokens basically is the entire text from first to last converted into their\n",
113 |
"# index representation\n",
114 |
"tokens = tokenizer.texts_to_sequences([data])[0]"
115 |
116 |
117 |
118 |
"cell_type": "code",
119 |
"execution_count": null,
120 |
"id": "03976234-376f-4b24-bab0-a7040c6760a3",
121 |
"metadata": {},
122 |
"outputs": [],
123 |
"source": [
124 |
"# this creates lists of length 51 (seq_len + 1)\n",
125 |
"# 1-51, 2-52, 3-53, etc.\n",
126 |
"# 51 so that the last value is used as y\n",
127 |
"seq_length = 50\n",
128 |
"input_sequences = []\n",
129 |
"for i in range(seq_length, len(tokens)):\n",
130 |
" input_sequences.append(tokens[i - seq_length: i + 1])"
131 |
132 |
133 |
134 |
"cell_type": "code",
135 |
"execution_count": null,
136 |
"id": "e49c6da4-64c0-4bc7-9526-6f3df699002a",
137 |
"metadata": {},
138 |
"outputs": [],
139 |
"source": [
140 |
"# this ensures all the lists are of same length\n",
141 |
"# here as well we need seq_len + 1 as the previous block\n",
142 |
"from tensorflow.keras.utils import pad_sequences\n",
143 |
144 |
"final_input = np.array(pad_sequences(input_sequences, maxlen=seq_length + 1, padding='pre'))\n",
145 |
146 |
147 |
148 |
149 |
"cell_type": "code",
150 |
"execution_count": null,
151 |
"id": "83639aac-6ad1-4494-ac0c-b54a59e39025",
152 |
"metadata": {},
153 |
"outputs": [],
154 |
"source": [
155 |
"# create x and y, last value of each list is the prediction\n",
156 |
"# imagine sliding window\n",
157 |
"X, y = final_input[:, :-1], final_input[:, -1]\n",
158 |
"print('X : ', X[0], 'Y: ', y[0])"
159 |
160 |
161 |
162 |
"cell_type": "code",
163 |
"execution_count": null,
164 |
"id": "70d67b34-401d-425a-bc37-3b58863ccc4c",
165 |
"metadata": {},
166 |
"outputs": [],
167 |
"source": [
168 |
"# if you print y, it will be integer values like 46, 274, etc.\n",
169 |
"# we need categorical, also it can belong to any word from the entire\n",
170 |
"# dict, so the model outputs a probability per word and is scored with cross-entropy\n",
171 |
"y = tf.keras.utils.to_categorical(y, num_classes=total_words_in_dict)\n",
172 |
"y[0], y.shape"
173 |
174 |
175 |
176 |
"cell_type": "code",
177 |
"execution_count": null,
178 |
"id": "a8678c88-b1fa-4d0b-9be6-e3fb27413d17",
179 |
"metadata": {},
180 |
"outputs": [],
181 |
"source": [
182 |
"# the shape will be number of lists x seq_len\n",
183 |
"X.shape, y.shape"
184 |
185 |
186 |
187 |
"cell_type": "code",
188 |
"execution_count": null,
189 |
"id": "3b03bba6-6819-4282-b2e2-5d5219905eda",
190 |
"metadata": {},
191 |
"outputs": [],
192 |
"source": [
193 |
"from tensorflow.keras.layers import Layer, Dense, LayerNormalization, Dropout, Embedding\n",
194 |
195 |
"class MultiHeadAttention(Layer):\n",
196 |
" def __init__(self, seq_length, num_heads, embed_dim):\n",
197 |
" super(MultiHeadAttention, self).__init__()\n",
198 |
199 |
" self.seq_length = seq_length\n",
200 |
" self.num_heads = num_heads\n",
201 |
" self.embed_dim = embed_dim\n",
202 |
203 |
" self.projection_dim = embed_dim // num_heads\n",
204 |
205 |
" self.query = Dense(embed_dim)\n",
206 |
" self.key = Dense(embed_dim)\n",
207 |
" self.value = Dense(embed_dim)\n",
208 |
209 |
" # need this to learn the interaction between the features learnt by all\n",
210 |
" # the different heads\n",
211 |
" self.combine_heads_layer = Dense(embed_dim)\n",
212 |
213 |
" def split_heads(self, input):\n",
214 |
" batch_size = tf.shape(input)[0]\n",
215 |
" x = tf.reshape(input, (batch_size, -1, self.num_heads, self.projection_dim))\n",
216 |
" return tf.transpose(x, perm=[0, 2, 1, 3])\n",
217 |
218 |
" def self_attention(self, query, key, value):\n",
219 |
" score = tf.matmul(query, key, transpose_b=True)\n",
220 |
" scaled_score = score / tf.math.sqrt(tf.cast(self.projection_dim, tf.float32))\n",
221 |
" weights = tf.nn.softmax(scaled_score, axis=-1) # row wise in QKt\n",
222 |
223 |
" return tf.matmul(weights, value), weights\n",
224 |
225 |
226 |
" def call(self, x):\n",
227 |
" batch_size = tf.shape(x)[0]\n",
228 |
229 |
" # finds the weights matrix then split across heads\n",
230 |
" # it is more efficient computationally if we find the weight matrix\n",
231 |
" # across all the heads first then split to find individual attention scores\n",
232 |
" query = self.split_heads(self.query(x))\n",
233 |
" key = self.split_heads(self.key(x))\n",
234 |
" value = self.split_heads(self.value(x))\n",
235 |
236 |
" attention, _ = self.self_attention(query, key, value)\n",
237 |
" # attention is of size [batch_size, num_heads, seq_length, proj_dim]\n",
238 |
239 |
" attention = tf.transpose(attention, perm=[0, 2, 1, 3])\n",
240 |
" # attention is of size [batch_size, seq_length, num_heads, proj_dim]\n",
241 |
242 |
" concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))\n",
243 |
244 |
" return self.combine_heads_layer(concat_attention)\n",
245 |
246 |
247 |
248 |
"class TransformerBlock(Layer):\n",
249 |
" def __init__(self, seq_length, embed_dim, ffn_dim):\n",
250 |
" super(TransformerBlock, self).__init__()\n",
251 |
252 |
" self.seq_length = seq_length\n",
253 |
" self.embed_dim = embed_dim\n",
254 |
" self.ffn = tf.keras.Sequential([\n",
255 |
" Dense(ffn_dim, activation='relu'),\n",
256 |
" Dense(embed_dim)\n",
257 |
" ])\n",
258 |
259 |
" self.attn = MultiHeadAttention(seq_length, 8, embed_dim)\n",
260 |
261 |
" self.LayerNorm1 = LayerNormalization(epsilon=1e-6) # prevent divide by 0\n",
262 |
" self.LayerNorm2 = LayerNormalization(epsilon=1e-6)\n",
263 |
264 |
" self.Drop1 = Dropout(0.1)\n",
265 |
" self.Drop2 = Dropout(0.1)\n",
266 |
267 |
268 |
" def call(self, x, isTraining):\n",
269 |
" attention_output = self.attn(x)\n",
270 |
" print(attention_output.shape)\n",
271 |
" x = self.LayerNorm1(x + self.Drop1(attention_output, training=isTraining))\n",
272 |
" ffn_output = self.ffn(x)\n",
273 |
" x = self.LayerNorm2(x + self.Drop2(ffn_output, training=isTraining))\n",
274 |
" return x\n",
275 |
276 |
"class TokenAndPositionEmbedding(Layer):\n",
277 |
" def __init__(self, seq_length, total_words_in_dict, embed_dim):\n",
278 |
" super(TokenAndPositionEmbedding, self).__init__()\n",
279 |
280 |
" self.seq_length = seq_length\n",
281 |
" self.emb = Embedding(input_dim=total_words_in_dict, output_dim=embed_dim)\n",
282 |
" self.pos_emb = Embedding(input_dim=seq_length, output_dim=embed_dim)\n",
283 |
284 |
" def call(self, x):\n",
285 |
" positions = tf.range(start=0, limit=self.seq_length, delta=1)\n",
286 |
" positions = self.pos_emb(positions)\n",
287 |
" x = self.emb(x)\n",
288 |
" return x + positions"
289 |
290 |
291 |
292 |
"cell_type": "code",
293 |
"execution_count": null,
294 |
"id": "19e33d70-6984-44c2-bfe2-f525444bdf01",
295 |
"metadata": {},
296 |
"outputs": [],
297 |
"source": [
298 |
"ff_dim = 512\n",
299 |
"embed_dim = 256\n",
300 |
301 |
" # This is a placeholder in functional api style\n",
302 |
" # batch_size is taken during .fit() phase\n",
303 |
"input_placeholder = tf.keras.Input(shape=(seq_length,))\n",
304 |
305 |
"tokenPosLayer = TokenAndPositionEmbedding(seq_length, total_words_in_dict, embed_dim)\n",
306 |
"x = tokenPosLayer(input_placeholder) # call isn't run yet, just a link created\n",
307 |
308 |
"transformerBlock = TransformerBlock(seq_length, embed_dim, ff_dim)\n",
309 |
310 |
311 |
"# x contains contextualized data, now the last row of the seq_len holds\n",
312 |
"# the latest context, hence it is extracted\n",
313 |
"x = x[:, -1, :]\n",
314 |
"print(x.shape) # (batch_size, embed_dim): only the last position is kept\n",
315 |
316 |
"# we pass this context to a dense layer to learn how to make predictions\n",
317 |
"x = Dense(total_words_in_dict, activation='softmax')(x)\n",
318 |
"# batch_size, total_words (prediction)\n",
319 |
"# prediction happens batch wise in parallel and is compared to y\n",
320 |
"# batch wise in parallel\n",
321 |
322 |
323 |
324 |
"model = tf.keras.Model(inputs=input_placeholder, outputs=x)\n",
325 |
326 |
327 |
328 |
329 |
"cell_type": "code",
330 |
"execution_count": null,
331 |
"id": "1f60878b-6e12-4dcd-ab89-03a64e7a3367",
332 |
"metadata": {},
333 |
"outputs": [],
334 |
"source": [
335 |
"model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])"
336 |
337 |
338 |
339 |
"cell_type": "code",
340 |
"execution_count": null,
341 |
"id": "5e688417-13f5-41c7-b98e-c4b4330ef363",
342 |
"metadata": {},
343 |
"outputs": [],
344 |
"source": [
345 |
346 |
"import time\n",
347 |
348 |
349 |
"# CPU Benchmark\n",
350 |
"with tf.device('/CPU:0'):\n",
351 |
" start = time.time()\n",
352 |
" model.fit(X, y, batch_size=32, epochs=10)\n",
353 |
" print(\"CPU Time:\", time.time() - start)\n",
354 |
355 |
356 |
357 |
358 |
359 |
"cell_type": "code",
360 |
"execution_count": null,
361 |
"id": "7fb4552b-86cc-461c-8a9a-572f5bfd869b",
362 |
"metadata": {},
363 |
"outputs": [],
364 |
"source": [
365 |
"# # GPU Benchmark\n",
366 |
"# with tf.device('/GPU:0'):\n",
367 |
"# start = time.time()\n",
368 |
"# model.fit(X, y, batch_size=1024, epochs=10)\n",
369 |
"# print(\"GPU Time:\", time.time() - start)"
370 |
371 |
372 |
373 |
"cell_type": "code",
374 |
"execution_count": null,
375 |
"id": "7659d823-faf4-4908-9a0c-bd18b076c240",
376 |
"metadata": {},
377 |
"outputs": [],
378 |
"source": [
379 |
"def predict_next_word(seed_text, num_words_to_predict, max_len):\n",
380 |
" for _ in range(num_words_to_predict):\n",
381 |
" seed_list = tokenizer.texts_to_sequences([seed_text])[0]\n",
382 |
" seed_list = pad_sequences([seed_list], maxlen=max_len - 1, padding='pre')\n",
383 |
" prediction = model.predict(seed_list, verbose=0)\n",
384 |
" # prediction has shape (1, total_words_in_dict): one probability per vocabulary word\n",
385 |
" max_pred_index = np.argmax(prediction)\n",
386 |
" seed_text+= \" \" + tokenizer.index_word[max_pred_index]\n",
387 |
388 |
" return seed_text"
389 |
390 |
391 |
392 |
"cell_type": "code",
393 |
"execution_count": null,
394 |
"id": "2dbbd786-5318-4172-9542-e56658ef79ba",
395 |
"metadata": {},
396 |
"outputs": [],
397 |
"source": [
398 |
"predict_next_word(\"who is harry is a \", 25, seq_length + 1)"
399 |
400 |
401 |
402 |
"cell_type": "code",
403 |
"execution_count": null,
404 |
"id": "6751aa4b-2d22-47a2-9f17-f557f78c6f45",
405 |
"metadata": {},
406 |
"outputs": [],
407 |
"source": [
408 |
"!pip install huggingface_hub"
409 |
410 |
411 |
412 |
"cell_type": "code",
413 |
"execution_count": null,
414 |
"id": "99e980a6-8686-4dd8-b26c-25a5542451b5",
415 |
"metadata": {},
416 |
"outputs": [],
417 |
"source": [
418 |
419 |
420 |
421 |
422 |
"cell_type": "code",
423 |
"execution_count": null,
424 |
"id": "d766fad6-e4be-4b97-9617-53a03661cb41",
425 |
"metadata": {
426 |
"scrolled": true
427 |
428 |
"outputs": [],
429 |
"source": [
430 |
"from huggingface_hub import notebook_login\n",
431 |
432 |
433 |
434 |
435 |
436 |
"cell_type": "code",
437 |
"execution_count": null,
438 |
"id": "9171eefc-9952-42c6-8b00-9e7f9f6f6f58",
439 |
"metadata": {},
440 |
"outputs": [],
441 |
"source": [
442 |
"from huggingface_hub import HfApi\n",
443 |
444 |
"repo_id = \"ramanhyd99/harry-potter-transformer\"\n",
445 |
"api = HfApi()\n",
446 |
"api.create_repo(repo_id=repo_id, exist_ok=True)\n"
447 |
448 |
449 |
450 |
"cell_type": "code",
451 |
"execution_count": null,
452 |
"id": "bab81223-1667-463c-9075-9ab00958b22c",
453 |
"metadata": {},
454 |
"outputs": [],
455 |
"source": [
456 |
"# Push the model to the Hugging Face Hub\n",
457 |
"from huggingface_hub import upload_folder\n",
458 |
459 |
460 |
" folder_path=\"\",\n",
461 |
" path_in_repo=\".\",\n",
462 |
" repo_id=repo_id,\n",
463 |
" repo_type=\"model\"\n",
464 |
465 |
466 |
467 |
468 |
"metadata": {
469 |
"kernelspec": {
470 |
"display_name": "Python (tf-metal2)",
471 |
"language": "python",
472 |
"name": "tf-metal2"
473 |
474 |
"language_info": {
475 |
"codemirror_mode": {
476 |
"name": "ipython",
477 |
"version": 3
478 |
479 |
"file_extension": ".py",
480 |
"mimetype": "text/x-python",
481 |
"name": "python",
482 |
"nbconvert_exporter": "python",
483 |
"pygments_lexer": "ipython3",
484 |
"version": "3.9.21"
485 |
486 |
487 |
"nbformat": 4,
488 |
"nbformat_minor": 5
489 |
The diff for this file is too large to render.
See raw diff
@@ -0,0 +1,3 @@
1 |
version https://git-lfs.github.com/spec/v1
2 |
oid sha256:653c6ee2695012b436f255ea782a2feb81483cef09ad86bbf616dcfbd3d9ae2f
3 |
size 41198299
@@ -0,0 +1,489 @@
1 |
2 |
"cells": [
3 |
4 |
"cell_type": "code",
5 |
"execution_count": 2,
6 |
"id": "7c710f0a-59f2-445c-9464-d702fe44fe7a",
7 |
"metadata": {},
8 |
"outputs": [
9 |
10 |
"name": "stdout",
11 |
"output_type": "stream",
12 |
"text": [
13 |
"Num GPUs Available: 1\n"
14 |
15 |
16 |
17 |
"source": [
18 |
"import tensorflow as tf\n",
19 |
"print(\"Num GPUs Available:\", len(tf.config.list_physical_devices('GPU')))"
20 |
21 |
22 |
23 |
"cell_type": "code",
24 |
"execution_count": 3,
25 |
"id": "33d41ac0-0a70-4b7f-9c00-5b1bcbcd1c9d",
26 |
"metadata": {},
27 |
"outputs": [],
28 |
"source": [
29 |
"import numpy as np\n",
30 |
"import tensorflow as tf\n",
31 |
"from tensorflow.keras.preprocessing.text import Tokenizer"
32 |
33 |
34 |
35 |
"cell_type": "code",
36 |
"execution_count": 4,
37 |
"id": "2e794897-5d68-44e5-bc1a-111a6232ce26",
38 |
"metadata": {},
39 |
"outputs": [
40 |
41 |
"name": "stdout",
42 |
"output_type": "stream",
43 |
"text": [
44 |
45 |
46 |
47 |
48 |
"source": [
49 |
"import sys\n",
50 |
51 |
52 |
53 |
54 |
"cell_type": "code",
55 |
"execution_count": 5,
56 |
"id": "8c8b6b39-3b6a-4e85-b446-2c5acacbd3e0",
57 |
"metadata": {},
58 |
"outputs": [
59 |
60 |
"ename": "FileNotFoundError",
61 |
"evalue": "[Errno 2] No such file or directory: '1.txt'",
62 |
"output_type": "error",
63 |
"traceback": [
64 |
65 |
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
66 |
"Cell \u001b[0;32mIn[5], line 6\u001b[0m\n\u001b[1;32m 3\u001b[0m data \u001b[38;5;241m=\u001b[39m f\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m data\n\u001b[0;32m----> 6\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mload_data\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m1.txt\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mlower()\n",
67 |
"Cell \u001b[0;32mIn[5], line 2\u001b[0m, in \u001b[0;36mload_data\u001b[0;34m(file_path)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mload_data\u001b[39m(file_path):\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 3\u001b[0m data \u001b[38;5;241m=\u001b[39m f\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m data\n",
68 |
"File \u001b[0;32m/opt/miniconda3/envs/tf-metal2/lib/python3.9/site-packages/IPython/core/interactiveshell.py:310\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 305\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 307\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 308\u001b[0m )\n\u001b[0;32m--> 310\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
69 |
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '1.txt'"
70 |
71 |
72 |
73 |
"source": [
74 |
"def load_data(file_path):\n",
75 |
" with open(file_path, 'r') as f:\n",
76 |
" data = f.read()\n",
77 |
" return data\n",
78 |
79 |
"data = load_data('data.txt').lower()"
80 |
81 |
82 |
83 |
"cell_type": "code",
84 |
"execution_count": null,
85 |
"id": "573b0963-aa70-44de-86ab-33ba19d5148a",
86 |
"metadata": {},
87 |
"outputs": [],
88 |
"source": [
89 |
"tokenizer = Tokenizer(oov_token='<OOV>')\n",
90 |
91 |
"total_words_in_dict = len(tokenizer.word_index) + 1\n",
92 |
93 |
94 |
95 |
96 |
"cell_type": "code",
97 |
"execution_count": null,
98 |
"id": "cda10ef1-d1c2-4025-b66f-7d2325526df9",
99 |
"metadata": {},
100 |
"outputs": [],
101 |
"source": [
102 |
"tokenizer.word_index['<OOV>'], tokenizer.word_index['harry']"
103 |
104 |
105 |
106 |
"cell_type": "code",
107 |
"execution_count": null,
108 |
"id": "8d52769c-58d8-4ea2-a4e0-9664d5a2da9d",
109 |
"metadata": {},
110 |
"outputs": [],
111 |
"source": [
112 |
"# tokens basically is the entire text from first to last converted into their\n",
113 |
"# index representation\n",
114 |
"tokens = tokenizer.texts_to_sequences([data])[0]"
115 |
116 |
117 |
118 |
"cell_type": "code",
119 |
"execution_count": null,
120 |
"id": "03976234-376f-4b24-bab0-a7040c6760a3",
121 |
"metadata": {},
122 |
"outputs": [],
123 |
"source": [
124 |
"# this creates lists of length 51 (seq_len + 1)\n",
125 |
"# 1-51, 2-52, 3-53, etc.\n",
126 |
"# 51 so that the last value is used as y\n",
127 |
"seq_length = 50\n",
128 |
"input_sequences = []\n",
129 |
"for i in range(seq_length, len(tokens)):\n",
130 |
" input_sequences.append(tokens[i - seq_length: i + 1])"
131 |
132 |
133 |
134 |
"cell_type": "code",
135 |
"execution_count": null,
136 |
"id": "e49c6da4-64c0-4bc7-9526-6f3df699002a",
137 |
"metadata": {},
138 |
"outputs": [],
139 |
"source": [
140 |
"# this ensures all the lists are of same length\n",
141 |
"# here as well we need seq_len + 1 as the previous block\n",
142 |
"from tensorflow.keras.utils import pad_sequences\n",
143 |
144 |
"final_input = np.array(pad_sequences(input_sequences, maxlen=seq_length + 1, padding='pre'))\n",
145 |
146 |
147 |
148 |
149 |
"cell_type": "code",
150 |
"execution_count": null,
151 |
"id": "83639aac-6ad1-4494-ac0c-b54a59e39025",
152 |
"metadata": {},
153 |
"outputs": [],
154 |
"source": [
155 |
"# create x and y, last value of each list is the prediction\n",
156 |
"# imagine sliding window\n",
157 |
"X, y = final_input[:, :-1], final_input[:, -1]\n",
158 |
"print('X : ', X[0], 'Y: ', y[0])"
159 |
160 |
161 |
162 |
"cell_type": "code",
163 |
"execution_count": null,
164 |
"id": "70d67b34-401d-425a-bc37-3b58863ccc4c",
165 |
"metadata": {},
166 |
"outputs": [],
167 |
"source": [
168 |
"# if you print y, it will be integer values like 46, 274, etc.\n",
169 |
"# we need categorical, also it can belong to any word from the entire\n",
170 |
"# dict, so the model outputs a probability per word and is scored with cross-entropy\n",
171 |
"y = tf.keras.utils.to_categorical(y, num_classes=total_words_in_dict)\n",
172 |
"y[0], y.shape"
173 |
174 |
175 |
176 |
"cell_type": "code",
177 |
"execution_count": null,
178 |
"id": "a8678c88-b1fa-4d0b-9be6-e3fb27413d17",
179 |
"metadata": {},
180 |
"outputs": [],
181 |
"source": [
182 |
"# the shape will be number of lists x seq_len\n",
183 |
"X.shape, y.shape"
184 |
185 |
186 |
187 |
"cell_type": "code",
188 |
"execution_count": null,
189 |
"id": "3b03bba6-6819-4282-b2e2-5d5219905eda",
190 |
"metadata": {},
191 |
"outputs": [],
192 |
"source": [
193 |
"from tensorflow.keras.layers import Layer, Dense, LayerNormalization, Dropout, Embedding\n",
194 |
195 |
"class MultiHeadAttention(Layer):\n",
196 |
" def __init__(self, seq_length, num_heads, embed_dim):\n",
197 |
" super(MultiHeadAttention, self).__init__()\n",
198 |
199 |
" self.seq_length = seq_length\n",
200 |
" self.num_heads = num_heads\n",
201 |
" self.embed_dim = embed_dim\n",
202 |
203 |
" self.projection_dim = embed_dim // num_heads\n",
204 |
205 |
" self.query = Dense(embed_dim)\n",
206 |
" self.key = Dense(embed_dim)\n",
207 |
" self.value = Dense(embed_dim)\n",
208 |
209 |
" # need this to learn the interaction between the features learnt by all\n",
210 |
" # the different heads\n",
211 |
" self.combine_heads_layer = Dense(embed_dim)\n",
212 |
213 |
" def split_heads(self, input):\n",
214 |
" batch_size = tf.shape(input)[0]\n",
215 |
" x = tf.reshape(input, (batch_size, -1, self.num_heads, self.projection_dim))\n",
216 |
" return tf.transpose(x, perm=[0, 2, 1, 3])\n",
217 |
218 |
" def self_attention(self, query, key, value):\n",
219 |
" score = tf.matmul(query, key, transpose_b=True)\n",
220 |
" scaled_score = score / tf.math.sqrt(tf.cast(self.projection_dim, tf.float32))\n",
221 |
" weights = tf.nn.softmax(scaled_score, axis=-1) # row wise in QKt\n",
222 |
223 |
" return tf.matmul(weights, value), weights\n",
224 |
225 |
226 |
" def call(self, x):\n",
227 |
" batch_size = tf.shape(x)[0]\n",
228 |
229 |
" # finds the weights matrix then split across heads\n",
230 |
" # it is more efficient computationally if we find the weight matrix\n",
231 |
" # across all the heads first then split to find individual attention scores\n",
232 |
" query = self.split_heads(self.query(x))\n",
233 |
" key = self.split_heads(self.key(x))\n",
234 |
" value = self.split_heads(self.value(x))\n",
235 |
236 |
" attention, _ = self.self_attention(query, key, value)\n",
237 |
" # attention is of size [batch_size, num_heads, seq_length, proj_dim]\n",
238 |
239 |
" attention = tf.transpose(attention, perm=[0, 2, 1, 3])\n",
240 |
" # attention is of size [batch_size, seq_length, num_heads, proj_dim]\n",
241 |
242 |
" concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))\n",
243 |
244 |
" return self.combine_heads_layer(concat_attention)\n",
245 |
246 |
247 |
248 |
"class TransformerBlock(Layer):\n",
249 |
" def __init__(self, seq_length, embed_dim, ffn_dim):\n",
250 |
" super(TransformerBlock, self).__init__()\n",
251 |
252 |
" self.seq_length = seq_length\n",
253 |
" self.embed_dim = embed_dim\n",
254 |
" self.ffn = tf.keras.Sequential([\n",
255 |
" Dense(ffn_dim, activation='relu'),\n",
256 |
" Dense(embed_dim)\n",
257 |
" ])\n",
258 |
259 |
" self.attn = MultiHeadAttention(seq_length, 8, embed_dim)\n",
260 |
261 |
" self.LayerNorm1 = LayerNormalization(epsilon=1e-6) # prevent divide by 0\n",
262 |
" self.LayerNorm2 = LayerNormalization(epsilon=1e-6)\n",
263 |
264 |
" self.Drop1 = Dropout(0.1)\n",
265 |
" self.Drop2 = Dropout(0.1)\n",
266 |
267 |
268 |
" def call(self, x, isTraining):\n",
269 |
" attention_output = self.attn(x)\n",
270 |
" print(attention_output.shape)\n",
271 |
" x = self.LayerNorm1(x + self.Drop1(attention_output, training=isTraining))\n",
272 |
" ffn_output = self.ffn(x)\n",
273 |
" x = self.LayerNorm2(x + self.Drop2(ffn_output, training=isTraining))\n",
274 |
" return x\n",
275 |
276 |
"class TokenAndPositionEmbedding(Layer):\n",
277 |
" def __init__(self, seq_length, total_words_in_dict, embed_dim):\n",
278 |
" super(TokenAndPositionEmbedding, self).__init__()\n",
279 |
280 |
" self.seq_length = seq_length\n",
281 |
" self.emb = Embedding(input_dim=total_words_in_dict, output_dim=embed_dim)\n",
282 |
" self.pos_emb = Embedding(input_dim=seq_length, output_dim=embed_dim)\n",
283 |
284 |
" def call(self, x):\n",
285 |
" positions = tf.range(start=0, limit=self.seq_length, delta=1)\n",
286 |
" positions = self.pos_emb(positions)\n",
287 |
" x = self.emb(x)\n",
288 |
" return x + positions"
289 |
290 |
291 |
292 |
"cell_type": "code",
293 |
"execution_count": null,
294 |
"id": "19e33d70-6984-44c2-bfe2-f525444bdf01",
295 |
"metadata": {},
296 |
"outputs": [],
297 |
"source": [
298 |
"ff_dim = 512\n",
299 |
"embed_dim = 256\n",
300 |
301 |
" # This is a placeholder in functional api style\n",
302 |
" # batch_size is taken during .fit() phase\n",
303 |
"input_placeholder = tf.keras.Input(shape=(seq_length,))\n",
304 |
305 |
"tokenPosLayer = TokenAndPositionEmbedding(seq_length, total_words_in_dict, embed_dim)\n",
306 |
"x = tokenPosLayer(input_placeholder) # call isn't run yet, just a link created\n",
307 |
308 |
"transformerBlock = TransformerBlock(seq_length, embed_dim, ff_dim)\n",
309 |
310 |
311 |
"# x contains contextualized data, now the last row of the seq_len holds\n",
312 |
"# the latest context, hence it is extracted\n",
313 |
"x = x[:, -1, :]\n",
314 |
"print(x.shape) # (batch_size, embed_dim): only the last position is kept\n",
315 |
316 |
"# we pass this context to a dense layer to learn how to make predictions\n",
317 |
"x = Dense(total_words_in_dict, activation='softmax')(x)\n",
318 |
"# batch_size, total_words (prediction)\n",
319 |
"# prediction happens batch wise in parallel and is compared to y\n",
320 |
"# batch wise in parallel\n",
321 |
322 |
323 |
324 |
"model = tf.keras.Model(inputs=input_placeholder, outputs=x)\n",
325 |
326 |
327 |
328 |
329 |
"cell_type": "code",
330 |
"execution_count": null,
331 |
"id": "1f60878b-6e12-4dcd-ab89-03a64e7a3367",
332 |
"metadata": {},
333 |
"outputs": [],
334 |
"source": [
335 |
"model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])"
336 |
337 |
338 |
339 |
"cell_type": "code",
340 |
"execution_count": null,
341 |
"id": "5e688417-13f5-41c7-b98e-c4b4330ef363",
342 |
"metadata": {},
343 |
"outputs": [],
344 |
"source": [
345 |
346 |
"import time\n",
347 |
348 |
349 |
"# CPU Benchmark\n",
350 |
"with tf.device('/CPU:0'):\n",
351 |
" start = time.time()\n",
352 |
" model.fit(X, y, batch_size=32, epochs=10)\n",
353 |
" print(\"CPU Time:\", time.time() - start)\n",
354 |
355 |
356 |
357 |
358 |
359 |
"cell_type": "code",
360 |
"execution_count": null,
361 |
"id": "7fb4552b-86cc-461c-8a9a-572f5bfd869b",
362 |
"metadata": {},
363 |
"outputs": [],
364 |
"source": [
365 |
"# # GPU Benchmark\n",
366 |
"# with tf.device('/GPU:0'):\n",
367 |
"# start = time.time()\n",
368 |
"# model.fit(X, y, batch_size=1024, epochs=10)\n",
369 |
"# print(\"GPU Time:\", time.time() - start)"
370 |
371 |
372 |
373 |
"cell_type": "code",
374 |
"execution_count": null,
375 |
"id": "7659d823-faf4-4908-9a0c-bd18b076c240",
376 |
"metadata": {},
377 |
"outputs": [],
378 |
"source": [
379 |
"def predict_next_word(seed_text, num_words_to_predict, max_len):\n",
380 |
" for _ in range(num_words_to_predict):\n",
381 |
" seed_list = tokenizer.texts_to_sequences([seed_text])[0]\n",
382 |
" seed_list = pad_sequences([seed_list], maxlen=max_len - 1, padding='pre')\n",
383 |
" prediction = model.predict(seed_list, verbose=0)\n",
384 |
" # prediction has shape (1, total_words_in_dict): one probability per vocabulary word\n",
385 |
" max_pred_index = np.argmax(prediction)\n",
386 |
" seed_text+= \" \" + tokenizer.index_word[max_pred_index]\n",
387 |
388 |
" return seed_text"
389 |
390 |
391 |
392 |
"cell_type": "code",
393 |
"execution_count": null,
394 |
"id": "2dbbd786-5318-4172-9542-e56658ef79ba",
395 |
"metadata": {},
396 |
"outputs": [],
397 |
"source": [
398 |
"predict_next_word(\"who is harry is a \", 25, seq_length + 1)"
399 |
400 |
401 |
402 |
"cell_type": "code",
403 |
"execution_count": null,
404 |
"id": "6751aa4b-2d22-47a2-9f17-f557f78c6f45",
405 |
"metadata": {},
406 |
"outputs": [],
407 |
"source": [
408 |
"!pip install huggingface_hub"
409 |
410 |
411 |
412 |
"cell_type": "code",
413 |
"execution_count": null,
414 |
"id": "99e980a6-8686-4dd8-b26c-25a5542451b5",
415 |
"metadata": {},
416 |
"outputs": [],
417 |
"source": [
418 |
419 |
420 |
421 |
422 |
"cell_type": "code",
423 |
"execution_count": null,
424 |
"id": "d766fad6-e4be-4b97-9617-53a03661cb41",
425 |
"metadata": {
426 |
"scrolled": true
427 |
428 |
"outputs": [],
429 |
"source": [
430 |
"from huggingface_hub import notebook_login\n",
431 |
432 |
433 |
434 |
435 |
436 |
"cell_type": "code",
437 |
"execution_count": null,
438 |
"id": "9171eefc-9952-42c6-8b00-9e7f9f6f6f58",
439 |
"metadata": {},
440 |
"outputs": [],
441 |
"source": [
442 |
"from huggingface_hub import HfApi\n",
443 |
444 |
"repo_id = \"ramanhyd99/harry-potter-transformer\"\n",
445 |
"api = HfApi()\n",
446 |
"api.create_repo(repo_id=repo_id, exist_ok=True)\n"
447 |
448 |
449 |
450 |
"cell_type": "code",
451 |
"execution_count": null,
452 |
"id": "bab81223-1667-463c-9075-9ab00958b22c",
453 |
"metadata": {},
454 |
"outputs": [],
455 |
"source": [
456 |
"# Push the model to the Hugging Face Hub\n",
457 |
"from huggingface_hub import upload_folder\n",
458 |
459 |
460 |
" folder_path=\"\",\n",
461 |
" path_in_repo=\".\",\n",
462 |
" repo_id=repo_id,\n",
463 |
" repo_type=\"model\"\n",
464 |
465 |
466 |
467 |
468 |
"metadata": {
469 |
"kernelspec": {
470 |
"display_name": "Python (tf-metal2)",
471 |
"language": "python",
472 |
"name": "tf-metal2"
473 |
474 |
"language_info": {
475 |
"codemirror_mode": {
476 |
"name": "ipython",
477 |
"version": 3
478 |
479 |
"file_extension": ".py",
480 |
"mimetype": "text/x-python",
481 |
"name": "python",
482 |
"nbconvert_exporter": "python",
483 |
"pygments_lexer": "ipython3",
484 |
"version": "3.9.21"
485 |
486 |
487 |
"nbformat": 4,
488 |
"nbformat_minor": 5
489 |