In [2]:
# Sanity check: report how many GPUs TensorFlow can see in this environment.
import tensorflow as tf
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available: 1


In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
# Print the path of the Python interpreter the kernel is running on.
import sys
print(sys.executable)

/opt/miniconda3/envs/tf-metal2/bin/python


In [5]:
def load_data(file_path, encoding='utf-8'):
    """Read a whole text file and return its contents as a single string.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The complete file contents as one ``str``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()

# Load the corpus from data.txt and lower-case it.
# NOTE(review): the recorded error output below mentions '1.txt', not
# 'data.txt', so it appears to come from an earlier run with a different path.
data = load_data('data.txt').lower()

FileNotFoundError: [Errno 2] No such file or directory: '1.txt'

In [None]:
# Fit the tokenizer on the whole corpus to build the word -> index vocabulary.
# NOTE(review): oov_token is an empty string here; a placeholder such as
# '<OOV>' may have been intended — confirm.
tokenizer = Tokenizer(oov_token='')
tokenizer.fit_on_texts([data])
# One more than the number of known words; presumably the extra slot is for
# index 0, which Keras reserves for padding — confirm.
total_words_in_dict = len(tokenizer.word_index) + 1
total_words_in_dict

In [None]:
# Spot-check the vocabulary: index of the OOV token ('') and of the word 'harry'.
tokenizer.word_index[''], tokenizer.word_index['harry']

In [None]:
# tokens is the entire corpus, from the first word to the last, encoded as
# one flat list of vocabulary indices.
tokens = tokenizer.texts_to_sequences([data])[0]

In [None]:
# Slide a window of seq_length + 1 (= 51) tokens over the corpus one step at a
# time: tokens 1-51, 2-52, 3-53, ...  The extra last token of every window is
# the value later used as y.
seq_length = 50
input_sequences = [
    tokens[end - seq_length: end + 1]
    for end in range(seq_length, len(tokens))
]

In [None]:
# this ensures all the lists are of same length
# here as well we need seq_len + 1 as the previous block
from tensorflow.keras.utils import pad_sequences

# pad_sequences already returns a 2-D NumPy array, so wrapping it in
# np.array() only made an extra copy of the whole dataset.
final_input = pad_sequences(input_sequences, maxlen=seq_length + 1, padding='pre')
final_input[0]

In [None]:
# Sliding-window split: every row of final_input is one window. All columns
# but the last form the model input; the last column is the word to predict.
X = final_input[:, :-1]
y = final_input[:, -1]
print('X : ', X[0], 'Y: ', y[0])

In [None]:
# The raw targets are integer word indices such as 46, 274, ...  The target
# can be any word of the vocabulary and the model is trained with categorical
# cross-entropy over predicted probabilities, so each index is expanded into a
# one-hot row of length total_words_in_dict.
# The indices are read from final_input rather than from y itself, so running
# this cell a second time does not one-hot encode an already one-hot array.
y = tf.keras.utils.to_categorical(final_input[:, -1], num_classes=total_words_in_dict)
y[0], y.shape

In [None]:
# X has shape (number of windows, seq_length); y has one row per window.
X.shape, y.shape

In [None]:
from tensorflow.keras.layers import Layer, Dense, LayerNormalization, Dropout, Embedding

class MultiHeadAttention(Layer):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values with one Dense layer
    each, split into ``num_heads`` heads, attended per head, merged back and
    passed through a final Dense layer. No mask is applied.

    Args:
        seq_length: Length of the input sequences. Stored on the layer; the
            computation itself does not use it.
        num_heads: Number of attention heads.
        embed_dim: Size of the feature dimension of the input and output.
            Must be divisible by ``num_heads``.
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``).

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, seq_length, num_heads, embed_dim, **kwargs):
        super().__init__(**kwargs)

        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )

        self.seq_length = seq_length
        self.num_heads = num_heads
        self.embed_dim = embed_dim

        # Feature size handled by each individual head.
        self.projection_dim = embed_dim // num_heads

        self.query = Dense(embed_dim)
        self.key = Dense(embed_dim)
        self.value = Dense(embed_dim)

        # Learns the interaction between the features produced by the
        # different heads once they have been concatenated again.
        self.combine_heads_layer = Dense(embed_dim)

    def split_heads(self, input):
        """Reshape ``(batch, seq, embed_dim)`` to ``(batch, heads, seq, proj_dim)``."""
        batch_size = tf.shape(input)[0]
        x = tf.reshape(input, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def self_attention(self, query, key, value):
        """Scaled dot-product attention.

        Returns:
            A tuple ``(output, weights)``: the attention-weighted values and
            the softmax attention weights.
        """
        score = tf.matmul(query, key, transpose_b=True)
        # Cast to the score dtype so the division also works when the layer
        # does not compute in float32.
        scale = tf.math.sqrt(tf.cast(self.projection_dim, score.dtype))
        weights = tf.nn.softmax(score / scale, axis=-1)  # row-wise over Q·Kᵀ
        return tf.matmul(weights, value), weights

    def call(self, x):
        """Apply self-attention to ``x`` and return a tensor of the same shape."""
        batch_size = tf.shape(x)[0]

        # Project once for all heads and split afterwards: a single large
        # matmul is cheaper than one small projection per head.
        query = self.split_heads(self.query(x))
        key = self.split_heads(self.key(x))
        value = self.split_heads(self.value(x))

        attention, _ = self.self_attention(query, key, value)
        # attention: [batch_size, num_heads, seq_length, proj_dim]

        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        # attention: [batch_size, seq_length, num_heads, proj_dim]

        # Use the layer's own embed_dim, not a notebook-level global of the
        # same name.
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))

        return self.combine_heads_layer(concat_attention)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "seq_length": self.seq_length,
            "num_heads": self.num_heads,
            "embed_dim": self.embed_dim,
        })
        return config



class TransformerBlock(Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Both sub-layers are wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        seq_length: Length of the input sequences (forwarded to the attention
            layer).
        embed_dim: Feature size of the input and output.
        ffn_dim: Hidden size of the feed-forward network.
        num_heads: Number of attention heads. Defaults to 8.
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``).
    """

    def __init__(self, seq_length, embed_dim, ffn_dim, num_heads=8, **kwargs):
        super().__init__(**kwargs)

        self.seq_length = seq_length
        self.embed_dim = embed_dim
        self.ffn_dim = ffn_dim
        self.num_heads = num_heads

        self.ffn = tf.keras.Sequential([
            Dense(ffn_dim, activation='relu'),
            Dense(embed_dim)
        ])

        self.attn = MultiHeadAttention(seq_length, num_heads, embed_dim)

        # A non-zero epsilon prevents a division by zero in the normalisation.
        self.LayerNorm1 = LayerNormalization(epsilon=1e-6)
        self.LayerNorm2 = LayerNormalization(epsilon=1e-6)

        self.Drop1 = Dropout(0.1)
        self.Drop2 = Dropout(0.1)

    def call(self, x, isTraining=None, training=None):
        """Run the block on ``x``.

        Args:
            x: Input tensor whose last dimension is ``embed_dim``.
            isTraining: Optional explicit dropout switch. Takes precedence
                over ``training`` when it is not ``None``.
            training: Standard Keras training flag; accepted so the layer can
                be called as ``layer(x)`` and receive the mode from Keras.

        Returns:
            A tensor with the same shape as ``x``.
        """
        if isTraining is None:
            isTraining = training

        attention_output = self.attn(x)
        x = self.LayerNorm1(x + self.Drop1(attention_output, training=isTraining))
        ffn_output = self.ffn(x)
        x = self.LayerNorm2(x + self.Drop2(ffn_output, training=isTraining))
        return x

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "seq_length": self.seq_length,
            "embed_dim": self.embed_dim,
            "ffn_dim": self.ffn_dim,
            "num_heads": self.num_heads,
        })
        return config

class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        seq_length: Maximum sequence length; the size of the position
            embedding table.
        total_words_in_dict: Vocabulary size; the size of the token embedding
            table.
        embed_dim: Size of each embedding vector.
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``).
    """

    def __init__(self, seq_length, total_words_in_dict, embed_dim, **kwargs):
        super().__init__(**kwargs)

        self.seq_length = seq_length
        self.total_words_in_dict = total_words_in_dict
        self.embed_dim = embed_dim

        self.emb = Embedding(input_dim=total_words_in_dict, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=seq_length, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` of shape ``(batch, length)``.

        Returns:
            A tensor of shape ``(batch, length, embed_dim)``.
        """
        # Take the positions from the actual input length so that inputs
        # shorter than seq_length still broadcast correctly. For inputs of
        # exactly seq_length tokens this is identical to range(seq_length).
        length = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        positions = self.pos_emb(positions)
        x = self.emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "seq_length": self.seq_length,
            "total_words_in_dict": self.total_words_in_dict,
            "embed_dim": self.embed_dim,
        })
        return config

In [None]:
ff_dim = 512
embed_dim = 256

# Functional-API placeholder for a batch of token-id sequences. The batch
# size is left open and is supplied during .fit().
input_placeholder = tf.keras.Input(shape=(seq_length,))
tokenPosLayer = TokenAndPositionEmbedding(seq_length, total_words_in_dict, embed_dim)
x = tokenPosLayer(input_placeholder)  # symbolic call: only links the graph

# The transformer block has to be *called* on x. Creating it without applying
# it leaves the model as embedding -> last token -> dense, with no attention.
# isTraining=None leaves the dropout mode to Keras (active in fit, off in
# predict) instead of fixing it at graph-construction time.
transformerBlock = TransformerBlock(seq_length, embed_dim, ff_dim)
x = transformerBlock(x, isTraining=None)
print(x.shape)  # batch_size, seq_length, embed_dim

# x now holds contextualised vectors; the last position of the sequence
# carries the most recent context, so it is extracted.
x = x[:, -1, :]
print(x.shape)  # batch_size, embed_dim

# A dense softmax layer turns that context vector into a probability for
# every word of the vocabulary. Predictions are made for the whole batch in
# parallel and compared against y batch-wise.
x = Dense(total_words_in_dict, activation='softmax')(x)
print(x.shape)  # batch_size, total_words_in_dict

model = tf.keras.Model(inputs=input_placeholder, outputs=x)
model.summary()

In [None]:
# Adam optimiser; categorical cross-entropy matches the one-hot targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:

import time


# CPU benchmark: train with ops placed on the CPU and report the elapsed time.
# perf_counter() is a monotonic, high-resolution clock, so the measurement is
# not affected by system clock adjustments the way time.time() can be.
with tf.device('/CPU:0'):
    start = time.perf_counter()
    model.fit(X, y, batch_size=32, epochs=10)
    print("CPU Time:", time.perf_counter() - start)




In [None]:
# GPU benchmark (disabled): uncomment to time the same training run on the GPU.
# with tf.device('/GPU:0'):
#     start = time.time()
#     model.fit(X, y, batch_size=1024, epochs=10)
#     print("GPU Time:", time.time() - start)

In [None]:
def predict_next_word(seed_text, num_words_to_predict, max_len):
    """Greedily extend ``seed_text`` with the model's most likely next words.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        seed_text: Prompt to continue.
        num_words_to_predict: Maximum number of words to append.
        max_len: Window length including the target word; the prompt is
            padded/truncated to ``max_len - 1`` tokens before prediction.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
        Generation stops early if the predicted index has no word in
        ``tokenizer.index_word`` or maps to an empty string.
    """
    for _ in range(num_words_to_predict):
        seed_ids = tokenizer.texts_to_sequences([seed_text])[0]
        model_input = pad_sequences([seed_ids], maxlen=max_len - 1, padding='pre')

        # One row per input sequence; each row is a probability distribution
        # over the vocabulary (not an embed_dim-sized vector).
        prediction = model.predict(model_input, verbose=0)
        next_index = int(np.argmax(prediction[0]))

        # index_word has no entry for index 0, so a plain [] lookup would
        # raise KeyError if the model ever predicted it.
        next_word = tokenizer.index_word.get(next_index)
        if not next_word:
            break

        seed_text += " " + next_word

    return seed_text

In [None]:
# Generate 25 words after the prompt. max_len is seq_length + 1 because
# predict_next_word pads the prompt to max_len - 1 tokens.
predict_next_word("who is harry is a ", 25, seq_length + 1)

In [None]:
!pip install huggingface_hub

In [None]:
# Save the trained model to a single .keras file in the working directory.
# NOTE(review): the model contains custom layers; loading this file again
# will presumably need them supplied as custom objects — confirm.
model.save("harry_potter_transformer.keras")

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from within the notebook, so no token has to
# be written into the code.
notebook_login()

In [None]:
from huggingface_hub import HfApi

# Target model repository on the Hub, in "<user>/<repo-name>" form.
repo_id = "ramanhyd99/harry-potter-transformer"
api = HfApi()
# exist_ok=True makes the call safe to repeat when the repo already exists.
api.create_repo(repo_id=repo_id, exist_ok=True)


In [None]:
# Push the model to the Hugging Face Hub
from huggingface_hub import upload_folder

# Upload only the saved model file. An empty folder_path resolves to the
# current working directory, so without a filter every file next to the
# notebook (training text, checkpoints, ...) would be pushed to the Hub.
upload_folder(
    folder_path=".",
    path_in_repo=".",
    allow_patterns=["harry_potter_transformer.keras"],
    repo_id=repo_id,
    repo_type="model",
)