{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "import tensorflow as tf\r\n",
    "import numpy as np\r\n",
    "import os\r\n",
    "import pickle\r\n",
    "\r\n",
    "SEQUENCE_LENGTH = 50\r\n",
    "EMBEDDING_DIM = 200\r\n",
    "BATCH_SIZE = 128\r\n",
    "FILE_PATH = \"data/python_code.py\"\r\n",
    "BASENAME = os.path.basename(FILE_PATH) + \"-lower\"\r\n",
    "\r\n",
    "text = open(FILE_PATH).read()\r\n",
    "# comment out the next line if you want to keep uppercase letters\r\n",
    "text = text.lower()\r\n",
    "n_chars = len(text)\r\n",
    "vocab = ''.join(sorted(set(text)))\r\n",
    "print(\"vocab:\", vocab)\r\n",
    "n_unique_chars = len(vocab)\r\n",
    "print(\"Number of characters:\", n_chars)\r\n",
    "print(\"Number of unique characters:\", n_unique_chars)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# dictionary that converts characters to integers\r\n",
    "char2int = {c: i for i, c in enumerate(vocab)}\r\n",
    "# dictionary that converts integers to characters\r\n",
    "int2char = {i: c for i, c in enumerate(vocab)}\r\n",
    "\r\n",
    "# save these dictionaries for later generation\r\n",
    "pickle.dump(char2int, open(f\"{BASENAME}-char2int.pickle\", \"wb\"))\r\n",
    "pickle.dump(int2char, open(f\"{BASENAME}-int2char.pickle\", \"wb\"))"
   ],
   "outputs": [],
   "metadata": {}
  },
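  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# A minimal sketch of reloading the saved mappings later for generation\r\n",
    "# (assumes the two pickle files written above exist; the loaded_* variable\r\n",
    "# names are illustrative, not part of the original notebook).\r\n",
    "loaded_char2int = pickle.load(open(f\"{BASENAME}-char2int.pickle\", \"rb\"))\r\n",
    "loaded_int2char = pickle.load(open(f\"{BASENAME}-int2char.pickle\", \"rb\"))\r\n",
    "print(len(loaded_char2int), len(loaded_int2char))"
   ],
   "outputs": [],
   "metadata": {}
  },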
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# encode the whole text as an array of integer character ids\r\n",
    "encoded_text = np.array([char2int[c] for c in text])"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# build a tf.data pipeline over the encoded characters and preview a few ids\r\n",
    "char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)\r\n",
    "for element in char_dataset.take(5):\r\n",
    "    print(element.numpy())"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "for element in char_dataset.batch(SEQUENCE_LENGTH+1).shuffle(1024).take(2):\r\n",
    "    print(''.join([int2char[c] for c in element.numpy()]))"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "#help(tf.one_hot)\r\n",
    "#help(char_dataset.window)\r\n",
    "# slide a window of SEQUENCE_LENGTH+1 characters over the text, one character at a time\r\n",
    "windows = char_dataset.window(SEQUENCE_LENGTH+1, shift=1, drop_remainder=True)\r\n",
    "sequences = windows.flat_map(lambda window: window.batch(SEQUENCE_LENGTH+1))\r\n",
    "# split each sequence into (first SEQUENCE_LENGTH characters, next character) pairs\r\n",
    "dataset = sequences.map(lambda x: (x[:-1], x[-1]))\r\n",
    "for input_, target in dataset.take(10):\r\n",
    "    print(input_.numpy().shape)\r\n",
    "    print(target.numpy().shape)\r\n",
    "    print(''.join([int2char[c] for c in input_.numpy()]), int2char[target.numpy()])\r\n",
    "    print(\"=\"*50)"
   ],
   "outputs": [],
   "metadata": {}
  },
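  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Quick structural check (assumes TF 2.1+ for Dataset.element_spec): each\r\n",
    "# element should pair a (SEQUENCE_LENGTH,) input with a scalar target id.\r\n",
    "print(dataset.element_spec)"
   ],
   "outputs": [],
   "metadata": {}
  },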
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "sequences2 = char_dataset.batch(2*SEQUENCE_LENGTH+1, drop_remainder=True)\r\n",
    "\r\n",
    "def split_sample(sample):\r\n",
    "    # first (input, target) pair: the first SEQUENCE_LENGTH characters and the one after them\r\n",
    "    ds = tf.data.Dataset.from_tensors((sample[:SEQUENCE_LENGTH], sample[SEQUENCE_LENGTH]))\r\n",
    "    for i in range(1, (len(sample)-1) // 2):\r\n",
    "        # shift the window one character at a time to get more overlapping samples\r\n",
    "        input_ = sample[i:i+SEQUENCE_LENGTH]\r\n",
    "        target = sample[i+SEQUENCE_LENGTH]\r\n",
    "        other_ds = tf.data.Dataset.from_tensors((input_, target))\r\n",
    "        ds = ds.concatenate(other_ds)\r\n",
    "    return ds\r\n",
    "\r\n",
    "\r\n",
    "dataset2 = sequences2.flat_map(split_sample)\r\n",
    "for element in dataset2.take(10):\r\n",
    "    print(element[0].shape, element[1].shape)\r\n",
    "    print(''.join([int2char[c] for c in element[0].numpy()]), int2char[element[1].numpy()])"
   ],
   "outputs": [],
   "metadata": {
    "tags": [
     "outputPrepend"
    ]
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "for element1, element2 in zip(dataset.take(5), dataset2.take(5)):\r\n",
    "    print(element1[0].numpy() == element2[0].numpy())"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# one-hot encode both the inputs and the targets over the vocabulary\r\n",
    "def one_hot_samples(input_, target):\r\n",
    "    return tf.one_hot(input_, len(vocab)), tf.one_hot(target, len(vocab))\r\n",
    "    # return input_, tf.one_hot(target, len(vocab))\r\n",
    "\r\n",
    "dataset = dataset.map(one_hot_samples)\r\n",
    "dataset2 = dataset2.map(one_hot_samples)\r\n",
    "for element in dataset.take(10):\r\n",
    "    print(element[0].shape, element[1].shape)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# shuffle, batch, cache, prefetch and repeat both datasets for training\r\n",
    "ds = dataset.shuffle(1024).batch(BATCH_SIZE, drop_remainder=True).cache().prefetch(1).repeat()\r\n",
    "ds2 = dataset2.shuffle(1024).batch(BATCH_SIZE, drop_remainder=True).cache().prefetch(1).repeat()"
   ],
   "outputs": [],
   "metadata": {}
  },
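  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Peek at one training batch to confirm the shapes: inputs of\r\n",
    "# (BATCH_SIZE, SEQUENCE_LENGTH, vocab) and targets of (BATCH_SIZE, vocab).\r\n",
    "# The X_batch/y_batch names are just for this check.\r\n",
    "for X_batch, y_batch in ds.take(1):\r\n",
    "    print(X_batch.shape, y_batch.shape)"
   ],
   "outputs": [],
   "metadata": {}
  },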
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "def create_model(vocab_size, embedding_dim, rnn_units, batch_size):\r\n",
    "    model = tf.keras.Sequential()\r\n",
    "    # model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_shape=(SEQUENCE_LENGTH,)))\r\n",
    "    model.add(tf.keras.layers.LSTM(rnn_units, input_shape=(SEQUENCE_LENGTH, vocab_size), return_sequences=True))\r\n",
    "    model.add(tf.keras.layers.Dropout(0.3))\r\n",
    "    model.add(tf.keras.layers.LSTM(rnn_units))\r\n",
    "    model.add(tf.keras.layers.Dropout(0.3))\r\n",
    "    model.add(tf.keras.layers.Dense(vocab_size, activation=\"softmax\"))\r\n",
    "    return model"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "model = create_model(len(vocab), embedding_dim=EMBEDDING_DIM, rnn_units=128, batch_size=BATCH_SIZE)\r\n",
    "model.summary()\r\n",
    "model.compile(optimizer=\"adam\", loss=\"categorical_crossentropy\", metrics=[\"accuracy\"])"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "EPOCHS = 5\r\n",
    "history = model.fit(ds2, steps_per_epoch=(len(encoded_text) - SEQUENCE_LENGTH) // BATCH_SIZE, epochs=EPOCHS)"
   ],
   "outputs": [],
   "metadata": {}
  },
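  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# A quick look at the final training metrics from the History object\r\n",
    "# returned by fit (assumes the training cell above has been run).\r\n",
    "print(\"final loss:\", history.history[\"loss\"][-1])\r\n",
    "print(\"final accuracy:\", history.history[\"accuracy\"][-1])"
   ],
   "outputs": [],
   "metadata": {}
  },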
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# save the model\r\n",
    "model_path = f\"results/{BASENAME}-{SEQUENCE_LENGTH}-NOEMBEDDING-moredata.h5\"\r\n",
    "model.save(model_path)\r\n",
    "# model.load_weights(model_path)"
   ],
   "outputs": [],
   "metadata": {}
  },
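  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# A minimal sketch of restoring the full model from the saved .h5 file\r\n",
    "# (the reloaded_model name is illustrative, not part of the original notebook).\r\n",
    "reloaded_model = tf.keras.models.load_model(model_path)\r\n",
    "reloaded_model.summary()"
   ],
   "outputs": [],
   "metadata": {}
  },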
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "seed = \"\"\"You can be a\"\"\".lower()\r\n",
    "s = seed\r\n",
    "# generate 200 characters\r\n",
    "generated = \"\"\r\n",
    "for i in range(200):\r\n",
    "    # make the input sequence\r\n",
    "    X = np.zeros((1, SEQUENCE_LENGTH, len(vocab)))\r\n",
    "    # X = np.zeros((1, SEQUENCE_LENGTH))\r\n",
    "    for t, char in enumerate(seed):\r\n",
    "        X[0, (SEQUENCE_LENGTH - len(seed)) + t, char2int[char]] = 1\r\n",
    "    # predict the next character\r\n",
    "    predicted = model.predict(X, verbose=0)[0]\r\n",
    "    # print(predicted)\r\n",
    "    # converting the vector to an integer\r\n",
    "    next_index = np.argmax(predicted)\r\n",
    "    # next_index = np.squeeze(np.round(predicted))\r\n",
    "    # converting the integer to a character\r\n",
    "    # print(next_index)\r\n",
    "    next_char = int2char[next_index]\r\n",
    "    # add the character to the results\r\n",
    "    generated += next_char\r\n",
    "    # shift the seed by one and append the predicted character\r\n",
    "    seed = seed[1:] + next_char\r\n",
    "\r\n",
    "print(\"Generated text:\")\r\n",
    "print(s + generated)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "char2int"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "file_extension": ".py",
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.7 64-bit"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.7"
  },
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": 3,
  "interpreter": {
   "hash": "777490da48e046e3b512f0b24bf037db286a787493a11bf82a9e0f2cbf21bb67"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}