Skip to content

Commit f515a9c

Browse files
committed
week 3 challenge
1 parent 8ffd3a5 commit f515a9c

File tree

1 file changed

+322
-0
lines changed

1 file changed

+322
-0
lines changed
Lines changed: 322 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,322 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"provenance": [],
7+
"gpuType": "T4",
8+
"authorship_tag": "ABX9TyPxJzufoQPtui+nhl1J1xiR"
9+
},
10+
"kernelspec": {
11+
"name": "python3",
12+
"display_name": "Python 3"
13+
},
14+
"language_info": {
15+
"name": "python"
16+
},
17+
"accelerator": "GPU"
18+
},
19+
"cells": [
20+
{
21+
"cell_type": "code",
22+
"execution_count": null,
23+
"metadata": {
24+
"id": "yqlQTsxNdKrN"
25+
},
26+
"outputs": [],
27+
"source": [
28+
"!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"source": [
34+
"import os\n",
35+
"import requests\n",
36+
"from IPython.display import Markdown, display, update_display\n",
37+
"from openai import OpenAI\n",
38+
"from google.colab import drive\n",
39+
"from huggingface_hub import login\n",
40+
"from google.colab import userdata\n",
41+
"from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
42+
"import torch\n",
43+
"import gradio as gr\n",
44+
"import re"
45+
],
46+
"metadata": {
47+
"id": "eyfvQrLxdkGT"
48+
},
49+
"execution_count": 2,
50+
"outputs": []
51+
},
52+
{
53+
"cell_type": "code",
54+
"source": [
55+
"# one can always add more models, of course\n",
56+
"\n",
57+
"LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
58+
"OPENAI_MODEL = \"gpt-4o-mini\""
59+
],
60+
"metadata": {
61+
"id": "WW-cSZk7dnp6"
62+
},
63+
"execution_count": 3,
64+
"outputs": []
65+
},
66+
{
67+
"cell_type": "code",
68+
"source": [
69+
"hf_token = userdata.get('HF_TOKEN')\n",
70+
"login(hf_token, add_to_git_credential=True)\n",
71+
"openai_api_key = userdata.get('OPENAI_API_KEY')\n",
72+
"openai = OpenAI(api_key=openai_api_key)"
73+
],
74+
"metadata": {
75+
"id": "XG7Iam6Rdw8F"
76+
},
77+
"execution_count": 4,
78+
"outputs": []
79+
},
80+
{
81+
"cell_type": "code",
82+
"source": [
83+
"force_dark_mode = \"\"\"\n",
84+
"function refresh() {\n",
85+
" const url = new URL(window.location);\n",
86+
" if (url.searchParams.get('__theme') !== 'dark') {\n",
87+
" url.searchParams.set('__theme', 'dark');\n",
88+
" window.location.href = url.href;\n",
89+
" }\n",
90+
"}\n",
91+
"\"\"\""
92+
],
93+
"metadata": {
94+
"id": "Ov7WSdx9dzSt"
95+
},
96+
"execution_count": 5,
97+
"outputs": []
98+
},
99+
{
100+
"cell_type": "code",
101+
"source": [
102+
"def dataset_generator(model, nature, shots, volume, language):\n",
103+
"\n",
104+
" examples = \"Instruction: 'Make a random sentence.'\\nAnswer: 'When I got home last night, I couldn't believe my eyes: All the pineapples had been removed from the pizza.'\"\n",
105+
" system_message = \"You are a random sentence generator. Generate 10 diverse English sentences.\"\n",
106+
" user_prompt = f\"Generate 10 random English sentences, like so:\\n{examples}\"\n",
107+
" sentences = \"\"\n",
108+
"\n",
109+
" if language == \"English\":\n",
110+
"\n",
111+
" for shot in list(shots.keys()):\n",
112+
" examples += f\"\\nExample instruction: '{shot}'\\nExample answer: '{shots[shot]}'\\n\"\n",
113+
"\n",
114+
" system_message = f\"You are a state-of-the-art linguistic dataset compiler. You are given a 'Type' of sentence to create. \\\n",
115+
"Within the bounds of that type, create {volume} diverse sentences with differing structures and lengths. Make the sentences plausible, \\\n",
116+
"but be creative in filling them with random concrete information, names, and data. Here are some examples for how to go about that:\\n{examples}\\n\\\n",
117+
"Just output one sentence per line. Do not comment or format your output in any way, shape, or form.\"\n",
118+
"\n",
119+
" user_prompt = f\"Generate {volume} English sentences of the following Type: {nature}. Just output one sentence per line. \\\n",
120+
"Do not comment or format your output in any way, shape, or form.\"\n",
121+
"\n",
122+
" elif language == \"German\":\n",
123+
"\n",
124+
" for shot in list(shots.keys()):\n",
125+
" examples += f\"\\nAnweisung: '{shot}'\\nAntwort: '{shots[shot]}'\\n\"\n",
126+
"\n",
127+
" system_message = f\"Du bist ein weltklasse Datensatz-Sammler für Sprachdaten. Du erhältst einen 'Typ' von Sätzen, die du erstellen sollst. \\\n",
128+
"Im Rahmen dieses Typs, generiere {volume} untereinander verschiedene Sätze mit unterschiedlichen Satzlängen und -strukturen. Mache die Beispielsätze \\\n",
129+
"plausibel, aber fülle sie kreativ mit willkürlichen Informationen, Namen, und Daten aller Art. Hier sind ein paar Beispiele, wie du vorgehen sollst:\\n{examples}\\n\\\n",
130+
"Gib einfach einen Satz pro Zeile aus. Kommentiere oder formatiere deine Antwort in keiner Weise.\"\n",
131+
"\n",
132+
" user_prompt = f\"Generiere {volume} deutsche Sätze des folgenden Typs: {nature}. Gib einfach einen Satz pro Zeile aus. \\\n",
133+
"Kommentiere oder formatiere deine Antwort in keiner Weise.\"\n",
134+
"\n",
135+
" elif language == \"French\":\n",
136+
"\n",
137+
" for shot in list(shots.keys()):\n",
138+
" examples += f\"\\nConsigne: '{shot}'\\nRéponse: '{shots[shot]}'\\n\"\n",
139+
"\n",
140+
" system_message = f\"Tu es un outil linguistique de pointe, à savoir, un générateur de données linguistiques. Tu seras assigné un 'Type' de phrases à créer. \\\n",
141+
"Dans le cadre de ce type-là, crée {volume} phrases diverses, avec des structures et longueurs qui varient. Génère des phrases qui soient plausibles, \\\n",
142+
"mais sois créatif, et sers-toi de données, noms, et informations aléatoires pour rendre les phrases plus naturelles. Voici quelques exemples de la manière de procéder:\\n{examples}\\n\\\n",
143+
"Sors une seule phrase par ligne. Ne formate ni ne commente ta réponse de quelque manière que ce soit.\"\n",
144+
"\n",
145+
" user_prompt = f\"S'il te plaît, crée {volume} phrases en français du Type suivant: {nature}. Sors une seule phrase par ligne. \\\n",
146+
"Ne formate ni ne commente ta réponse de quelque manière que ce soit.\"\n",
147+
"\n",
148+
" messages = [\n",
149+
" {\"role\": \"system\", \"content\": system_message},\n",
150+
" {\"role\": \"user\", \"content\": user_prompt}\n",
151+
" ]\n",
152+
"\n",
153+
" if model == \"Llama\":\n",
154+
"\n",
155+
" quant_config = BitsAndBytesConfig(\n",
156+
" load_in_4bit=True,\n",
157+
" bnb_4bit_use_double_quant=True,\n",
158+
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
159+
" bnb_4bit_quant_type=\"nf4\"\n",
160+
" )\n",
161+
"\n",
162+
" tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
163+
" tokenizer.pad_token = tokenizer.eos_token\n",
164+
" inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
165+
" streamer = TextStreamer(tokenizer)\n",
166+
" model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)\n",
167+
" outputs = model.generate(inputs, max_new_tokens=10000)\n",
168+
"\n",
169+
" response = tokenizer.decode(outputs[0])\n",
170+
" sentences = list(re.finditer(\"(?:<\\|end_header_id\\|>)([^<]+)(?:<\\|eot_id\\|>)\", str(response), re.DOTALL))[-1].group(1)\n",
171+
"\n",
172+
" elif model == \"OpenAI\":\n",
173+
" response = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages)\n",
174+
" sentences = response.choices[0].message.content\n",
175+
"\n",
176+
" return sentences"
177+
],
178+
"metadata": {
179+
"id": "bEF8w_Mdd2Nb"
180+
},
181+
"execution_count": 7,
182+
"outputs": []
183+
},
184+
{
185+
"cell_type": "code",
186+
"source": [
187+
"global data\n",
188+
"data = \"\"\n",
189+
"\n",
190+
"with gr.Blocks(\n",
191+
" css=\"\"\"\n",
192+
" .red-button {\n",
193+
" background-color: darkred !important;\n",
194+
" border-color: red !important;\n",
195+
" }\n",
196+
" .blue-button {\n",
197+
" background-color: darkblue !important;\n",
198+
" border-color: blue !important;\n",
199+
" }\n",
200+
" .green-button {\n",
201+
" background-color: green !important;\n",
202+
" border-color: green !important;\n",
203+
" }\n",
204+
" \"\"\"\n",
205+
") as view:\n",
206+
" with gr.Row():\n",
207+
" title = gr.HTML(\"<h1><big>D</big>ataset Generator <small>PLUS</small></h1><h2>for English, German, and French</h2>\")\n",
208+
" subtitle = gr.HTML(\"<h3>Instructions:</h3><ol><li>Pick the language</li>\\\n",
209+
"<li>Select a model</li><li>Indicate how many sentences you need</li>\\\n",
210+
"<li>Describe the type of sentence you're looking for</li><li>Give up to three examples of the desired output sentence, and describe each of them briefly</li>\\\n",
211+
"<li>Hit <q>Generate sentences</q></li>\\\n",
212+
"<li>Save the output (.txt) to your Google Drive</li></ol>\")\n",
213+
" with gr.Row():\n",
214+
" language_choice = gr.Dropdown(choices=[\"English\", \"German\", \"French\"], label=\"Select language\", value=\"English\", interactive=True)\n",
215+
" model_choice = gr.Dropdown(choices=[\"Llama\", \"OpenAI\"], label=\"Select model\", value=\"Llama\", interactive=True)\n",
216+
" volume = gr.Textbox(label=\"Required number of sentences\", interactive=True)\n",
217+
" with gr.Row():\n",
218+
" typeInput = gr.Textbox(label=\"Short description of the kind of sentence you need\", interactive=True)\n",
219+
" with gr.Row():\n",
220+
" sentence_1 = gr.Textbox(label=\"Example sentence 1\", interactive=True)\n",
221+
" instruction_1 = gr.Textbox(label=\"Description\", interactive=True)\n",
222+
" with gr.Row():\n",
223+
" sentence_2 = gr.Textbox(label=\"Example sentence 2\", interactive=True)\n",
224+
" instruction_2 = gr.Textbox(label=\"Description\", interactive=True)\n",
225+
" with gr.Row():\n",
226+
" sentence_3 = gr.Textbox(label=\"Example sentence 3\", interactive=True)\n",
227+
" instruction_3 = gr.Textbox(label=\"Description\", interactive=True)\n",
228+
" with gr.Row():\n",
229+
" liveSentences = gr.Markdown(\n",
230+
" value='<div style=\"color: #999; padding: 10px;\">Your sentences will be displayed here …</div>',\n",
231+
" label=\"Generated sentences:\",\n",
232+
" min_height=60,\n",
233+
" max_height=200\n",
234+
" )\n",
235+
" with gr.Row():\n",
236+
" generate = gr.Button(value=\"Generate sentences\", elem_classes=\"blue-button\")\n",
237+
" with gr.Row():\n",
238+
" clear = gr.Button(value=\"Clear everything\", elem_classes=\"red-button\")\n",
239+
" with gr.Row():\n",
240+
" outputPath = gr.Textbox(label=\"Specify the desired name and location on your Google Drive for the sentences (plain text) to be saved\", interactive=True)\n",
241+
" with gr.Row():\n",
242+
" save = gr.Button(value=\"Save generated data\", elem_classes=\"blue-button\")\n",
243+
"\n",
244+
" def generateSentences(typeInput, s1, i1, s2, i2, s3, i3, volume, language, model):\n",
245+
" global data\n",
246+
" nature = \"\"\n",
247+
" shots = {}\n",
248+
" amount = int(volume) if re.search(\"^[0-9]+$\", volume) is not None else 10\n",
249+
"\n",
250+
" if typeInput != None:\n",
251+
" nature = typeInput\n",
252+
" else:\n",
253+
" nature = \"Random sentences of mixed nature\"\n",
254+
"\n",
255+
" if s1 != None:\n",
256+
" if i1 != None:\n",
257+
" shots[i1] = s1\n",
258+
" else:\n",
259+
" shots[\"A medium-long random sentence about anything\"] = s1\n",
260+
" else:\n",
261+
" shots[\"A medium-long random sentence about anything\"] = \"Paul, waking up out of his half-drunken haze, clearly couldn't tell left from right and ran right into the door.\"\n",
262+
"\n",
263+
" if s2 != None:\n",
264+
" if i2 != None:\n",
265+
" shots[i2] = s2\n",
266+
" else:\n",
267+
" shots[\"A medium-long random sentence about anything\"] = s2\n",
268+
"\n",
269+
" if s3 != None:\n",
270+
" if i3 != None:\n",
271+
" shots[i3] = s3\n",
272+
" else:\n",
273+
" shots[\"A medium-long random sentence about anything\"] = s3\n",
274+
"\n",
275+
" sentences = dataset_generator(model, nature, shots, amount, language)\n",
276+
" data = sentences\n",
277+
"\n",
278+
" return sentences\n",
279+
"\n",
280+
" def saveData(path):\n",
281+
" global data\n",
282+
" drive.mount(\"/content/drive\")\n",
283+
"\n",
284+
" dir_path = os.path.dirname(\"/content/drive/MyDrive/\" + path)\n",
285+
"\n",
286+
" if not os.path.exists(dir_path):\n",
287+
" os.makedirs(dir_path)\n",
288+
"\n",
289+
" with open(\"/content/drive/MyDrive/\" + path, \"w\", encoding=\"utf-8\") as f:\n",
290+
" f.write(data)\n",
291+
"\n",
292+
" generate.click(generateSentences, inputs=[typeInput, sentence_1, instruction_1, sentence_2, instruction_2, sentence_3, instruction_3, volume, language_choice, model_choice], outputs=liveSentences)\n",
293+
" clear.click(\n",
294+
" lambda: [\n",
295+
" gr.update(value=\"\"),\n",
296+
" gr.update(value=\"\"),\n",
297+
" gr.update(value=\"\"),\n",
298+
" gr.update(value=\"\"),\n",
299+
" gr.update(value=\"\"),\n",
300+
" gr.update(value=\"\"),\n",
301+
" gr.update(value=\"\"),\n",
302+
" gr.update(value=\"\"),\n",
303+
" gr.update(value='<div style=\"color: #999; padding: 10px;\">Your sentences will be displayed here …</div>'),\n",
304+
" gr.update(value=\"\"),\n",
305+
" gr.update(value=\"Save generated data\", elem_classes=\"blue-button\")],\n",
306+
" None,\n",
307+
" [volume, typeInput, sentence_1, instruction_1, sentence_2, instruction_2,\n",
308+
" sentence_3, instruction_3, liveSentences, outputPath, save],\n",
309+
" queue=False\n",
310+
" )\n",
311+
" save.click(saveData, inputs=outputPath, outputs=None).then(lambda: gr.update(value=\"Your data has been saved\", elem_classes=\"green-button\"), [], [save])\n",
312+
"\n",
313+
"view.launch(share=True) #, debug=True)"
314+
],
315+
"metadata": {
316+
"id": "VRKdu0fEt8mg"
317+
},
318+
"execution_count": null,
319+
"outputs": []
320+
}
321+
]
322+
}

0 commit comments

Comments
 (0)