week 3 challenge

udomai · udomai · commit f515a9c8c009 · 2025-02-23T20:18:41.000+01:00
diff --git a/week3/community-contributions/en-de-fr_dataset_generator.ipynb b/week3/community-contributions/en-de-fr_dataset_generator.ipynb
@@ -0,0 +1,322 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4",
+      "authorship_tag": "ABX9TyPxJzufoQPtui+nhl1J1xiR"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "yqlQTsxNdKrN"
+      },
+      "outputs": [],
+      "source": [
+        "# Install the notebook's dependencies with the %pip magic so they land in the running kernel's environment.\n",
+        "# NOTE(review): httpx is the only pinned package; presumably for compatibility with the openai client - confirm.\n",
+        "%pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "import requests\n",
+        "from IPython.display import Markdown, display, update_display\n",
+        "from openai import OpenAI\n",
+        "from google.colab import drive\n",
+        "from huggingface_hub import login\n",
+        "from google.colab import userdata\n",
+        "from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig\n",
+        "import torch\n",
+        "import gradio as gr\n",
+        "import re"
+      ],
+      "metadata": {
+        "id": "eyfvQrLxdkGT"
+      },
+      "execution_count": 2,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# one can always add more models, of course\n",
+        "\n",
+        "LLAMA = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
+        "OPENAI_MODEL = \"gpt-4o-mini\""
+      ],
+      "metadata": {
+        "id": "WW-cSZk7dnp6"
+      },
+      "execution_count": 3,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Hugging Face: read the token from the Colab secret store and log in with it\n",
+        "# (add_to_git_credential=True also hands the token to the git credential helper).\n",
+        "hf_token = userdata.get(\"HF_TOKEN\")\n",
+        "login(token=hf_token, add_to_git_credential=True)\n",
+        "\n",
+        "# OpenAI: read the API key from the Colab secret store and build the client used by dataset_generator().\n",
+        "openai_api_key = userdata.get(\"OPENAI_API_KEY\")\n",
+        "openai = OpenAI(api_key=openai_api_key)"
+      ],
+      "metadata": {
+        "id": "XG7Iam6Rdw8F"
+      },
+      "execution_count": 4,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "force_dark_mode = \"\"\"\n",
+        "function refresh() {\n",
+        "    const url = new URL(window.location);\n",
+        "    if (url.searchParams.get('__theme') !== 'dark') {\n",
+        "        url.searchParams.set('__theme', 'dark');\n",
+        "        window.location.href = url.href;\n",
+        "    }\n",
+        "}\n",
+        "\"\"\""
+      ],
+      "metadata": {
+        "id": "Ov7WSdx9dzSt"
+      },
+      "execution_count": 5,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def dataset_generator(model, nature, shots, volume, language):\n",
+        "\n",
+        "  examples = \"Instruction: 'Make a random sentence.'\\nAnswer: 'When I got home last night, I couldn't believe my eyes: All the pineapples had been removed from the pizza.'\"\n",
+        "  system_message = \"You are a random sentence generator. Generate 10 diverse English sentences.\"\n",
+        "  user_prompt = f\"Generate 10 random English sentences, like so:\\n{examples}\"\n",
+        "  sentences = \"\"\n",
+        "\n",
+        "  if language == \"English\":\n",
+        "\n",
+        "    for shot in list(shots.keys()):\n",
+        "      examples += f\"\\nExample instruction: '{shot}'\\nExample answer: '{shots[shot]}'\\n\"\n",
+        "\n",
+        "    system_message = f\"You are a state-of-the art linguistic dataset compiler. You are given a 'Type' of sentence to create. \\\n",
+        "Within the bounds of that type, create {volume} diverse sentences with differing structures and lengths. Make the sentences plausible, \\\n",
+        "but be creative in filling them with random concrete information, names, and data. Here are some examples for how to go about that:\\n{examples}\\n\\\n",
+        "Just output one sentence per line. Do not comment or format yor output in any way, shape, or form.\"\n",
+        "\n",
+        "    user_prompt = f\"Generate {volume} English sentences of the following Type: {nature}. Just output one sentence per line. \\\n",
+        "Do not comment or format yor output in any way, shape, or form.\"\n",
+        "\n",
+        "  elif language == \"German\":\n",
+        "\n",
+        "    for shot in list(shots.keys()):\n",
+        "      examples += f\"\\nAnweisung: '{shot}'\\nAntwort: '{shots[shot]}'\\n\"\n",
+        "\n",
+        "    system_message = f\"Du bist ein weltklasse Datensatz-Sammler für Sprachdaten. Du erhältst einen 'Typ' von Sätzen, die du erstellen sollst. \\\n",
+        "Im Rahmen dieses Typs, generiere {volume} untereinander verschiedene Sätze mit unterschiedlichen Satzlängen und -strukturen. Mache die Beispielsätze \\\n",
+        "plausibel, aber fülle sie kreativ mit willkürlichen Informationen, Namen, und Daten aller Art. Hier sind ein paar Beispiel, wie du vorgehen sollst:\\n{examples}\\n\\\n",
+        "Gib einfach einen Satz pro Zeile aus. Kommentiere oder formatiere deine Antwort in keinster Weise.\"\n",
+        "\n",
+        "    user_prompt = f\"Generiere {volume} deutsche Sätze des folgenden Typs: {nature}. Gib einfach einen Satz pro Zeile aus. \\\n",
+        "Kommentiere oder formatiere deine Antwort in keiner Weise.\"\n",
+        "\n",
+        "  elif language == \"French\":\n",
+        "\n",
+        "    for shot in list(shots.keys()):\n",
+        "      examples += f\"\\nConsigne: '{shot}'\\nRéponse: '{shots[shot]}'\\n\"\n",
+        "\n",
+        "    system_message = f\"Tu es un outil linguistique de pointe, à savoir, un genérateur de données linguistiques. Tu seras assigné un 'Type' de phrases à créer. \\\n",
+        "Dans le cadre de ce type-là, crée {volume} phrases diverses, avec des structures et longueurs qui varient. Génère des phrases qui soient plausibles, \\\n",
+        "mais sois créatif, et sers-toi de données, noms, et informations aléatoires pour rendre les phrases plus naturelles. Voici quelques examples comment faire:\\n{examples}\\n\\\n",
+        "Sors une seule phrase par ligne. Ne formatte ni commente ta réponse en aucune manière que ce soit.\"\n",
+        "\n",
+        "    user_prompt = f\"S'il te plaît, crée {volume} phrases en français du Type suivant: {nature}. Sors une seule phrase par ligne. \\\n",
+        "Ne formatte ni commente ta réponse en aucune manière que ce soit.\"\n",
+        "\n",
+        "  messages = [\n",
+        "      {\"role\": \"system\", \"content\": system_message},\n",
+        "      {\"role\": \"user\", \"content\": user_prompt}\n",
+        "    ]\n",
+        "\n",
+        "  if model == \"Llama\":\n",
+        "\n",
+        "    quant_config = BitsAndBytesConfig(\n",
+        "        load_in_4bit=True,\n",
+        "        bnb_4bit_use_double_quant=True,\n",
+        "        bnb_4bit_compute_dtype=torch.bfloat16,\n",
+        "        bnb_4bit_quant_type=\"nf4\"\n",
+        "    )\n",
+        "\n",
+        "    tokenizer = AutoTokenizer.from_pretrained(LLAMA)\n",
+        "    tokenizer.pad_token = tokenizer.eos_token\n",
+        "    inputs = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(\"cuda\")\n",
+        "    streamer = TextStreamer(tokenizer)\n",
+        "    model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map=\"auto\", quantization_config=quant_config)\n",
+        "    outputs = model.generate(inputs, max_new_tokens=10000)\n",
+        "\n",
+        "    response  = tokenizer.decode(outputs[0])\n",
+        "    sentences = list(re.finditer(\"(?:<\\|end_header_id\\|>)([^<]+)(?:<\\|eot_id\\|>)\", str(response), re.DOTALL))[-1].group(1)\n",
+        "\n",
+        "  elif model == \"OpenAI\":\n",
+        "    response = openai.chat.completions.create(model=OPENAI_MODEL, messages=messages)\n",
+        "    sentences = response.choices[0].message.content\n",
+        "\n",
+        "  return sentences"
+      ],
+      "metadata": {
+        "id": "bEF8w_Mdd2Nb"
+      },
+      "execution_count": 7,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Most recently generated sentences, shared by the generate, clear and save callbacks.\n",
+        "data = \"\"\n",
+        "\n",
+        "# Folder on the mounted Google Drive that output paths are resolved against.\n",
+        "DRIVE_ROOT = \"/content/drive/MyDrive\"\n",
+        "\n",
+        "# Text shown in the output area while there is nothing to display.\n",
+        "PLACEHOLDER = '<div style=\"color: #999; padding: 10px;\">Your sentences will be displayed here …</div>'\n",
+        "\n",
+        "# Fallbacks used when the corresponding form fields are left empty.\n",
+        "DEFAULT_VOLUME = 10\n",
+        "DEFAULT_NATURE = \"Random sentences of mixed nature\"\n",
+        "DEFAULT_DESCRIPTION = \"A medium-long random sentence about anything\"\n",
+        "DEFAULT_EXAMPLE = \"Paul, waking up out of his half-drunken haze, clearly couldn't tell left from right and ran right into the door.\"\n",
+        "\n",
+        "\n",
+        "def _text(value):\n",
+        "  \"\"\"Return a textbox value as a stripped string; None becomes an empty string.\"\"\"\n",
+        "  return \"\" if value is None else str(value).strip()\n",
+        "\n",
+        "\n",
+        "with gr.Blocks(\n",
+        "        css=\"\"\"\n",
+        "    .red-button {\n",
+        "        background-color: darkred !important;\n",
+        "        border-color: red !important;\n",
+        "    }\n",
+        "    .blue-button {\n",
+        "        background-color: darkblue !important;\n",
+        "        border-color: blue !important;\n",
+        "    }\n",
+        "    .green-button {\n",
+        "        background-color: green !important;\n",
+        "        border-color: green !important;\n",
+        "    }\n",
+        "    \"\"\"\n",
+        ") as view:\n",
+        "  with gr.Row():\n",
+        "    title = gr.HTML(\"<h1><big>D</big>ataset Generator <small>PLUS</small></h1><h2>for English, German, and French</h2>\")\n",
+        "    subtitle = gr.HTML(\"<h3>Instructions:</h3><ol><li>Pick the language</li>\\\n",
+        "<li>Select a model</li><li>Indicate how many sentences you need</li>\\\n",
+        "<li>Describe the type of sentence you're looking for</li><li>Give up to three examples of the desired output sentence, and describe each of them briefly</li>\\\n",
+        "<li>Hit <q>Generate sentences</q></li>\\\n",
+        "<li>Save the output (.txt) to your Google Drive</li></ol>\")\n",
+        "  with gr.Row():\n",
+        "    language_choice = gr.Dropdown(choices=[\"English\", \"German\", \"French\"], label=\"Select language\", value=\"English\", interactive=True)\n",
+        "    model_choice    = gr.Dropdown(choices=[\"Llama\", \"OpenAI\"], label=\"Select model\", value=\"Llama\", interactive=True)\n",
+        "    volume = gr.Textbox(label=\"Required number of sentences\", interactive=True)\n",
+        "  with gr.Row():\n",
+        "    typeInput = gr.Textbox(label=\"Short description of the kind of sentence you need\", interactive=True)\n",
+        "  with gr.Row():\n",
+        "    sentence_1    = gr.Textbox(label=\"Example sentence 1\", interactive=True)\n",
+        "    instruction_1 = gr.Textbox(label=\"Description\", interactive=True)\n",
+        "  with gr.Row():\n",
+        "    sentence_2    = gr.Textbox(label=\"Example sentence 2\", interactive=True)\n",
+        "    instruction_2 = gr.Textbox(label=\"Description\", interactive=True)\n",
+        "  with gr.Row():\n",
+        "    sentence_3    = gr.Textbox(label=\"Example sentence 3\", interactive=True)\n",
+        "    instruction_3 = gr.Textbox(label=\"Description\", interactive=True)\n",
+        "  with gr.Row():\n",
+        "    liveSentences = gr.Markdown(\n",
+        "        value=PLACEHOLDER,\n",
+        "        label=\"Generated sentences:\",\n",
+        "         min_height=60,\n",
+        "         max_height=200\n",
+        "        )\n",
+        "  with gr.Row():\n",
+        "    generate = gr.Button(value=\"Generate sentences\", elem_classes=\"blue-button\")\n",
+        "  with gr.Row():\n",
+        "    clear = gr.Button(value=\"Clear everything\", elem_classes=\"red-button\")\n",
+        "  with gr.Row():\n",
+        "    outputPath  = gr.Textbox(label=\"Specify the desired name and location on your Google Drive for the sentences (plain text) to be saved\", interactive=True)\n",
+        "  with gr.Row():\n",
+        "    save  = gr.Button(value=\"Save generated data\", elem_classes=\"blue-button\")\n",
+        "\n",
+        "  def generateSentences(typeInput, s1, i1, s2, i2, s3, i3, volume, language, model):\n",
+        "    \"\"\"Turn the form input into a generation request and return the generated text.\n",
+        "\n",
+        "    Empty fields fall back to defaults: 10 sentences, a generic sentence type, and\n",
+        "    one built-in example when no example sentence was entered at all. The result\n",
+        "    is also stored in the module-level `data` so that saveData() can write it.\n",
+        "    \"\"\"\n",
+        "    global data\n",
+        "\n",
+        "    # Only a positive whole number is accepted; anything else means the default amount.\n",
+        "    volume_text = _text(volume)\n",
+        "    amount = int(volume_text) if re.fullmatch(\"[0-9]+\", volume_text) else 0\n",
+        "    if amount < 1:\n",
+        "      amount = DEFAULT_VOLUME\n",
+        "\n",
+        "    # Gradio delivers empty textboxes as \"\" (not None), hence the truthiness checks.\n",
+        "    nature = _text(typeInput) or DEFAULT_NATURE\n",
+        "\n",
+        "    shots = {}\n",
+        "    for sentence, description in ((s1, i1), (s2, i2), (s3, i3)):\n",
+        "      sentence = _text(sentence)\n",
+        "      if not sentence:\n",
+        "        continue\n",
+        "      base = _text(description) or DEFAULT_DESCRIPTION\n",
+        "      # shots is keyed by description, so identical descriptions get a numeric suffix\n",
+        "      # instead of silently overwriting an earlier example.\n",
+        "      key, suffix = base, 2\n",
+        "      while key in shots:\n",
+        "        key = f\"{base} ({suffix})\"\n",
+        "        suffix += 1\n",
+        "      shots[key] = sentence\n",
+        "\n",
+        "    if not shots:\n",
+        "      shots[DEFAULT_DESCRIPTION] = DEFAULT_EXAMPLE\n",
+        "\n",
+        "    sentences = dataset_generator(model, nature, shots, amount, language)\n",
+        "    data = sentences or \"\"\n",
+        "\n",
+        "    return data\n",
+        "\n",
+        "  def saveData(path):\n",
+        "    \"\"\"Write the most recently generated sentences to a text file on Google Drive.\n",
+        "\n",
+        "    Args:\n",
+        "      path: File name, optionally with sub-folders, relative to DRIVE_ROOT.\n",
+        "\n",
+        "    Raises:\n",
+        "      gr.Error: If nothing has been generated yet, or if `path` is empty, names a\n",
+        "        folder, or resolves to a location outside DRIVE_ROOT.\n",
+        "    \"\"\"\n",
+        "    if not data:\n",
+        "      raise gr.Error(\"There is nothing to save yet. Generate sentences first.\")\n",
+        "\n",
+        "    relative = _text(path)\n",
+        "    if not relative or relative.endswith(\"/\"):\n",
+        "      raise gr.Error(\"Please enter a file name for the output.\")\n",
+        "\n",
+        "    # The app is shared publicly (share=True), so the path is untrusted: normalise it and\n",
+        "    # refuse anything (e.g. '../..') that would end up outside the Drive folder.\n",
+        "    target = os.path.normpath(os.path.join(DRIVE_ROOT, relative.lstrip(\"/\")))\n",
+        "    if not target.startswith(DRIVE_ROOT + os.sep):\n",
+        "      raise gr.Error(\"Please enter a file path inside your Google Drive.\")\n",
+        "\n",
+        "    drive.mount(\"/content/drive\")\n",
+        "    os.makedirs(os.path.dirname(target), exist_ok=True)\n",
+        "\n",
+        "    with open(target, \"w\", encoding=\"utf-8\") as f:\n",
+        "      f.write(data)\n",
+        "\n",
+        "  def clearAll():\n",
+        "    \"\"\"Reset all inputs, the output area and the save button, and forget the stored sentences.\"\"\"\n",
+        "    global data\n",
+        "    data = \"\"\n",
+        "    # Eight text inputs, then the output area, the output path and the save button,\n",
+        "    # in the same order as the outputs list of clear.click() below.\n",
+        "    return [gr.update(value=\"\") for _ in range(8)] + [\n",
+        "        gr.update(value=PLACEHOLDER),\n",
+        "        gr.update(value=\"\"),\n",
+        "        gr.update(value=\"Save generated data\", elem_classes=\"blue-button\")]\n",
+        "\n",
+        "  generate.click(generateSentences, inputs=[typeInput, sentence_1, instruction_1, sentence_2, instruction_2, sentence_3, instruction_3, volume, language_choice, model_choice], outputs=liveSentences)\n",
+        "  clear.click(\n",
+        "      clearAll,\n",
+        "      None,\n",
+        "      [volume, typeInput, sentence_1, instruction_1, sentence_2, instruction_2,\n",
+        "         sentence_3, instruction_3, liveSentences, outputPath, save],\n",
+        "      queue=False\n",
+        "      )\n",
+        "  # .success() (unlike .then()) only runs when saveData finished without raising,\n",
+        "  # so the button no longer claims success after a failed save.\n",
+        "  save.click(saveData, inputs=outputPath, outputs=None).success(lambda: gr.update(value=\"Your data has been saved\", elem_classes=\"green-button\"), [], [save])\n",
+        "\n",
+        "view.launch(share=True) #, debug=True)"
+      ],
+      "metadata": {
+        "id": "VRKdu0fEt8mg"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}