diff --git a/Custom_Chatgpt_App_Langchain.ipynb b/Custom_Chatgpt_App_Langchain.ipynb
new file mode 100644
index 00000000..ee23d6cb
--- /dev/null
+++ b/Custom_Chatgpt_App_Langchain.ipynb
@@ -0,0 +1,236 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "mount_file_id": "1zRgNVxa_MvfWHSZL1XtHbm3qTsgNZq36",
+      "authorship_tag": "ABX9TyOJFcxTO8G5Qany7+BJ5/vY",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "\"Open"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**CUSTOM CHATGPT APP WITH LANGCHAIN**"
+      ],
+      "metadata": {
+        "id": "ia4UcI3Q-2ar"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install langchain"
+      ],
+      "metadata": {
+        "id": "xvdf7qBb_ZEJ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install openai"
+      ],
+      "metadata": {
+        "id": "kppbuXGkANgf"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install cohere"
+      ],
+      "metadata": {
+        "id": "poyvXwZ9AN9Z"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install tiktoken"
+      ],
+      "metadata": {
+        "id": "cGhZrptqAy1z"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "from getpass import getpass\n",
+        "\n",
+        "# Prompt for the key at runtime; a real API key should never be committed to a notebook\n",
+        "os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')"
+      ],
+      "metadata": {
+        "id": "GagnfiP1BJh_"
+      },
+      "execution_count": 6,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from langchain.chat_models import ChatOpenAI\n",
+        "from langchain.schema import SystemMessage\n",
+        "from langchain.chains import LLMChain\n",
+        "from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder\n",
+        "from langchain.memory import ConversationBufferMemory, FileChatMessageHistory\n",
+        "\n",
+        "llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=1)\n",
+        "\n",
+        "# File-backed history so the conversation persists across sessions\n",
+        "history = FileChatMessageHistory('chat_history.txt')\n",
+        "\n",
+        "memory = ConversationBufferMemory(\n",
+        "    memory_key='chat_history',\n",
+        "    chat_memory=history,\n",
+        "    return_messages=True\n",
+        ")\n",
+        "\n",
+        "prompt = ChatPromptTemplate(\n",
+        "    input_variables=['content'],\n",
+        "    messages=[\n",
+        "        SystemMessage(content='You are a chatbot having a conversation with a human.'),\n",
+        "        MessagesPlaceholder(variable_name='chat_history'),\n",
+        "        HumanMessagePromptTemplate.from_template('{content}')\n",
+        "    ]\n",
+        ")\n",
+        "\n",
+        "chain = LLMChain(\n",
+        "    llm=llm,\n",
+        "    memory=memory,\n",
+        "    prompt=prompt,\n",
+        "    verbose=True\n",
+        ")\n",
+        "\n",
+        "while True:\n",
+        "    content = input('Your prompt: ').strip()  # remove leading/trailing whitespace\n",
+        "    if not content:\n",
+        "        print('Please enter a prompt.')\n",
+        "        continue\n",
+        "\n",
+        "    if content in ['quit', 'exit', 'bye']:\n",
+        "        print('Goodbye!')\n",
+        "        break\n",
+        "\n",
+        "    response = chain.run({'content': content})\n",
+        "    print(response)\n",
+        "    print('-' * 50)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "EOrUGlyyBTcI",
+        "outputId": "1e7fbaae-2e12-4555-cfa9-d40567e033c1"
+      },
+      "execution_count": 7,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Your prompt: what is the distance of earth from the sun\n",
+            "\n",
+            "\n",
+            "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
+            "Prompt after formatting:\n",
+            "\u001b[32;1m\u001b[1;3mSystem: You are a chatbot having a conversation with a human.\n",
+            "Human: what is the distance of earth from the sun\u001b[0m\n",
+            "\n",
+            "\u001b[1m> Finished chain.\u001b[0m\n",
+            "The average distance between Earth and the Sun is approximately 93 million miles or about 150 million kilometers. This is known as an astronomical unit (AU) and is used as a standard unit to measure distances within our solar system.\n",
+            "--------------------------------------------------\n",
+            "Your prompt: what is the speed of light\n",
+            "\n",
+            "\n",
+            "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
+            "Prompt after formatting:\n",
+            "\u001b[32;1m\u001b[1;3mSystem: You are a chatbot having a conversation with a human.\n",
+            "Human: what is the distance of earth from the sun\n",
+            "AI: The average distance between Earth and the Sun is approximately 93 million miles or about 150 million kilometers. This is known as an astronomical unit (AU) and is used as a standard unit to measure distances within our solar system.\n",
+            "Human: what is the speed of light\u001b[0m\n",
+            "\n",
+            "\u001b[1m> Finished chain.\u001b[0m\n",
+            "The speed of light is approximately 186,282 miles per second (299,792 kilometers per second) in a vacuum. It is considered a universal constant and is denoted by the symbol \"c\" in physics equations. The speed of light is incredibly fast, and it is often used as a reference when discussing the vast distances and timescales in our universe.\n",
+            "--------------------------------------------------\n",
+            "Your prompt: how long does it take to reach earth\n",
+            "\n",
+            "\n",
+            "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
+            "Prompt after formatting:\n",
+            "\u001b[32;1m\u001b[1;3mSystem: You are a chatbot having a conversation with a human.\n",
+            "Human: what is the distance of earth from the sun\n",
+            "AI: The average distance between Earth and the Sun is approximately 93 million miles or about 150 million kilometers. This is known as an astronomical unit (AU) and is used as a standard unit to measure distances within our solar system.\n",
+            "Human: what is the speed of light\n",
+            "AI: The speed of light is approximately 186,282 miles per second (299,792 kilometers per second) in a vacuum. It is considered a universal constant and is denoted by the symbol \"c\" in physics equations. The speed of light is incredibly fast, and it is often used as a reference when discussing the vast distances and timescales in our universe.\n",
+            "Human: how long does it take to reach earth\u001b[0m\n",
+            "\n",
+            "\u001b[1m> Finished chain.\u001b[0m\n",
+            "To clarify, if you're referring to how long it takes for light from the Sun to reach Earth, it takes approximately 8 minutes and 20 seconds. This is because light travels at the speed of 299,792 kilometers per second, and with the average distance between the Sun and Earth being around 93 million miles or 150 million kilometers, it takes around 8 minutes and 20 seconds for light to travel this distance.\n",
+            "\n",
+            "However, if you are referring to how long it takes for something to physically travel from a certain distance to Earth, the time would depend on the speed of the object and the distance it needs to cover.\n",
+            "--------------------------------------------------\n",
+            "Your prompt: exit\n",
+            "Goodbye!\n"
+          ]
+        }
+      ]
+    },
+  ]
+}
\ No newline at end of file
diff --git a/LLM_Question_Answering_Application.ipynb b/LLM_Question_Answering_Application.ipynb
new file mode 100644
index 00000000..52a457bd
--- /dev/null
+++ b/LLM_Question_Answering_Application.ipynb
@@ -0,0 +1,207 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "mount_file_id": "10E5yOrYsLu1k3zhTg0ji9QG3o690Q_Wh",
+      "authorship_tag": "ABX9TyNYWsOTZcD6wahQCgxO8jJj",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "\"Open"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CrgGY3vJ0nYV"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -r /content/drive/MyDrive/Frontend/requirements.txt\n",
+        "!pip install chromadb\n",
+        "!pip install tiktoken\n",
+        "!pip install cohere"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%writefile app.py\n",
+        "# A Streamlit app cannot run inside the notebook kernel, so this cell is\n",
+        "# written to app.py and launched with `streamlit run` in the last cell.\n",
+        "import os\n",
+        "\n",
+        "import streamlit as st\n",
+        "from langchain.embeddings.openai import OpenAIEmbeddings\n",
+        "from langchain.vectorstores import Chroma\n",
+        "\n",
+        "\n",
+        "def load_document(file):\n",
+        "    name, extension = os.path.splitext(file)\n",
+        "\n",
+        "    if extension == '.pdf':\n",
+        "        from langchain.document_loaders import PyPDFLoader\n",
+        "        print(f'Loading {file}')\n",
+        "        loader = PyPDFLoader(file)\n",
+        "    elif extension == '.docx':\n",
+        "        from langchain.document_loaders import Docx2txtLoader\n",
+        "        print(f'Loading {file}')\n",
+        "        loader = Docx2txtLoader(file)\n",
+        "    elif extension == '.txt':\n",
+        "        from langchain.document_loaders import TextLoader\n",
+        "        print(f'Loading {file}')\n",
+        "        loader = TextLoader(file)\n",
+        "    else:\n",
+        "        print('Document format is not supported.')\n",
+        "        return None\n",
+        "\n",
+        "    data = loader.load()\n",
+        "    return data\n",
+        "\n",
+        "\n",
+        "def chunk_data(data, chunk_size=256, chunk_overlap=20):\n",
+        "    from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+        "    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
+        "    chunks = text_splitter.split_documents(data)\n",
+        "    return chunks\n",
+        "\n",
+        "\n",
+        "def create_embeddings(chunks):\n",
+        "    embeddings = OpenAIEmbeddings()\n",
+        "    vector_store = Chroma.from_documents(chunks, embeddings)\n",
+        "    return vector_store\n",
+        "\n",
+        "\n",
+        "def ask_and_get_answer(vector_store, q, k=3):\n",
+        "    from langchain.chains import RetrievalQA\n",
+        "    from langchain.chat_models import ChatOpenAI\n",
+        "\n",
+        "    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)\n",
+        "    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})\n",
+        "    chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)\n",
+        "\n",
+        "    answer = chain.run(q)\n",
+        "    return answer\n",
+        "\n",
+        "\n",
+        "def calculate_embedding_cost(texts):\n",
+        "    import tiktoken\n",
+        "    enc = tiktoken.encoding_for_model('text-embedding-ada-002')\n",
+        "    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)\n",
+        "    # text-embedding-ada-002 is billed at $0.0004 per 1,000 tokens\n",
+        "    return total_tokens, total_tokens / 1000 * 0.0004\n",
+        "\n",
+        "\n",
+        "def clear_history():\n",
+        "    if 'history' in st.session_state:\n",
+        "        del st.session_state['history']\n",
+        "\n",
+        "\n",
+        "if __name__ == '__main__':\n",
+        "    from dotenv import load_dotenv, find_dotenv\n",
+        "    load_dotenv(find_dotenv(), override=True)\n",
+        "\n",
+        "    st.image('/content/drive/MyDrive/Frontend/img.jpeg')\n",
+        "    st.subheader('LLM Question-Answering Application')\n",
+        "    with st.sidebar:\n",
+        "        api_key = st.text_input('OpenAI API Key:', type='password')\n",
+        "        if api_key:\n",
+        "            os.environ['OPENAI_API_KEY'] = api_key\n",
+        "\n",
+        "        uploaded_file = st.file_uploader('Upload a file:', type=['pdf', 'docx', 'txt'])\n",
+        "        chunk_size = st.number_input('Chunk size:', min_value=100, max_value=2048, value=512, on_change=clear_history)\n",
+        "        k = st.number_input('k', min_value=1, max_value=20, value=3, on_change=clear_history)\n",
+        "        add_data = st.button('Add Data', on_click=clear_history)\n",
+        "\n",
+        "        if uploaded_file and add_data:\n",
+        "            with st.spinner('Reading, chunking and embedding file...'):\n",
+        "                bytes_data = uploaded_file.read()\n",
+        "                file_name = os.path.join('./', uploaded_file.name)\n",
+        "                with open(file_name, 'wb') as f:\n",
+        "                    f.write(bytes_data)\n",
+        "\n",
+        "                data = load_document(file_name)\n",
+        "                chunks = chunk_data(data, chunk_size=chunk_size)\n",
+        "                st.write(f'Chunk size: {chunk_size}, chunks: {len(chunks)}')\n",
+        "\n",
+        "                tokens, embedding_cost = calculate_embedding_cost(chunks)\n",
+        "                st.write(f'Embedding cost: ${embedding_cost:.4f}')\n",
+        "\n",
+        "                # Keep the vector store in the session so later questions can reuse it\n",
+        "                vector_store = create_embeddings(chunks)\n",
+        "                st.session_state.vs = vector_store\n",
+        "\n",
+        "                st.success('File uploaded, chunked and embedded successfully.')\n",
+        "\n",
+        "    q = st.text_input('Ask a question about the content of your file:')\n",
+        "    if q:\n",
+        "        if 'vs' in st.session_state:\n",
+        "            vector_store = st.session_state.vs\n",
+        "            answer = ask_and_get_answer(vector_store, q, k)\n",
+        "            st.text_area('LLM Answer:', value=answer)\n",
+        "\n",
+        "            st.divider()\n",
+        "            if 'history' not in st.session_state:\n",
+        "                st.session_state.history = ''\n",
+        "\n",
+        "            value = f'Q: {q}\\nA: {answer}'\n",
+        "            st.session_state.history = f'{value}\\n{\"-\" * 100}\\n{st.session_state.history}'\n",
+        "            h = st.session_state.history\n",
+        "            st.text_area(label='Chat History', value=h, key='history', height=400)"
+      ],
+      "metadata": {
+        "id": "Cw2UIcc22jjQ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
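+    {
+      "cell_type": "markdown",
+      "source": [
+        "A minimal, self-contained sketch of what `chunk_data` does, runnable without uploading a file (the in-memory document below is made up): `RecursiveCharacterTextSplitter` cuts the text into pieces of at most `chunk_size` characters, with `chunk_overlap` characters shared between neighbouring chunks so context is preserved across boundaries."
+      ],
+      "metadata": {}
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Illustration only; not part of the Streamlit app above.\n",
+        "from langchain.docstore.document import Document\n",
+        "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+        "\n",
+        "doc = Document(page_content='word ' * 300)  # 1,500 characters of dummy text\n",
+        "splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=20)\n",
+        "chunks = splitter.split_documents([doc])\n",
+        "print(f'{len(chunks)} chunks; first chunk has {len(chunks[0].page_content)} characters')"
+      ],
+      "metadata": {},
+      "execution_count": null,
+      "outputs": []
+    },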
+    {
+      "cell_type": "code",
+      "source": [
+        "# Launch the app saved by the %%writefile cell above. Streamlit serves on\n",
+        "# port 8501; localtunnel (Node is preinstalled on Colab) is one way to\n",
+        "# expose it from the notebook VM.\n",
+        "!streamlit run app.py &>/content/logs.txt &\n",
+        "!npx localtunnel --port 8501"
+      ],
+      "metadata": {
+        "id": "q_02qlJzh2Jd"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file