diff --git a/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb b/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb index 1d6eab48aea..91a3aec6099 100644 --- a/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb +++ b/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb @@ -129,7 +129,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "169978da6d5942bd8146676f3bf5db8b", + "model_id": "1e8ca46ac6734f8c816a14cbe46964ce", "version_major": 2, "version_minor": 0 }, @@ -223,15 +223,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-12-12 21:55:41.474562: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2023-12-12 21:55:41.476575: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2023-12-12 21:55:41.501573: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2023-12-12 21:55:41.501593: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2023-12-12 21:55:41.501613: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2023-12-12 21:55:41.506678: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2023-12-12 21:55:41.507421: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "2023-12-21 21:33:05.855788: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-12-21 21:33:05.857870: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-12-21 21:33:05.883126: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "2023-12-21 21:33:05.883147: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "2023-12-21 21:33:05.883167: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2023-12-21 21:33:05.888388: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-12-21 21:33:05.889023: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-12-12 21:55:42.037050: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + "2023-12-21 21:33:06.449452: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" ] } ], @@ -292,7 +292,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "671a0e89103d496eb4cdce5f6c70904a", + "model_id": "c802a1fb556c4abdb38b967c02ef3ef6", "version_major": 2, "version_minor": 0 }, @@ -306,7 +306,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f6376de7c87145f2a00de1c3e6edf4c6", + "model_id": "43b1bd84b5ef4fb0b015411fa3edc862", "version_major": 2, "version_minor": 0 }, @@ -320,7 +320,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1126a5495fb5426588da00e518586bde", + "model_id": "ec15e0c8aaa54fc080d9d8d8938c233a", "version_major": 2, "version_minor": 0 }, @@ -593,7 +593,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "837b4a3b-ccc3-4004-9577-2b2c7b802dea", "metadata": { "tags": [] @@ -602,15 +602,15 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "4af32a190c7a4896a06743fe05c7b56b", + "model_id": "54ae70217dbd4299974e24aae599957e", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='CPU')" + "Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU')" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -637,7 +637,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "5333ab9b-ff5d-4a7f-bcdc-9cca5d56dc0a", "metadata": { "tags": [] @@ -659,7 +659,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "3536a1a7", "metadata": { "collapsed": false, @@ -671,15 +671,15 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0f954bac863d48f5ab0b9eb779f0a82d", + "model_id": "03dc64b5e12e4fb79fb36a63ffef2ef2", "version_major": 2, "version_minor": 
0 }, "text/plain": [ - "Dropdown(description='Model to run:', options=('INT4', 'FP16'), value='INT4')" + "Dropdown(description='Model to run:', options=('FP16',), value='FP16')" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -705,7 +705,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "7a041101-7336-40fd-96c9-cd298015a0f3", "metadata": { "tags": [] @@ -715,7 +715,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Loading model from chatglm2-6b/INT4_compressed_weights\n" + "Loading model from chatglm3-6b/FP16\n" ] }, { @@ -760,7 +760,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "8f6f7596-5677-4931-875b-aaabfa23cabc", "metadata": {}, "outputs": [ @@ -768,7 +768,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/ea/work/openvino_notebooks/notebooks/254-llm-chatbot/ov_llm_model.py:400: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", + "/home/ethan/intel/openvino_notebooks/notebooks/254-llm-chatbot/ov_llm_model.py:400: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", " self.request.start_async(inputs, shared_memory=True)\n" ] }, @@ -833,12 +833,50 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "01f8f7f8-072e-45dc-b7c9-18d8c3c47754", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running on local URL: http://10.3.233.70:4768\n", + "\n", + "To create a public link, set `share=True` in `launch()`.\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ethan/intel/openvino_notebooks/notebooks/254-llm-chatbot/ov_llm_model.py:400: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", + " self.request.start_async(inputs, shared_memory=True)\n" + ] + } + ], "source": [ "from threading import Event, Thread\n", "from uuid import uuid4\n", @@ -922,7 +960,7 @@ "\n", "def default_partial_text_processor(partial_text: str, new_text: str):\n", " \"\"\"\n", - " helper for updating partially generated answer, used by de\n", + " helper for updating partially generated answer, used by default\n", "\n", " Params:\n", " partial_text: text buffer for storing previosly generated text\n", @@ -972,7 +1010,7 @@ " return text\n", "\n", "\n", - "def user(text, history):\n", + "def user(message, history):\n", " \"\"\"\n", " callback function for updating user messages in interface on submit button click\n", "\n", @@ -983,8 +1021,7 @@ " None\n", " \"\"\"\n", " # Append the user's message to the conversation history\n", - " history = history + [(text, None)]\n", - " return \"\", history\n", + " return \"\", history + [[message, \"\"]]\n", "\n", "\n", "def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):\n", @@ -1184,13 +1221,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "7b837f9e-4152-4a5c-880a-ed874aa64a74", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Closing server running on port: 4768\n" + ] + } + ], "source": [ "# please run this cell for stopping gradio interface\n", - "# demo.close()" + "demo.close()" ] } ], diff --git a/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb b/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb index 4628c583daf..f6d1def9712 100644 --- a/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb +++ b/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb @@ -51,10 +51,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "1f077b32-5d36-44b0-9041-407e996283a3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Skipping openvino-dev as it is not installed.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Skipping openvino as it is not installed.\u001b[0m\u001b[33m\n", + "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip uninstall -q -y openvino-dev openvino openvino-nightly\n", "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu\\\n", @@ -62,7 +76,7 @@ "\"nncf>=2.7\"\\\n", "\"openvino-nightly\"\\\n", "\"gradio\"\\\n", - "\"onnx\" 
\"chromadb\" \"sentence_transformers\" \"langchain\" \"langchainhub\" \"transformers>=4.34.0\" \"unstructured\" \"scikit-learn\"" + "\"onnx\" \"chromadb\" \"sentence_transformers\" \"langchain\" \"langchainhub\" \"transformers>=4.34.0\" \"unstructured\" \"scikit-learn\" \"python-docx\"" ] }, { @@ -112,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "id": "d3b57cfb-e727-43a5-b2c9-8f1b1ba72061", "metadata": {}, "outputs": [ @@ -127,15 +141,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-12-12 22:01:30.686211: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2023-12-12 22:01:30.688149: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2023-12-12 22:01:30.712502: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2023-12-12 22:01:30.712522: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2023-12-12 22:01:30.712540: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2023-12-12 22:01:30.717373: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2023-12-12 22:01:30.718189: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "2023-12-24 07:00:22.754016: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-12-24 07:00:22.756105: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-12-24 07:00:22.781727: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "2023-12-24 07:00:22.781748: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "2023-12-24 07:00:22.781768: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2023-12-24 07:00:22.787005: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-12-24 07:00:22.787908: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-12-12 22:01:31.258810: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + "2023-12-24 07:00:23.453118: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" ] } ], @@ -176,14 +190,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "id": "37bf49d7", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "4241d5e2520f4647bc6728dd80c90927", + "model_id": "84b3d28a335c4479b31f1ea88c95c257", "version_major": 2, "version_minor": 0 }, @@ -191,7 +205,7 @@ "Dropdown(description='LLM Model:', options=('tiny-llama-1b-chat', 'red-pajama-3b-chat', 'llama-2-chat-7b', 'mp…" ] }, - "execution_count": 4, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -213,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "id": "49ea95f8", "metadata": {}, "outputs": [ @@ -221,7 +235,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selected LLM model chatglm3-6b\n" + "Selected LLM model zephyr-7b-beta\n" ] } ], @@ -288,14 +302,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "id": "c6a38153", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "329f80b26f2c496fbdc5b6438a6d405a", + "model_id": "4d30072191ce48dab051838b66af7eb1", "version_major": 2, "version_minor": 0 }, @@ -309,7 +323,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b7c42390041c4cebb40808aed3df1de3", + "model_id": "aa20a85b31ba4c03a09eedaa7bcb9917", "version_major": 2, "version_minor": 0 }, @@ -323,7 +337,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6ae4adffa07d4888a88eeb686b0a1229", + "model_id": "354c2cf647db495fb9034b3ab1c756da", "version_major": 2, "version_minor": 0 }, @@ -361,7 +375,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "id": "2020d522", "metadata": {}, "outputs": [], @@ -537,7 +551,7 @@ }, { "cell_type": "code", - "execution_count": 
9, + "execution_count": 6, "id": "8e127215", "metadata": {}, "outputs": [ @@ -545,7 +559,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Size of FP16 model is 11909.69 MB\n" + "Size of FP16 model is 27657.02 MB\n", + "Size of model with INT4 compressed weights is 5053.39 MB\n", + "Compression rate for INT4 model: 5.473\n" ] } ], @@ -580,22 +596,22 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "id": "ff80e6eb-7923-40ef-93d8-5e6c56e50667", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f2a58d5b28be4b5284745d65ff60540f", + "model_id": "27d89272c4f84c9fb06ec783816e34f5", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Embedding Model:', options=('all-mpnet-base-v2', 'text2vec-large-chinese'), value='all-m…" + "Dropdown(description='Embedding Model:', options=('all-mpnet-base-v2',), value='all-mpnet-base-v2')" ] }, - "execution_count": 10, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -618,7 +634,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "id": "790afcf8", "metadata": {}, "outputs": [ @@ -626,7 +642,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selected text2vec-large-chinese model\n" + "Selected all-mpnet-base-v2 model\n" ] } ], @@ -637,7 +653,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "id": "58d75dad-2eeb-4edd-8d12-d77a365f8eda", "metadata": { "scrolled": true @@ -670,14 +686,14 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "id": "e11e73cf", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "fca81d3e662d4bb7ad99929de50814bd", + "model_id": "27966ce0a9304402975dbe2c86f76e87", "version_major": 2, "version_minor": 0 }, @@ -685,7 +701,7 @@ "Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU')" ] }, - "execution_count": 13, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -704,10 +720,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "9ab29b85", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Embedding model will be loaded to CPU device for response generation\n" + ] + } + ], "source": [ "print(f\"Embedding model will be loaded to {embedding_device.value} device for response generation\")" ] @@ -723,10 +747,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "6d044d01", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "12be7dcd8325436f9467b0474d9d38ac", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "llm_device = widgets.Dropdown(\n", " options=core.available_devices + [\"AUTO\"],\n", @@ -740,10 +780,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "348b90fe", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LLM model will be loaded to CPU device for response generation\n" + ] + } + ], "source": [ "print(f\"LLM model will be loaded to {llm_device.value} device for response generation\")" ] 
@@ -770,7 +818,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "id": "df3e8fd1-d4c1-4e33-b46e-7840e392f8ee", "metadata": {}, "outputs": [ @@ -817,7 +865,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "efe29701", "metadata": {}, "outputs": [], @@ -827,22 +875,22 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "id": "8b014f24-aa5b-4d40-924d-d579ad7fcec6", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "30e4be3151294400b6595e83f69a04ec", + "model_id": "7b3c2991f3284deea331b5806a336d77", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Model to run:', options=('FP16',), value='FP16')" + "Dropdown(description='Model to run:', options=('INT4', 'FP16'), value='INT4')" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -868,24 +916,30 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "f7f708db-8de1-4efd-94b2-fcabc48d52f4", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "The argument `trust_remote_code` is to be used along with export=True. It will be ignored.\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Loading model from chatglm3-6b/FP16\n" + "Loading model from zephyr-7b-beta/INT4_compressed_weights\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "The argument `trust_remote_code` is to be used along with export=True. It will be ignored.\n", - "Compiling the model to CPU ...\n", - "Setting OpenVINO CACHE_DIR to chatglm3-6b/FP16/model_cache\n" + "Compiling the model to CPU ...\n" ] } ], @@ -949,7 +1003,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "6040e0d0", "metadata": {}, "outputs": [], @@ -962,6 +1016,11 @@ " tokenizer=tok,\n", " max_new_tokens=256,\n", " streamer=streamer,\n", + " # temperature=1,\n", + " # do_sample=True,\n", + " # top_p=0.8,\n", + " # top_k=20,\n", + " # repetition_penalty=1.1,\n", ")\n", "if stop_tokens is not None:\n", " generate_kwargs[\"stopping_criteria\"] = StoppingCriteriaList(stop_tokens)\n", @@ -1005,6 +1064,72 @@ { "cell_type": "code", "execution_count": 19, + "id": "5b97eeeb", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List\n", + "from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, MarkdownTextSplitter\n", + "from langchain.document_loaders import (\n", + " CSVLoader,\n", + " EverNoteLoader,\n", + " PDFMinerLoader,\n", + " TextLoader,\n", + " UnstructuredEPubLoader,\n", + " UnstructuredHTMLLoader,\n", + " UnstructuredMarkdownLoader,\n", + " UnstructuredODTLoader,\n", + " UnstructuredPowerPointLoader,\n", + " UnstructuredWordDocumentLoader, )\n", + "\n", + "\n", + "class ChineseTextSplitter(CharacterTextSplitter):\n", + " def __init__(self, pdf: bool = False, **kwargs):\n", + " super().__init__(**kwargs)\n", + " self.pdf = pdf\n", + "\n", + " def split_text(self, text: str) -> List[str]:\n", + " if self.pdf:\n", + " text = re.sub(r\"\\n{3,}\", \"\\n\", text)\n", + " text = text.replace(\"\\n\\n\", \"\")\n", + " sent_sep_pattern = re.compile(\n", + " '([﹒﹔﹖﹗.。!?][\"’”」』]{0,2}|(?=[\"‘“「『]{1,2}|$))')\n", + " sent_list = []\n", + " for ele in 
sent_sep_pattern.split(text):\n", + " if sent_sep_pattern.match(ele) and sent_list:\n", + " sent_list[-1] += ele\n", + " elif ele:\n", + " sent_list.append(ele)\n", + " return sent_list\n", + "\n", + "\n", + "TEXT_SPLITTERS = {\n", + " \"Character\": CharacterTextSplitter,\n", + " \"RecursiveCharacter\": RecursiveCharacterTextSplitter,\n", + " \"Markdown\": MarkdownTextSplitter,\n", + " \"Chinese\": ChineseTextSplitter,\n", + "}\n", + "\n", + "\n", + "LOADERS = {\n", + " \".csv\": (CSVLoader, {}),\n", + " \".doc\": (UnstructuredWordDocumentLoader, {}),\n", + " \".docx\": (UnstructuredWordDocumentLoader, {}),\n", + " \".enex\": (EverNoteLoader, {}),\n", + " \".epub\": (UnstructuredEPubLoader, {}),\n", + " \".html\": (UnstructuredHTMLLoader, {}),\n", + " \".md\": (UnstructuredMarkdownLoader, {}),\n", + " \".odt\": (UnstructuredODTLoader, {}),\n", + " \".pdf\": (PDFMinerLoader, {}),\n", + " \".ppt\": (UnstructuredPowerPointLoader, {}),\n", + " \".pptx\": (UnstructuredPowerPointLoader, {}),\n", + " \".txt\": (TextLoader, {\"encoding\": \"utf8\"}),\n", + "}" ] }, { "cell_type": "code", "execution_count": 27, "id": "0908e5e9-4dcb-4fc8-8480-3cf70fd5e934", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Running on local URL: http://10.3.233.70:7868\n", + "Running on local URL: http://10.3.233.70:4888\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/html": [ - "
" + "
" ], "text/plain": [ "" @@ -1033,7 +1158,7 @@ "data": { "text/plain": [] }, - "execution_count": 19, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" }, @@ -1041,9 +1166,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/ethan/intel/openvino_notebooks/notebooks/254-llm-chatbot/ov_llm_model.py:400: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", + "/home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.10/site-packages/optimum/intel/openvino/modeling_decoder.py:388: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", " self.request.start_async(inputs, shared_memory=True)\n", - "/home/ethan/intel/openvino_notebooks/notebooks/254-llm-chatbot/ov_llm_model.py:400: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", + "/home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.10/site-packages/optimum/intel/openvino/modeling_decoder.py:388: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", + " self.request.start_async(inputs, shared_memory=True)\n", + "/home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.10/site-packages/optimum/intel/openvino/modeling_decoder.py:388: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. 
Please use only `share_inputs` explicitly.\n", " self.request.start_async(inputs, shared_memory=True)\n" ] } ], "source": [ "from langchain.prompts import PromptTemplate\n", "from langchain.vectorstores import Chroma\n", - "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.chains import RetrievalQA\n", - "from langchain.document_loaders import UnstructuredMarkdownLoader, CSVLoader, TextLoader\n", - "from typing import List\n", + "from langchain.docstore.document import Document\n", "from threading import Event, Thread\n", "import gradio as gr\n", "import re\n", "from uuid import uuid4\n", "\n", "\n", + "def load_single_document(file_path: str) -> List[Document]:\n", + " \"\"\"\n", + " helper for loading a single document\n", + "\n", + " Params:\n", + " file_path: document path\n", + " Returns:\n", + " documents loaded\n", + "\n", + " \"\"\"\n", + " ext = \".\" + file_path.rsplit(\".\", 1)[-1]\n", + " if ext in LOADERS:\n", + " loader_class, loader_args = LOADERS[ext]\n", + " loader = loader_class(file_path, **loader_args)\n", + " return loader.load()\n", + "\n", + " raise ValueError(f\"Unsupported file extension '{ext}'\")\n", + "\n", + "\n", "def default_partial_text_processor(partial_text: str, new_text: str):\n", " \"\"\"\n", - " helper for updating partially generated answer, used by de\n", + " helper for updating partially generated answer, used by default\n", "\n", " Params:\n", " partial_text: text buffer for storing previosly generated text\n", @@ -1081,22 +1225,7 @@ ")\n", "\n", "\n", - "class ChineseTextSplitter(CharacterTextSplitter):\n", - " def __init__(self, pdf: bool = False, **kwargs):\n", - " super().__init__(**kwargs)\n", - "\n", - " def split_text(self, text: str) -> List[str]:\n", - " sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?][\"’”」』]{0,2}|(?=[\"‘“「『]{1,2}|$))')\n", - " sent_list = []\n", - " for ele in sent_sep_pattern.split(text):\n", - " if sent_sep_pattern.match(ele) and sent_list:\n", - " sent_list[-1] += ele\n", - " elif ele:\n", - " sent_list.append(ele)\n", - " return sent_list\n", - "\n", - "\n", - "def build_chain(doc, chunk_size, chunk_overlap, vector_search_top_k):\n", + "def build_chain(docs, splitter_name, chunk_size, chunk_overlap, vector_search_top_k):\n", " \"\"\"\n", " Initialize a QA chain\n", "\n", @@ -1107,24 +1236,13 @@ " vector_search_top_k: Vector search top k\n", "\n", " \"\"\"\n", - " \n", - " if doc.name.lower().endswith(\".md\"):\n", - " loader = UnstructuredMarkdownLoader(doc.name)\n", - " elif doc.name.lower().endswith(\".csv\"):\n", - " loader = CSVLoader(doc.name)\n", - " else:\n", - " loader = TextLoader(doc.name)\n", - " \n", - " documents = loader.load()\n", + " documents = []\n", + " for doc in docs:\n", + " documents.extend(load_single_document(doc.name))\n", "\n", - " if \"qwen\" in llm_model_id.value or \"chatglm\" in llm_model_id.value:\n", - " text_splitter = ChineseTextSplitter(\n", - " chunk_size=chunk_size, chunk_overlap=chunk_overlap\n", - " )\n", - " else:\n", - " text_splitter = CharacterTextSplitter(\n", - " chunk_size=chunk_size, chunk_overlap=chunk_overlap\n", - " )\n", + " text_splitter = TEXT_SPLITTERS[splitter_name](\n", + " chunk_size=chunk_size, chunk_overlap=chunk_overlap\n", + " )\n", "\n", " texts = text_splitter.split_documents(documents)\n", "\n", @@ -1155,7 +1273,7 @@ " None\n", " \"\"\"\n", " # Append the user's message to the conversation history\n", - " return \"\", history + [(message, None)]\n", + " return \"\", history + [[message, \"\"]]\n", "\n", "\n", "def 
bot(history, conversation_id):\n", @@ -1163,12 +1281,7 @@ " callback function for running chatbot on submit button click\n", "\n", " Params:\n", - " history: conversation history\n", - " temperature: parameter for control the level of creativity in AI-generated text.\n", - " By adjusting the `temperature`, you can influence the AI model's probability distribution, making the text more focused or diverse.\n", - " top_p: parameter for control the range of tokens considered by the AI model based on their cumulative probability.\n", - " top_k: parameter for control the range of tokens considered by the AI model based on their cumulative probability, selecting number of tokens with highest probability.\n", - " repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text.\n", + " history: conversation history.\n", " conversation_id: unique conversation identifier.\n", "\n", " \"\"\"\n", @@ -1180,6 +1293,8 @@ "\n", " t1 = Thread(target=infer, args=(history[-1][0],))\n", " t1.start()\n", + "\n", + " # Initialize an empty string to store the generated text\n", " partial_text = \"\"\n", " for new_text in streamer:\n", " partial_text = text_processor(partial_text, new_text)\n", @@ -1203,10 +1318,35 @@ " gr.Markdown(f\"\"\"
Powered by OpenVINO and {llm_model_id.value}
\"\"\")\n", " with gr.Row():\n", " with gr.Column(scale=1):\n", - " docs = gr.File(label=\"Load a Markdown/CSV file\", file_types=[\".md\", \".csv\"])\n", + " docs = gr.File(\n", + " label=\"Load text files\",\n", + " file_count=\"multiple\",\n", + " file_types=[\n", + " \".csv\",\n", + " \".doc\",\n", + " \".docx\",\n", + " \".enex\",\n", + " \".epub\",\n", + " \".html\",\n", + " \".md\",\n", + " \".odt\",\n", + " \".pdf\",\n", + " \".ppt\",\n", + " \".pptx\",\n", + " \".txt\",\n", + " ],\n", + " )\n", " load_docs = gr.Button(\"Build Retriever\")\n", - " retriever_argument = gr.Accordion(\"Retriever Configuration\")\n", + " retriever_argument = gr.Accordion(\"Retriever Configuration\", open=False)\n", " with retriever_argument:\n", + " spliter = gr.Dropdown(\n", + " [\"Character\", \"RecursiveCharacter\", \"Markdown\", \"Chinese\"],\n", + " value=\"RecursiveCharacter\",\n", + " label=\"Text Spliter\",\n", + " info=\"Method used to splite the documents\",\n", + " multiselect=False,\n", + " )\n", + "\n", " chunk_size = gr.Slider(\n", " label=\"Chunk size\",\n", " value=1000,\n", @@ -1251,27 +1391,19 @@ " with gr.Column():\n", " with gr.Row():\n", " submit = gr.Button(\"Submit\")\n", - " stop = gr.Button(\"Stop\")\n", " clear = gr.Button(\"Clear\")\n", " load_docs.click(\n", " build_chain,\n", - " inputs=[docs, chunk_size, chunk_overlap, vector_search_top_k],\n", + " inputs=[docs, spliter, chunk_size, chunk_overlap, vector_search_top_k],\n", " outputs=[langchain_status],\n", " queue=False,\n", " )\n", - " submit_event = msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(\n", - " bot, [chatbot, conversation_id], chatbot, queue=True\n", - " )\n", + " submit_event = msg.submit(\n", + " user, [msg, chatbot], [msg, chatbot], queue=False, trigger_mode=\"once\"\n", + " ).then(bot, [chatbot, conversation_id], chatbot, queue=True)\n", " submit_click_event = submit.click(\n", - " user, [msg, chatbot], [msg, chatbot], queue=False\n", + " user, [msg, chatbot], [msg, chatbot], queue=False, trigger_mode=\"once\"\n", " ).then(bot, [chatbot, conversation_id], chatbot, queue=True)\n", - " stop.click(\n", - " fn=None,\n", - " inputs=None,\n", - " outputs=None,\n", - " cancels=[submit_event, submit_click_event],\n", - " queue=False,\n", - " )\n", " clear.click(lambda: None, None, chatbot, queue=False)\n", "\n", "demo.queue(max_size=2)\n", @@ -1285,7 +1417,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 28, "id": "6f4b5a84-bebf-49b9-b2fa-5e788ed2cbac", "metadata": {}, "outputs": [ @@ -1293,7 +1425,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Closing server running on port: 7868\n" + "Closing server running on port: 4888\n" ] } ], diff --git a/notebooks/254-llm-chatbot/config.py b/notebooks/254-llm-chatbot/config.py index 5543bfbc9a9..cb0ecb71a83 100644 --- a/notebooks/254-llm-chatbot/config.py +++ b/notebooks/254-llm-chatbot/config.py @@ -127,19 +127,18 @@ def youri_partial_text_processor(partial_text, new_text): "chatglm3-6b": { "model_id": "THUDM/chatglm3-6b", "remote": True, - "start_message": f"<|system|>\n{DEFAULT_SYSTEM_PROMPT }\n", - "history_template": "<|user|>\n{user}\n<|assistant|>\n{assistant}\n", + "start_message": f"<|system|>\n{DEFAULT_SYSTEM_PROMPT}\n", + "history_template": "<|user|>\n{user} \n<|assistant|>\n{assistant} \n", "partial_text_processor": chatglm_partial_text_processor, - "current_message_template": "<|user|>\n{user}\n<|assistant|>\n", + "current_message_template": "<|user|>\n{user} \n<|assistant|>\n{assistant}", 
"tokenizer_kwargs": {"add_special_tokens": False}, - "stop_tokens": ["", "[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"], - "prompt_template": f"""<|system|> - {DEFAULT_RAG_PROMPT_CHINESE }""" + "stop_tokens": [""], + "prompt_template": f"""<|system|> {DEFAULT_RAG_PROMPT_CHINESE }""" + """ <|user|> 问题: {question} 已知内容: {context} - 回答: + 回答: <|assistant|>""", }, "mistral-7b": {