From c6a92b244dfc0c8d53f16ac11af9872cb6b2cfc9 Mon Sep 17 00:00:00 2001
From: Ethan Yang
Date: Mon, 25 Dec 2023 12:50:51 +0800
Subject: [PATCH] [254-rag-chatbot] update the text splitter and document
 loader (#1562)

* support more text splitters and document loaders

* fix qwen prompt template issue

* support multiple documents upload

---
 .../254-llm-chatbot/254-llm-chatbot.ipynb | 113 ++++--
 .../254-llm-chatbot/254-rag-chatbot.ipynb | 368 ++++++++++++------
 notebooks/254-llm-chatbot/config.py       |  13 +-
 3 files changed, 335 insertions(+), 159 deletions(-)

diff --git a/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb b/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb
index 1d6eab48aea..91a3aec6099 100644
--- a/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb
+++ b/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb
@@ -129,7 +129,7 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
- "model_id": "169978da6d5942bd8146676f3bf5db8b",
+ "model_id": "1e8ca46ac6734f8c816a14cbe46964ce",
 "version_major": 2,
 "version_minor": 0
 },
@@ -223,15 +223,15 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
- "2023-12-12 21:55:41.474562: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
- "2023-12-12 21:55:41.476575: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n",
- "2023-12-12 21:55:41.501573: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
- "2023-12-12 21:55:41.501593: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
- "2023-12-12 21:55:41.501613: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
- "2023-12-12 21:55:41.506678: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n",
- "2023-12-12 21:55:41.507421: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
 "To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
- "2023-12-12 21:55:42.037050: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
+ "2023-12-21 21:33:05.855788: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-12-21 21:33:05.857870: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-12-21 21:33:05.883126: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "2023-12-21 21:33:05.883147: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "2023-12-21 21:33:05.883167: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2023-12-21 21:33:05.888388: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-12-21 21:33:05.889023: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-12-12 21:55:42.037050: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + "2023-12-21 21:33:06.449452: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" ] } ], @@ -292,7 +292,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "671a0e89103d496eb4cdce5f6c70904a", + "model_id": "c802a1fb556c4abdb38b967c02ef3ef6", "version_major": 2, "version_minor": 0 }, @@ -306,7 +306,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f6376de7c87145f2a00de1c3e6edf4c6", + "model_id": "43b1bd84b5ef4fb0b015411fa3edc862", "version_major": 2, "version_minor": 0 }, @@ -320,7 +320,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1126a5495fb5426588da00e518586bde", + "model_id": "ec15e0c8aaa54fc080d9d8d8938c233a", "version_major": 2, "version_minor": 0 }, @@ -593,7 +593,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "837b4a3b-ccc3-4004-9577-2b2c7b802dea", "metadata": { "tags": [] @@ -602,15 +602,15 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "4af32a190c7a4896a06743fe05c7b56b", + "model_id": "54ae70217dbd4299974e24aae599957e", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Device:', options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='CPU')" + "Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU')" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -637,7 +637,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "5333ab9b-ff5d-4a7f-bcdc-9cca5d56dc0a", "metadata": { "tags": [] @@ -659,7 +659,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "3536a1a7", "metadata": { "collapsed": false, @@ -671,15 +671,15 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0f954bac863d48f5ab0b9eb779f0a82d", + "model_id": "03dc64b5e12e4fb79fb36a63ffef2ef2", "version_major": 2, "version_minor": 
0 }, "text/plain": [ - "Dropdown(description='Model to run:', options=('INT4', 'FP16'), value='INT4')" + "Dropdown(description='Model to run:', options=('FP16',), value='FP16')" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -705,7 +705,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "7a041101-7336-40fd-96c9-cd298015a0f3", "metadata": { "tags": [] @@ -715,7 +715,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Loading model from chatglm2-6b/INT4_compressed_weights\n" + "Loading model from chatglm3-6b/FP16\n" ] }, { @@ -760,7 +760,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "8f6f7596-5677-4931-875b-aaabfa23cabc", "metadata": {}, "outputs": [ @@ -768,7 +768,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/ea/work/openvino_notebooks/notebooks/254-llm-chatbot/ov_llm_model.py:400: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", + "/home/ethan/intel/openvino_notebooks/notebooks/254-llm-chatbot/ov_llm_model.py:400: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", " self.request.start_async(inputs, shared_memory=True)\n" ] }, @@ -833,12 +833,50 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "01f8f7f8-072e-45dc-b7c9-18d8c3c47754", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running on local URL: http://10.3.233.70:4768\n", + "\n", + "To create a public link, set `share=True` in `launch()`.\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ethan/intel/openvino_notebooks/notebooks/254-llm-chatbot/ov_llm_model.py:400: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", + " self.request.start_async(inputs, shared_memory=True)\n" + ] + } + ], "source": [ "from threading import Event, Thread\n", "from uuid import uuid4\n", @@ -922,7 +960,7 @@ "\n", "def default_partial_text_processor(partial_text: str, new_text: str):\n", " \"\"\"\n", - " helper for updating partially generated answer, used by de\n", + " helper for updating partially generated answer, used by default\n", "\n", " Params:\n", " partial_text: text buffer for storing previosly generated text\n", @@ -972,7 +1010,7 @@ " return text\n", "\n", "\n", - "def user(text, history):\n", + "def user(message, history):\n", " \"\"\"\n", " callback function for updating user messages in interface on submit button click\n", "\n", @@ -983,8 +1021,7 @@ " None\n", " \"\"\"\n", " # Append the user's message to the conversation history\n", - " history = history + [(text, None)]\n", - " return \"\", history\n", + " return \"\", history + [[message, \"\"]]\n", "\n", "\n", "def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):\n", @@ -1184,13 +1221,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "7b837f9e-4152-4a5c-880a-ed874aa64a74", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Closing server running on port: 4768\n" + ] + } + ], "source": [ "# please run this cell for stopping gradio interface\n", - "# demo.close()" + "demo.close()" ] } ], diff --git a/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb b/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb index 4628c583daf..f6d1def9712 100644 --- a/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb +++ b/notebooks/254-llm-chatbot/254-rag-chatbot.ipynb @@ -51,10 +51,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "1f077b32-5d36-44b0-9041-407e996283a3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Skipping openvino-dev as it is not installed.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Skipping openvino as it is not installed.\u001b[0m\u001b[33m\n", + "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip uninstall -q -y openvino-dev openvino openvino-nightly\n", "%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu\\\n", @@ -62,7 +76,7 @@ "\"nncf>=2.7\"\\\n", "\"openvino-nightly\"\\\n", "\"gradio\"\\\n", - "\"onnx\" 
\"chromadb\" \"sentence_transformers\" \"langchain\" \"langchainhub\" \"transformers>=4.34.0\" \"unstructured\" \"scikit-learn\"" + "\"onnx\" \"chromadb\" \"sentence_transformers\" \"langchain\" \"langchainhub\" \"transformers>=4.34.0\" \"unstructured\" \"scikit-learn\" \"python-docx\"" ] }, { @@ -112,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "id": "d3b57cfb-e727-43a5-b2c9-8f1b1ba72061", "metadata": {}, "outputs": [ @@ -127,15 +141,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-12-12 22:01:30.686211: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2023-12-12 22:01:30.688149: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2023-12-12 22:01:30.712502: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", - "2023-12-12 22:01:30.712522: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", - "2023-12-12 22:01:30.712540: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", - "2023-12-12 22:01:30.717373: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", - "2023-12-12 22:01:30.718189: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "2023-12-24 07:00:22.754016: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-12-24 07:00:22.756105: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-12-24 07:00:22.781727: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "2023-12-24 07:00:22.781748: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "2023-12-24 07:00:22.781768: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2023-12-24 07:00:22.787005: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2023-12-24 07:00:22.787908: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-12-12 22:01:31.258810: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + "2023-12-24 07:00:23.453118: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" ] } ], @@ -176,14 +190,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "id": "37bf49d7", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "4241d5e2520f4647bc6728dd80c90927", + "model_id": "84b3d28a335c4479b31f1ea88c95c257", "version_major": 2, "version_minor": 0 }, @@ -191,7 +205,7 @@ "Dropdown(description='LLM Model:', options=('tiny-llama-1b-chat', 'red-pajama-3b-chat', 'llama-2-chat-7b', 'mp…" ] }, - "execution_count": 4, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -213,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "id": "49ea95f8", "metadata": {}, "outputs": [ @@ -221,7 +235,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selected LLM model chatglm3-6b\n" + "Selected LLM model zephyr-7b-beta\n" ] } ], @@ -288,14 +302,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "id": "c6a38153", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "329f80b26f2c496fbdc5b6438a6d405a", + "model_id": "4d30072191ce48dab051838b66af7eb1", "version_major": 2, "version_minor": 0 }, @@ -309,7 +323,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b7c42390041c4cebb40808aed3df1de3", + "model_id": "aa20a85b31ba4c03a09eedaa7bcb9917", "version_major": 2, "version_minor": 0 }, @@ -323,7 +337,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6ae4adffa07d4888a88eeb686b0a1229", + "model_id": "354c2cf647db495fb9034b3ab1c756da", "version_major": 2, "version_minor": 0 }, @@ -361,7 +375,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "id": "2020d522", "metadata": {}, "outputs": [], @@ -537,7 +551,7 @@ }, { "cell_type": "code", - "execution_count": 
9, + "execution_count": 6, "id": "8e127215", "metadata": {}, "outputs": [ @@ -545,7 +559,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Size of FP16 model is 11909.69 MB\n" + "Size of FP16 model is 27657.02 MB\n", + "Size of model with INT4 compressed weights is 5053.39 MB\n", + "Compression rate for INT4 model: 5.473\n" ] } ], @@ -580,22 +596,22 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "id": "ff80e6eb-7923-40ef-93d8-5e6c56e50667", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f2a58d5b28be4b5284745d65ff60540f", + "model_id": "27d89272c4f84c9fb06ec783816e34f5", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Embedding Model:', options=('all-mpnet-base-v2', 'text2vec-large-chinese'), value='all-m…" + "Dropdown(description='Embedding Model:', options=('all-mpnet-base-v2',), value='all-mpnet-base-v2')" ] }, - "execution_count": 10, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -618,7 +634,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "id": "790afcf8", "metadata": {}, "outputs": [ @@ -626,7 +642,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selected text2vec-large-chinese model\n" + "Selected all-mpnet-base-v2 model\n" ] } ], @@ -637,7 +653,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "id": "58d75dad-2eeb-4edd-8d12-d77a365f8eda", "metadata": { "scrolled": true @@ -670,14 +686,14 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "id": "e11e73cf", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "fca81d3e662d4bb7ad99929de50814bd", + "model_id": "27966ce0a9304402975dbe2c86f76e87", "version_major": 2, "version_minor": 0 }, @@ -685,7 +701,7 @@ "Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU')" ] }, - "execution_count": 13, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -704,10 +720,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "9ab29b85", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Embedding model will be loaded to CPU device for response generation\n" + ] + } + ], "source": [ "print(f\"Embedding model will be loaded to {embedding_device.value} device for response generation\")" ] @@ -723,10 +747,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "6d044d01", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "12be7dcd8325436f9467b0474d9d38ac", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "llm_device = widgets.Dropdown(\n", " options=core.available_devices + [\"AUTO\"],\n", @@ -740,10 +780,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "348b90fe", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LLM model will be loaded to CPU device for response generation\n" + ] + } + ], "source": [ "print(f\"LLM model will be loaded to {llm_device.value} device for response generation\")" ] 
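Note: the "Size of FP16 model ..." and "Compression rate for INT4 model ..." lines in the hunk above are printed output only; the cell that computes them is not part of this diff. The following is a minimal sketch of how such numbers can be derived, assuming the converted models live under the model folder's FP16 and INT4_compressed_weights subdirectories as the "Loading model from ..." messages suggest (the directory names are assumptions, not taken from the patch):

    # Sketch only: sum the on-disk file sizes of two model directories and
    # report the INT4 compression ratio. Both directories must already exist.
    from pathlib import Path

    def dir_size_mb(path: Path) -> float:
        # Total size of all regular files under `path`, in megabytes.
        return sum(f.stat().st_size for f in path.rglob("*") if f.is_file()) / 1024**2

    fp16_mb = dir_size_mb(Path("zephyr-7b-beta/FP16"))                      # assumed layout
    int4_mb = dir_size_mb(Path("zephyr-7b-beta/INT4_compressed_weights"))   # assumed layout
    print(f"Size of FP16 model is {fp16_mb:.2f} MB")
    print(f"Size of model with INT4 compressed weights is {int4_mb:.2f} MB")
    print(f"Compression rate for INT4 model: {fp16_mb / int4_mb:.3f}")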
@@ -770,7 +818,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "id": "df3e8fd1-d4c1-4e33-b46e-7840e392f8ee", "metadata": {}, "outputs": [ @@ -817,7 +865,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "efe29701", "metadata": {}, "outputs": [], @@ -827,22 +875,22 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "id": "8b014f24-aa5b-4d40-924d-d579ad7fcec6", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "30e4be3151294400b6595e83f69a04ec", + "model_id": "7b3c2991f3284deea331b5806a336d77", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Model to run:', options=('FP16',), value='FP16')" + "Dropdown(description='Model to run:', options=('INT4', 'FP16'), value='INT4')" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -868,24 +916,30 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "f7f708db-8de1-4efd-94b2-fcabc48d52f4", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "The argument `trust_remote_code` is to be used along with export=True. It will be ignored.\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Loading model from chatglm3-6b/FP16\n" + "Loading model from zephyr-7b-beta/INT4_compressed_weights\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "The argument `trust_remote_code` is to be used along with export=True. It will be ignored.\n", - "Compiling the model to CPU ...\n", - "Setting OpenVINO CACHE_DIR to chatglm3-6b/FP16/model_cache\n" + "Compiling the model to CPU ...\n" ] } ], @@ -949,7 +1003,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "6040e0d0", "metadata": {}, "outputs": [], @@ -962,6 +1016,11 @@ " tokenizer=tok,\n", " max_new_tokens=256,\n", " streamer=streamer,\n", + " # temperature=1,\n", + " # do_sample=True,\n", + " # top_p=0.8,\n", + " # top_k=20,\n", + " # repetition_penalty=1.1,\n", ")\n", "if stop_tokens is not None:\n", " generate_kwargs[\"stopping_criteria\"] = StoppingCriteriaList(stop_tokens)\n", @@ -1005,6 +1064,72 @@ { "cell_type": "code", "execution_count": 19, + "id": "5b97eeeb", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List\n", + "from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, MarkdownTextSplitter\n", + "from langchain.document_loaders import (\n", + " CSVLoader,\n", + " EverNoteLoader,\n", + " PDFMinerLoader,\n", + " TextLoader,\n", + " UnstructuredEPubLoader,\n", + " UnstructuredHTMLLoader,\n", + " UnstructuredMarkdownLoader,\n", + " UnstructuredODTLoader,\n", + " UnstructuredPowerPointLoader,\n", + " UnstructuredWordDocumentLoader, )\n", + "\n", + "\n", + "class ChineseTextSplitter(CharacterTextSplitter):\n", + " def __init__(self, pdf: bool = False, **kwargs):\n", + " super().__init__(**kwargs)\n", + " self.pdf = pdf\n", + "\n", + " def split_text(self, text: str) -> List[str]:\n", + " if self.pdf:\n", + " text = re.sub(r\"\\n{3,}\", \"\\n\", text)\n", + " text = text.replace(\"\\n\\n\", \"\")\n", + " sent_sep_pattern = re.compile(\n", + " '([﹒﹔﹖﹗.。!?][\"’”」』]{0,2}|(?=[\"‘“「『]{1,2}|$))')\n", + " sent_list = []\n", + " for ele in 
sent_sep_pattern.split(text):\n", + " if sent_sep_pattern.match(ele) and sent_list:\n", + " sent_list[-1] += ele\n", + " elif ele:\n", + " sent_list.append(ele)\n", + " return sent_list\n", + "\n", + "\n", + "TEXT_SPLITERS = {\n", + " \"Character\": CharacterTextSplitter,\n", + " \"RecursiveCharacter\": RecursiveCharacterTextSplitter,\n", + " \"Markdown\": MarkdownTextSplitter,\n", + " \"Chinese\": ChineseTextSplitter,\n", + "}\n", + "\n", + "\n", + "LOADERS = {\n", + " \".csv\": (CSVLoader, {}),\n", + " \".doc\": (UnstructuredWordDocumentLoader, {}),\n", + " \".docx\": (UnstructuredWordDocumentLoader, {}),\n", + " \".enex\": (EverNoteLoader, {}),\n", + " \".epub\": (UnstructuredEPubLoader, {}),\n", + " \".html\": (UnstructuredHTMLLoader, {}),\n", + " \".md\": (UnstructuredMarkdownLoader, {}),\n", + " \".odt\": (UnstructuredODTLoader, {}),\n", + " \".pdf\": (PDFMinerLoader, {}),\n", + " \".ppt\": (UnstructuredPowerPointLoader, {}),\n", + " \".pptx\": (UnstructuredPowerPointLoader, {}),\n", + " \".txt\": (TextLoader, {\"encoding\": \"utf8\"}),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 27, "id": "0908e5e9-4dcb-4fc8-8480-3cf70fd5e934", "metadata": {}, "outputs": [ @@ -1012,7 +1137,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Running on local URL: http://10.3.233.70:7868\n", + "Running on local URL: http://10.3.233.70:4888\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] @@ -1020,7 +1145,7 @@ { "data": { "text/html": [ - "
" + "
" ], "text/plain": [ "" @@ -1033,7 +1158,7 @@ "data": { "text/plain": [] }, - "execution_count": 19, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" }, @@ -1041,9 +1166,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/ethan/intel/openvino_notebooks/notebooks/254-llm-chatbot/ov_llm_model.py:400: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", + "/home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.10/site-packages/optimum/intel/openvino/modeling_decoder.py:388: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", " self.request.start_async(inputs, shared_memory=True)\n", - "/home/ethan/intel/openvino_notebooks/notebooks/254-llm-chatbot/ov_llm_model.py:400: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", + "/home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.10/site-packages/optimum/intel/openvino/modeling_decoder.py:388: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. Please use only `share_inputs` explicitly.\n", + " self.request.start_async(inputs, shared_memory=True)\n", + "/home/ethan/intel/openvino_notebooks/openvino_env/lib/python3.10/site-packages/optimum/intel/openvino/modeling_decoder.py:388: FutureWarning: `shared_memory` is deprecated and will be removed in 2024.0. Value of `shared_memory` is going to override `share_inputs` value. 
Please use only `share_inputs` explicitly.\n",
 " self.request.start_async(inputs, shared_memory=True)\n"
 ]
 }
 ],
 "source": [
 "from langchain.prompts import PromptTemplate\n",
 "from langchain.vectorstores import Chroma\n",
- "from langchain.text_splitter import CharacterTextSplitter\n",
 "from langchain.chains import RetrievalQA\n",
- "from langchain.document_loaders import UnstructuredMarkdownLoader, CSVLoader, TextLoader\n",
- "from typing import List\n",
+ "from langchain.docstore.document import Document\n",
 "from threading import Event, Thread\n",
 "import gradio as gr\n",
 "import re\n",
 "from uuid import uuid4\n",
 "\n",
 "\n",
+ "def load_single_document(file_path: str) -> List[Document]:\n",
+ " \"\"\"\n",
+ " helper for loading a single document\n",
+ "\n",
+ " Params:\n",
+ " file_path: document path\n",
+ " Returns:\n",
+ " documents loaded\n",
+ "\n",
+ " \"\"\"\n",
+ " ext = \".\" + file_path.rsplit(\".\", 1)[-1]\n",
+ " if ext in LOADERS:\n",
+ " loader_class, loader_args = LOADERS[ext]\n",
+ " loader = loader_class(file_path, **loader_args)\n",
+ " return loader.load()\n",
+ "\n",
+ " raise ValueError(f\"Unsupported file extension '{ext}'\")\n",
+ "\n",
+ "\n",
 "def default_partial_text_processor(partial_text: str, new_text: str):\n",
 " \"\"\"\n",
- " helper for updating partially generated answer, used by de\n",
+ " helper for updating partially generated answer, used by default\n",
 "\n",
 " Params:\n",
 " partial_text: text buffer for storing previosly generated text\n",
@@ -1081,22 +1225,7 @@
 ")\n",
 "\n",
 "\n",
- "class ChineseTextSplitter(CharacterTextSplitter):\n",
- " def __init__(self, pdf: bool = False, **kwargs):\n",
- " super().__init__(**kwargs)\n",
- "\n",
- " def split_text(self, text: str) -> List[str]:\n",
- " sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?][\"’”」』]{0,2}|(?=[\"‘“「『]{1,2}|$))')\n",
- " sent_list = []\n",
- " for ele in sent_sep_pattern.split(text):\n",
- " if sent_sep_pattern.match(ele) and sent_list:\n",
- " sent_list[-1] += ele\n",
- " elif ele:\n",
- " sent_list.append(ele)\n",
- " return sent_list\n",
- "\n",
- "\n",
- "def build_chain(doc, chunk_size, chunk_overlap, vector_search_top_k):\n",
+ "def build_chain(docs, spliter_name, chunk_size, chunk_overlap, vector_search_top_k):\n",
 " \"\"\"\n",
 " Initialize a QA chain\n",
 "\n",
 " Params:\n",
 " docs: documents uploaded by user\n",
 " spliter_name: text splitter used for preprocessing\n",
 " chunk_size: size of a single document chunk\n",
 " chunk_overlap: overlap size between 2 chunks\n",
 " vector_search_top_k: Vector search top k\n",
 "\n",
 " \"\"\"\n",
- " \n",
- " if doc.name.lower().endswith(\".md\"):\n",
- " loader = UnstructuredMarkdownLoader(doc.name)\n",
- " elif doc.name.lower().endswith(\".csv\"):\n",
- " loader = CSVLoader(doc.name)\n",
- " else:\n",
- " loader = TextLoader(doc.name)\n",
- " \n",
- " documents = loader.load()\n",
+ " documents = []\n",
+ " for doc in docs:\n",
+ " documents.extend(load_single_document(doc.name))\n",
 "\n",
- " if \"qwen\" in llm_model_id.value or \"chatglm\" in llm_model_id.value:\n",
- " text_splitter = ChineseTextSplitter(\n",
- " chunk_size=chunk_size, chunk_overlap=chunk_overlap\n",
- " )\n",
- " else:\n",
- " text_splitter = CharacterTextSplitter(\n",
- " chunk_size=chunk_size, chunk_overlap=chunk_overlap\n",
- " )\n",
+ " text_splitter = TEXT_SPLITERS[spliter_name](\n",
+ " chunk_size=chunk_size, chunk_overlap=chunk_overlap\n",
+ " )\n",
 "\n",
 " texts = text_splitter.split_documents(documents)\n",
 "\n",
@@ -1155,7 +1273,7 @@
 " None\n",
 " \"\"\"\n",
 " # Append the user's message to the conversation history\n",
- " return \"\", history + [(message, None)]\n",
+ " return \"\", history + [[message, \"\"]]\n",
 "\n",
 "\n",
 "def 
bot(history, conversation_id):\n", @@ -1163,12 +1281,7 @@ " callback function for running chatbot on submit button click\n", "\n", " Params:\n", - " history: conversation history\n", - " temperature: parameter for control the level of creativity in AI-generated text.\n", - " By adjusting the `temperature`, you can influence the AI model's probability distribution, making the text more focused or diverse.\n", - " top_p: parameter for control the range of tokens considered by the AI model based on their cumulative probability.\n", - " top_k: parameter for control the range of tokens considered by the AI model based on their cumulative probability, selecting number of tokens with highest probability.\n", - " repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text.\n", + " history: conversation history.\n", " conversation_id: unique conversation identifier.\n", "\n", " \"\"\"\n", @@ -1180,6 +1293,8 @@ "\n", " t1 = Thread(target=infer, args=(history[-1][0],))\n", " t1.start()\n", + "\n", + " # Initialize an empty string to store the generated text\n", " partial_text = \"\"\n", " for new_text in streamer:\n", " partial_text = text_processor(partial_text, new_text)\n", @@ -1203,10 +1318,35 @@ " gr.Markdown(f\"\"\"
Powered by OpenVINO and {llm_model_id.value}
\"\"\")\n", " with gr.Row():\n", " with gr.Column(scale=1):\n", - " docs = gr.File(label=\"Load a Markdown/CSV file\", file_types=[\".md\", \".csv\"])\n", + " docs = gr.File(\n", + " label=\"Load text files\",\n", + " file_count=\"multiple\",\n", + " file_types=[\n", + " \".csv\",\n", + " \".doc\",\n", + " \".docx\",\n", + " \".enex\",\n", + " \".epub\",\n", + " \".html\",\n", + " \".md\",\n", + " \".odt\",\n", + " \".pdf\",\n", + " \".ppt\",\n", + " \".pptx\",\n", + " \".txt\",\n", + " ],\n", + " )\n", " load_docs = gr.Button(\"Build Retriever\")\n", - " retriever_argument = gr.Accordion(\"Retriever Configuration\")\n", + " retriever_argument = gr.Accordion(\"Retriever Configuration\", open=False)\n", " with retriever_argument:\n", + " spliter = gr.Dropdown(\n", + " [\"Character\", \"RecursiveCharacter\", \"Markdown\", \"Chinese\"],\n", + " value=\"RecursiveCharacter\",\n", + " label=\"Text Spliter\",\n", + " info=\"Method used to splite the documents\",\n", + " multiselect=False,\n", + " )\n", + "\n", " chunk_size = gr.Slider(\n", " label=\"Chunk size\",\n", " value=1000,\n", @@ -1251,27 +1391,19 @@ " with gr.Column():\n", " with gr.Row():\n", " submit = gr.Button(\"Submit\")\n", - " stop = gr.Button(\"Stop\")\n", " clear = gr.Button(\"Clear\")\n", " load_docs.click(\n", " build_chain,\n", - " inputs=[docs, chunk_size, chunk_overlap, vector_search_top_k],\n", + " inputs=[docs, spliter, chunk_size, chunk_overlap, vector_search_top_k],\n", " outputs=[langchain_status],\n", " queue=False,\n", " )\n", - " submit_event = msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(\n", - " bot, [chatbot, conversation_id], chatbot, queue=True\n", - " )\n", + " submit_event = msg.submit(\n", + " user, [msg, chatbot], [msg, chatbot], queue=False, trigger_mode=\"once\"\n", + " ).then(bot, [chatbot, conversation_id], chatbot, queue=True)\n", " submit_click_event = submit.click(\n", - " user, [msg, chatbot], [msg, chatbot], queue=False\n", + " user, [msg, chatbot], [msg, chatbot], queue=False, trigger_mode=\"once\"\n", " ).then(bot, [chatbot, conversation_id], chatbot, queue=True)\n", - " stop.click(\n", - " fn=None,\n", - " inputs=None,\n", - " outputs=None,\n", - " cancels=[submit_event, submit_click_event],\n", - " queue=False,\n", - " )\n", " clear.click(lambda: None, None, chatbot, queue=False)\n", "\n", "demo.queue(max_size=2)\n", @@ -1285,7 +1417,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 28, "id": "6f4b5a84-bebf-49b9-b2fa-5e788ed2cbac", "metadata": {}, "outputs": [ @@ -1293,7 +1425,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Closing server running on port: 7868\n" + "Closing server running on port: 4888\n" ] } ], diff --git a/notebooks/254-llm-chatbot/config.py b/notebooks/254-llm-chatbot/config.py index 5543bfbc9a9..cb0ecb71a83 100644 --- a/notebooks/254-llm-chatbot/config.py +++ b/notebooks/254-llm-chatbot/config.py @@ -127,19 +127,18 @@ def youri_partial_text_processor(partial_text, new_text): "chatglm3-6b": { "model_id": "THUDM/chatglm3-6b", "remote": True, - "start_message": f"<|system|>\n{DEFAULT_SYSTEM_PROMPT }\n", - "history_template": "<|user|>\n{user}\n<|assistant|>\n{assistant}\n", + "start_message": f"<|system|>\n{DEFAULT_SYSTEM_PROMPT}\n", + "history_template": "<|user|>\n{user} \n<|assistant|>\n{assistant} \n", "partial_text_processor": chatglm_partial_text_processor, - "current_message_template": "<|user|>\n{user}\n<|assistant|>\n", + "current_message_template": "<|user|>\n{user} \n<|assistant|>\n{assistant}", 
"tokenizer_kwargs": {"add_special_tokens": False}, - "stop_tokens": ["", "[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"], - "prompt_template": f"""<|system|> - {DEFAULT_RAG_PROMPT_CHINESE }""" + "stop_tokens": [""], + "prompt_template": f"""<|system|> {DEFAULT_RAG_PROMPT_CHINESE }""" + """ <|user|> 问题: {question} 已知内容: {context} - 回答: + 回答: <|assistant|>""", }, "mistral-7b": {