From e06fc8aacc6a66c83af009f69b96cd17b0e4cfcd Mon Sep 17 00:00:00 2001 From: Bojun-Feng Date: Fri, 11 Aug 2023 17:34:36 +0800 Subject: [PATCH 1/8] add arena example --- examples/gradio_arena.py | 430 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 430 insertions(+) create mode 100644 examples/gradio_arena.py diff --git a/examples/gradio_arena.py b/examples/gradio_arena.py new file mode 100644 index 0000000000..a482a623fe --- /dev/null +++ b/examples/gradio_arena.py @@ -0,0 +1,430 @@ +import uuid +from typing import TYPE_CHECKING, Dict, List, Optional + +import gradio as gr + +from xinference.locale.utils import Locale +from xinference.model.llm import BUILTIN_LLM_FAMILIES, LLMFamilyV1, match_llm +from xinference.model.llm.llm_family import cache +from xinference.client import RESTfulClient + +if TYPE_CHECKING: + from xinference.types import ChatCompletionChunk, ChatCompletionMessage + +MODEL_TO_FAMILIES: Dict[str, LLMFamilyV1] = dict( + (model_family.model_name, model_family) + for model_family in BUILTIN_LLM_FAMILIES + if "chat" in model_family.model_ability +) + + +class GradioApp: + def __init__( + self, + endpoint: str, + gladiator_num: int = 2, + max_model_num: int = 3, + use_launched_model: bool = False, + ): + self._api = RESTfulClient(endpoint) + self._gladiator_num = gladiator_num + self._max_model_num = max_model_num + self._use_launched_model = use_launched_model + self._locale = Locale() + + def _create_model( + self, + model_name: str, + model_size_in_billions: Optional[int] = None, + model_format: Optional[str] = None, + quantization: Optional[str] = None, + ): + models = self._api.list_models() + if len(models) >= self._max_model_num: + self._api.terminate_model(list(models.keys())[0]) + return self._api.launch_model( + model_name, model_size_in_billions, model_format, quantization + ) + + async def generate( + self, + model: str, + message: str, + chat: List[List[str]], + max_token: int, + temperature: float, + top_p: float, + window_size: 
int, + show_finish_reason: bool, + ): + if not message: + yield message, chat + else: + try: + model_ref = self._api.get_model(model) + except KeyError: + raise gr.Error(self._locale(f"Please create model first")) + + history: "List[ChatCompletionMessage]" = [] + for c in chat: + history.append({"role": "user", "content": c[0]}) + + out = c[1] + finish_reason_idx = out.find(f"[{self._locale('stop reason')}: ") + if finish_reason_idx != -1: + out = out[:finish_reason_idx] + history.append({"role": "assistant", "content": out}) + + if window_size != 0: + history = history[-(window_size // 2):] + + # chatglm only support even number of conversation history. + if len(history) % 2 != 0: + history = history[1:] + + generate_config = dict( + max_tokens=max_token, + temperature=temperature, + top_p=top_p, + stream=True, + ) + chat += [[message, ""]] + chat_generator = model_ref.chat( + message, + chat_history=history, + generate_config=generate_config, + ) + + chunk: Optional["ChatCompletionChunk"] = None + for chunk in chat_generator: + assert chunk is not None + delta = chunk["choices"][0]["delta"] + if "content" not in delta: + continue + else: + chat[-1][1] += delta["content"] + yield "", chat + if show_finish_reason and chunk is not None: + chat[-1][ + 1 + ] += f"[{self._locale('stop reason')}: {chunk['choices'][0]['finish_reason']}]" + yield "", chat + + def _build_chatbot(self, model_uid: str, model_name: str): + with gr.Accordion(self._locale("Parameters"), open=False): + max_token = gr.Slider( + 128, + 1024, + value=256, + step=1, + label=self._locale("Max tokens"), + info=self._locale("The maximum number of tokens to generate."), + ) + temperature = gr.Slider( + 0.2, + 1, + value=0.8, + step=0.01, + label=self._locale("Temperature"), + info=self._locale("The temperature to use for sampling."), + ) + top_p = gr.Slider( + 0.2, + 1, + value=0.95, + step=0.01, + label=self._locale("Top P"), + info=self._locale("The top-p value to use for sampling."), + ) + 
window_size = gr.Slider( + 0, + 50, + value=10, + step=1, + label=self._locale("Window size"), + info=self._locale("Window size of chat history."), + ) + show_finish_reason = gr.Checkbox( + label=f"{self._locale('Show stop reason')}" + ) + chat = gr.Chatbot(label=model_name) + text = gr.Textbox(visible=False) + model_uid = gr.Textbox(model_uid, visible=False) + text.change( + self.generate, + [ + model_uid, + text, + chat, + max_token, + temperature, + top_p, + window_size, + show_finish_reason, + ], + [text, chat], + ) + return ( + text, + chat, + max_token, + temperature, + top_p, + show_finish_reason, + window_size, + model_uid, + ) + + def _build_chat_column(self): + with gr.Column(): + with gr.Row(): + model_name = gr.Dropdown( + choices=list(MODEL_TO_FAMILIES.keys()), + label=self._locale("model name"), + scale=2, + ) + model_format = gr.Dropdown( + choices=[], + interactive=False, + label=self._locale("model format"), + scale=2, + ) + model_size_in_billions = gr.Dropdown( + choices=[], + interactive=False, + label=self._locale("model size in billions"), + scale=1, + ) + quantization = gr.Dropdown( + choices=[], + interactive=False, + label=self._locale("quantization"), + scale=1, + ) + create_model = gr.Button(value=self._locale("create")) + + def select_model_name(model_name: str): + if model_name: + model_family = MODEL_TO_FAMILIES[model_name] + formats = list( + {spec.model_format for spec in model_family.model_specs} + ) + formats.sort() + return ( + gr.Dropdown.update( + choices=formats, interactive=True, value=None + ), + gr.Dropdown.update(choices=[], interactive=False, value=None), + gr.Dropdown.update(choices=[], interactive=False, value=None), + ) + else: + return ( + gr.Dropdown.update(), + gr.Dropdown.update(), + gr.Dropdown.update(), + ) + + def select_model_format(model_name: str, model_format: str): + if model_name: + model_family = MODEL_TO_FAMILIES[model_name] + sizes = list( + { + spec.model_size_in_billions + for spec in 
model_family.model_specs + if spec.model_format == model_format + } + ) + sizes.sort() + return ( + gr.Dropdown.update( + choices=list(map(lambda s: str(s), sizes)), + interactive=True, + value=None, + ), + gr.Dropdown.update(choices=[], interactive=False, value=None), + ) + else: + return ( + gr.Dropdown.update(), + gr.Dropdown.update(), + ) + + def select_model_size( + model_name: str, model_format: str, model_size_in_billions: str + ): + if model_name: + model_family = MODEL_TO_FAMILIES[model_name] + quantizations = list( + { + quantization + for spec in model_family.model_specs + if spec.model_format == model_format + and str(spec.model_size_in_billions) + == model_size_in_billions + for quantization in spec.quantizations + } + ) + quantizations.sort() + return gr.Dropdown.update( + choices=quantizations, + interactive=True, + ) + else: + return gr.Dropdown.update() + + model_name.change( + select_model_name, + inputs=[model_name], + outputs=[model_format, model_size_in_billions, quantization], + ) + model_format.change( + select_model_format, + inputs=[model_name, model_format], + outputs=[model_size_in_billions, quantization], + ) + model_size_in_billions.change( + select_model_size, + inputs=[model_name, model_format, model_size_in_billions], + outputs=[quantization], + ) + + components = self._build_chatbot("", "") + model_text = components[0] + chat, model_uid = components[1], components[-1] + + def select_model( + _model_name: str, + _model_format: str, + _model_size_in_billions: str, + _quantization: str, + progress=gr.Progress(track_tqdm=True), + ): + match_result = match_llm( + _model_name, + _model_format, + int(_model_size_in_billions), + _quantization, + ) + if not match_result: + raise ValueError( + f"Model not found, name: {_model_name}, format: {_model_format}," + f" size: {_model_size_in_billions}, quantization: {_quantization}" + ) + + llm_family, llm_spec, _quantization = match_result + cache(llm_family, llm_spec, _quantization) + + model_uid 
= self._create_model( + _model_name, int(_model_size_in_billions), _model_format, _quantization + ) + return gr.Chatbot.update( + label="-".join( + [_model_name, _model_size_in_billions, _model_format, _quantization] + ), + value=[], + ), gr.Textbox.update(value=model_uid) + + def clear_chat( + _model_name: str, + _model_format: str, + _model_size_in_billions: str, + _quantization: str, + ): + full_name = "-".join( + [_model_name, _model_size_in_billions, _model_format, _quantization] + ) + return str(uuid.uuid4()), gr.Chatbot.update( + label=full_name, + value=[], + ) + + invisible_text = gr.Textbox(visible=False) + create_model.click( + clear_chat, + inputs=[model_name, model_format, model_size_in_billions, quantization], + outputs=[invisible_text, chat], + ) + + invisible_text.change( + select_model, + inputs=[model_name, model_format, model_size_in_billions, quantization], + outputs=[chat, model_uid], + postprocess=False, + ) + return chat, model_text + + def _build_arena(self): + with gr.Box(): + with gr.Row(): + chat_and_text = [ + self._build_chat_column() for _ in range(self._gladiator_num) + ] + chats = [c[0] for c in chat_and_text] + texts = [c[1] for c in chat_and_text] + + msg = gr.Textbox(label=self._locale("Input")) + + def update_message(text_in: str): + return "", text_in, text_in + + msg.submit(update_message, inputs=[msg], outputs=[msg] + texts) + + gr.ClearButton(components=[msg] + chats + texts) + + def _build_single(self): + chat, model_text = self._build_chat_column() + + msg = gr.Textbox(label=self._locale("Input")) + + def update_message(text_in: str): + return "", text_in + + msg.submit(update_message, inputs=[msg], outputs=[msg, model_text]) + gr.ClearButton(components=[chat, msg, model_text]) + + def build(self): + with gr.Blocks() as blocks: + with gr.Tab(self._locale("Chat")): + self._build_single() + with gr.Tab(self._locale("Arena")): + self._build_arena() + blocks.queue(concurrency_count=40) + blocks.launch() + + +if __name__ == 
"__main__": + import argparse + import textwrap + + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=textwrap.dedent( + """\ + instructions to run: + + 1. Install Xinference, Llama-cpp-python, and other dependencies if necessary + 2. Run command `xinference --host "localhost"` in terminal + 3. You should see something similar to the following output: + + INFO:xinference:Xinference successfully started. Endpoint: http://localhost:9997 + INFO:xinference.core.service:Worker 127.0.0.1:21561 has been added successfully + INFO:xinference.deploy.worker:Xinference worker successfully started. + + 4. In the output, locate the endpoint. In the above case it is `http://localhost:9997` + 5. Run this python file in new terminal window, change the endpoint accordingly + + example (feel free to copy): + + python gradio_arena.py \\ + --endpoint http://localhost:9997 + """ + ), + ) + + parser.add_argument( + "--endpoint", type=str, required=True, help="Xinference endpoint, required" + ) + + args = parser.parse_args() + print(f"Xinference endpoint: {args.endpoint}") + GradioApp(args.endpoint).build() From 1d8ef2a2aa25483c55e8702b49cae62f933771d5 Mon Sep 17 00:00:00 2001 From: Bojun-Feng Date: Fri, 11 Aug 2023 17:35:05 +0800 Subject: [PATCH 2/8] update instruction --- examples/gradio_chatinterface.py | 40 +++++++++++++++++++------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/examples/gradio_chatinterface.py b/examples/gradio_chatinterface.py index b459e8ba44..6f6a9ccd8c 100644 --- a/examples/gradio_chatinterface.py +++ b/examples/gradio_chatinterface.py @@ -12,22 +12,28 @@ formatter_class=argparse.RawDescriptionHelpFormatter, epilog=textwrap.dedent( """\ - instructions to run: - 1. Install Xinference and Llama-cpp-python - 2. Run 'xinference --host "localhost" --port 9997' in terminal - 3. Run this python file in new terminal window - - e.g. 
(feel free to copy) - python gradio_chatinterface.py \\ - --endpoint http://localhost:9997 \\ - --model_name vicuna-v1.3 \\ - --model_size_in_billions 7 \\ - --model_format ggmlv3 \\ - --quantization q2_K - - If you decide to change the port number in step 2, - please also change the endpoint in the arguments - """ + instructions to run: + + 1. Install Xinference, Llama-cpp-python, and other dependencies if necessary + 2. Run command `xinference --host "localhost"` in terminal + 3. You should see something similar to the following output: + + INFO:xinference:Xinference successfully started. Endpoint: http://localhost:9997 + INFO:xinference.core.service:Worker 127.0.0.1:21561 has been added successfully + INFO:xinference.deploy.worker:Xinference worker successfully started. + + 4. In the output, locate the endpoint. In the above case it is `http://localhost:9997` + 5. Run this python file in new terminal window, change the endpoint accordingly + + example (feel free to copy): + + python gradio_chatinterface.py \\ + --endpoint http://localhost:9997 \\ + --model_name vicuna-v1.3 \\ + --model_size_in_billions 7 \\ + --model_format ggmlv3 \\ + --quantization q2_K + """ ), ) @@ -95,6 +101,7 @@ def to_chat(lst: List[str]) -> List[Dict[str, str]]: ) return res + def generate_wrapper(message: str, history: List[List[str]]) -> str: output = model.chat( prompt=message, @@ -103,6 +110,7 @@ def generate_wrapper(message: str, history: List[List[str]]) -> str: ) return output["choices"][0]["message"]["content"] + demo = gr.ChatInterface( fn=generate_wrapper, analytics_enabled=False, From 097b64d069777fb2894d02179c295f66967972b5 Mon Sep 17 00:00:00 2001 From: Bojun-Feng Date: Fri, 11 Aug 2023 17:37:11 +0800 Subject: [PATCH 3/8] fix lint --- examples/gradio_arena.py | 66 ++++++++++++++++---------------- examples/gradio_chatinterface.py | 2 - 2 files changed, 33 insertions(+), 35 deletions(-) diff --git a/examples/gradio_arena.py b/examples/gradio_arena.py index 
a482a623fe..740aac21bc 100644 --- a/examples/gradio_arena.py +++ b/examples/gradio_arena.py @@ -3,10 +3,10 @@ import gradio as gr +from xinference.client import RESTfulClient from xinference.locale.utils import Locale from xinference.model.llm import BUILTIN_LLM_FAMILIES, LLMFamilyV1, match_llm from xinference.model.llm.llm_family import cache -from xinference.client import RESTfulClient if TYPE_CHECKING: from xinference.types import ChatCompletionChunk, ChatCompletionMessage @@ -20,11 +20,11 @@ class GradioApp: def __init__( - self, - endpoint: str, - gladiator_num: int = 2, - max_model_num: int = 3, - use_launched_model: bool = False, + self, + endpoint: str, + gladiator_num: int = 2, + max_model_num: int = 3, + use_launched_model: bool = False, ): self._api = RESTfulClient(endpoint) self._gladiator_num = gladiator_num @@ -33,11 +33,11 @@ def __init__( self._locale = Locale() def _create_model( - self, - model_name: str, - model_size_in_billions: Optional[int] = None, - model_format: Optional[str] = None, - quantization: Optional[str] = None, + self, + model_name: str, + model_size_in_billions: Optional[int] = None, + model_format: Optional[str] = None, + quantization: Optional[str] = None, ): models = self._api.list_models() if len(models) >= self._max_model_num: @@ -47,15 +47,15 @@ def _create_model( ) async def generate( - self, - model: str, - message: str, - chat: List[List[str]], - max_token: int, - temperature: float, - top_p: float, - window_size: int, - show_finish_reason: bool, + self, + model: str, + message: str, + chat: List[List[str]], + max_token: int, + temperature: float, + top_p: float, + window_size: int, + show_finish_reason: bool, ): if not message: yield message, chat @@ -76,7 +76,7 @@ async def generate( history.append({"role": "assistant", "content": out}) if window_size != 0: - history = history[-(window_size // 2):] + history = history[-(window_size // 2) :] # chatglm only support even number of conversation history. 
if len(history) % 2 != 0: @@ -250,7 +250,7 @@ def select_model_format(model_name: str, model_format: str): ) def select_model_size( - model_name: str, model_format: str, model_size_in_billions: str + model_name: str, model_format: str, model_size_in_billions: str ): if model_name: model_family = MODEL_TO_FAMILIES[model_name] @@ -259,8 +259,8 @@ def select_model_size( quantization for spec in model_family.model_specs if spec.model_format == model_format - and str(spec.model_size_in_billions) - == model_size_in_billions + and str(spec.model_size_in_billions) + == model_size_in_billions for quantization in spec.quantizations } ) @@ -293,11 +293,11 @@ def select_model_size( chat, model_uid = components[1], components[-1] def select_model( - _model_name: str, - _model_format: str, - _model_size_in_billions: str, - _quantization: str, - progress=gr.Progress(track_tqdm=True), + _model_name: str, + _model_format: str, + _model_size_in_billions: str, + _quantization: str, + progress=gr.Progress(track_tqdm=True), ): match_result = match_llm( _model_name, @@ -325,10 +325,10 @@ def select_model( ), gr.Textbox.update(value=model_uid) def clear_chat( - _model_name: str, - _model_format: str, - _model_size_in_billions: str, - _quantization: str, + _model_name: str, + _model_format: str, + _model_size_in_billions: str, + _quantization: str, ): full_name = "-".join( [_model_name, _model_size_in_billions, _model_format, _quantization] diff --git a/examples/gradio_chatinterface.py b/examples/gradio_chatinterface.py index 6f6a9ccd8c..41f99c4a79 100644 --- a/examples/gradio_chatinterface.py +++ b/examples/gradio_chatinterface.py @@ -101,7 +101,6 @@ def to_chat(lst: List[str]) -> List[Dict[str, str]]: ) return res - def generate_wrapper(message: str, history: List[List[str]]) -> str: output = model.chat( prompt=message, @@ -110,7 +109,6 @@ def generate_wrapper(message: str, history: List[List[str]]) -> str: ) return output["choices"][0]["message"]["content"] - demo = gr.ChatInterface( 
fn=generate_wrapper, analytics_enabled=False, From 9420aa763119ad126346aee7f0599290a0f9fce4 Mon Sep 17 00:00:00 2001 From: Bojun-Feng Date: Fri, 11 Aug 2023 17:42:16 +0800 Subject: [PATCH 4/8] update instruction --- examples/gradio_arena.py | 2 +- examples/gradio_chatinterface.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gradio_arena.py b/examples/gradio_arena.py index 740aac21bc..2d0e7b4e67 100644 --- a/examples/gradio_arena.py +++ b/examples/gradio_arena.py @@ -413,7 +413,7 @@ def build(self): 4. In the output, locate the endpoint. In the above case it is `http://localhost:9997` 5. Run this python file in new terminal window, change the endpoint accordingly - example (feel free to copy): + example run command (feel free to copy): python gradio_arena.py \\ --endpoint http://localhost:9997 diff --git a/examples/gradio_chatinterface.py b/examples/gradio_chatinterface.py index 41f99c4a79..7bd8a8ab94 100644 --- a/examples/gradio_chatinterface.py +++ b/examples/gradio_chatinterface.py @@ -25,7 +25,7 @@ 4. In the output, locate the endpoint. In the above case it is `http://localhost:9997` 5. 
Run this python file in new terminal window, change the endpoint accordingly - example (feel free to copy): + example run command (feel free to copy): python gradio_chatinterface.py \\ --endpoint http://localhost:9997 \\ From 01f3419d1a5840f103e591f598dfc90d18d810f3 Mon Sep 17 00:00:00 2001 From: Bojun-Feng Date: Mon, 14 Aug 2023 11:33:03 +0800 Subject: [PATCH 5/8] remove model tab --- examples/gradio_arena.py | 38 ++++++++------------------------------ 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/examples/gradio_arena.py b/examples/gradio_arena.py index 2d0e7b4e67..e74826f4e3 100644 --- a/examples/gradio_arena.py +++ b/examples/gradio_arena.py @@ -9,7 +9,7 @@ from xinference.model.llm.llm_family import cache if TYPE_CHECKING: - from xinference.types import ChatCompletionChunk, ChatCompletionMessage + from xinference.types import ChatCompletionMessage MODEL_TO_FAMILIES: Dict[str, LLMFamilyV1] = dict( (model_family.model_name, model_family) @@ -86,29 +86,21 @@ async def generate( max_tokens=max_token, temperature=temperature, top_p=top_p, - stream=True, + stream=False, ) chat += [[message, ""]] - chat_generator = model_ref.chat( + chat_response = model_ref.chat( message, chat_history=history, generate_config=generate_config, ) - chunk: Optional["ChatCompletionChunk"] = None - for chunk in chat_generator: - assert chunk is not None - delta = chunk["choices"][0]["delta"] - if "content" not in delta: - continue - else: - chat[-1][1] += delta["content"] - yield "", chat - if show_finish_reason and chunk is not None: + chat[-1][1] += chat_response["choices"][0]["message"]["content"] + if show_finish_reason and chat_response is not None: chat[-1][ 1 - ] += f"[{self._locale('stop reason')}: {chunk['choices'][0]['finish_reason']}]" - yield "", chat + ] += f"[{self._locale('stop reason')}: {chat_response['choices'][0]['finish_reason']}]" + yield "", chat def _build_chatbot(self, model_uid: str, model_name: str): with gr.Accordion(self._locale("Parameters"), 
open=False): @@ -371,23 +363,9 @@ def update_message(text_in: str): gr.ClearButton(components=[msg] + chats + texts) - def _build_single(self): - chat, model_text = self._build_chat_column() - - msg = gr.Textbox(label=self._locale("Input")) - - def update_message(text_in: str): - return "", text_in - - msg.submit(update_message, inputs=[msg], outputs=[msg, model_text]) - gr.ClearButton(components=[chat, msg, model_text]) - def build(self): with gr.Blocks() as blocks: - with gr.Tab(self._locale("Chat")): - self._build_single() - with gr.Tab(self._locale("Arena")): - self._build_arena() + self._build_arena() blocks.queue(concurrency_count=40) blocks.launch() From 900aa245fd2e2e9289471cc5f534c84ab9517fdb Mon Sep 17 00:00:00 2001 From: Bojun-Feng Date: Mon, 14 Aug 2023 15:52:42 +0800 Subject: [PATCH 6/8] resolve momments --- doc/source/examples/ai_podcast.rst | 4 +- doc/source/examples/arena.rst | 33 ++++++ doc/source/examples/index.rst | 3 +- examples/LangChain_QA.ipynb | 169 ++++++++++++++++++++++++----- examples/gradio_arena.py | 13 +++ examples/gradio_chatinterface.py | 13 +++ 6 files changed, 203 insertions(+), 32 deletions(-) create mode 100644 doc/source/examples/arena.rst diff --git a/doc/source/examples/ai_podcast.rst b/doc/source/examples/ai_podcast.rst index 57ff40bca7..eb688d895d 100644 --- a/doc/source/examples/ai_podcast.rst +++ b/doc/source/examples/ai_podcast.rst @@ -74,6 +74,4 @@ Chinese (AI_Podcast_ZH.py) **Source Code** : - * `AI_Podcast `_ (English Version) - - * AI_Podcast_ZH (Chinese Version) \ No newline at end of file + * `AI_Podcast ` \ No newline at end of file diff --git a/doc/source/examples/arena.rst b/doc/source/examples/arena.rst new file mode 100644 index 0000000000..c11b789e26 --- /dev/null +++ b/doc/source/examples/arena.rst @@ -0,0 +1,33 @@ +.. 
_dual_model_chatbot: + +========================== +Example: Chatbot Arena 🤼️ +========================== + +**Description**: + +Experience the thrill of conversing with two AI models simultaneously! This interface allows you to launch two models side by side and engage them in a chat. Enter a prompt, and watch as both models respond to your inquiry 🎙️ + +**Friendly Warning**: + Please do not try to open both models together. Patiently wait until one is finished launching before starting the next. Similarly, wait until you've received responses from both models before moving on to the next question. Thank you for understanding! 🚦 + +**Used Technology**: + + @ `ggerganov `_ 's `ggml `_ + + @ `Xinference `_ as a launcher + + @ All LLaMA and Chatglm models supported by `Xorbitsio inference `_ + +**Detailed Explanation on the Demo Functionality** : + +1. Launch the two models with all required parameters selected by the user. + +2. Initialize two separate chat histories to store the context for both models. + +3. Prompt the user for input and simultaneously pass it to both models, generating individual responses. + +4. Show the outputs of both models in the interface. Respective chat histories serve as context for upcoming rounds. 
+ +**Source Code** : + * `dual_chat `_ diff --git a/doc/source/examples/index.rst b/doc/source/examples/index.rst index ef8d3596a9..ebcfd48590 100644 --- a/doc/source/examples/index.rst +++ b/doc/source/examples/index.rst @@ -9,4 +9,5 @@ Examples :hidden: ai_podcast - chatbot \ No newline at end of file + chatbot + arena \ No newline at end of file diff --git a/examples/LangChain_QA.ipynb b/examples/LangChain_QA.ipynb index 258449b888..b4aaf644a3 100644 --- a/examples/LangChain_QA.ipynb +++ b/examples/LangChain_QA.ipynb @@ -2,21 +2,33 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ - "# LangChain QA Application with Xinference and LangChain\n" + "# LangChain QA Application with Xinference and LangChain" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "This demo walks through how to build an LLM-driven question-answering (QA) application with Xinference, Milvus, and LangChain." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "## Deploy Xinference Locally or in a Distributed Cluster.\n", "\n", @@ -35,7 +47,11 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stdout", @@ -51,14 +67,22 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "The command will return a model UID for you to use." 
] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "## Prepare the Documents" ] @@ -66,7 +90,11 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "from langchain.document_loaders import TextLoader\n", @@ -86,7 +114,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "## Set Up an Embedding Model" ] @@ -94,7 +126,11 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "from langchain.embeddings import XinferenceEmbeddings\n", @@ -107,14 +143,22 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "## Connect to the Vector Database" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "For vector store, we use the Milvus vector database. [Milvus](https://milvus.io/docs/overview.md) is a database that stores, indexes, and manages massive embedding vectors generated by deep neural networks and other machine learning models. To run, you should first [Install Milvus Standalone with Docker Compose](https://milvus.io/docs/install_standalone-docker.md)." 
] @@ -125,6 +169,9 @@ "metadata": { "vscode": { "languageId": "bat" + }, + "pycharm": { + "name": "#%%\n" } }, "outputs": [], @@ -134,7 +181,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "In the same directory as the docker-compose.yml file, start up Milvus and connect to Milvus by running:" ] @@ -145,6 +196,9 @@ "metadata": { "vscode": { "languageId": "bat" + }, + "pycharm": { + "name": "#%%\n" } }, "outputs": [], @@ -156,7 +210,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "from langchain.vectorstores import Milvus\n", @@ -170,7 +228,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "## Query about the Document" ] @@ -178,7 +240,11 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stdout", @@ -199,7 +265,11 @@ { "cell_type": "code", "execution_count": 7, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "from langchain.llms import Xinference\n", @@ -212,7 +282,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "We can now create a memory object to track the chat history." ] @@ -220,7 +294,11 @@ { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "from langchain.memory import ConversationBufferMemory\n", @@ -229,7 +307,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "Now we create ConversationalRetrievalChain with chat model and the vectorstore." 
] @@ -237,7 +319,11 @@ { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "from langchain.chains import ConversationalRetrievalChain\n", @@ -250,7 +336,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "Now, we can query information from the document. Instead of simply returning identical sentences from the document, the model generates responses by summarizing relevant content. Furthermore, it can relate a new query to the chat history, creating a chain of responses that build upon each other. " ] @@ -258,7 +348,11 @@ { "cell_type": "code", "execution_count": 16, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "data": { @@ -280,7 +374,11 @@ { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "data": { @@ -302,7 +400,11 @@ { "cell_type": "code", "execution_count": 19, - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "data": { @@ -323,7 +425,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "\n", "From the second query, we can see that LLM accurately recognizes that \"he\" refers to \"the president\", and \"she\" refers to \"Ketanji Brown Jackson\" mentioned in the previous query. Moreover, even though the name of the President is not mentioned anywhere in the entire article, LLM is able to identify that the speaker of this article is President Joe Biden. Moreover, the LLM summarizes President's opinion on COVID-19 in a concise way. We can see the impressive capabilities of LLM, and LangChain's \"chaining\" feature also allows for more coherent and context-aware interactions with the model." 
@@ -331,7 +437,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "To stop Milvus and delete data after stopping Milvus, run:" ] @@ -342,6 +452,9 @@ "metadata": { "vscode": { "languageId": "bat" + }, + "pycharm": { + "name": "#%%\n" } }, "outputs": [], @@ -374,4 +487,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/examples/gradio_arena.py b/examples/gradio_arena.py index e74826f4e3..c597ce9469 100644 --- a/examples/gradio_arena.py +++ b/examples/gradio_arena.py @@ -1,3 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import uuid from typing import TYPE_CHECKING, Dict, List, Optional diff --git a/examples/gradio_chatinterface.py b/examples/gradio_chatinterface.py index 7bd8a8ab94..3d59bbb421 100644 --- a/examples/gradio_chatinterface.py +++ b/examples/gradio_chatinterface.py @@ -1,3 +1,16 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. from typing import Dict, List import gradio as gr From d47c57b0034bfc5f1f7f818ad5ab63ec4c52e3da Mon Sep 17 00:00:00 2001 From: Bojun-Feng Date: Mon, 14 Aug 2023 16:08:34 +0800 Subject: [PATCH 7/8] fix notebook --- examples/LangChain_QA.ipynb | 169 ++++++------------------------------ 1 file changed, 28 insertions(+), 141 deletions(-) diff --git a/examples/LangChain_QA.ipynb b/examples/LangChain_QA.ipynb index b4aaf644a3..258449b888 100644 --- a/examples/LangChain_QA.ipynb +++ b/examples/LangChain_QA.ipynb @@ -2,33 +2,21 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ - "# LangChain QA Application with Xinference and LangChain" + "# LangChain QA Application with Xinference and LangChain\n" ] }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "This demo walks through how to build an LLM-driven question-answering (QA) application with Xinference, Milvus, and LangChain." ] }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "## Deploy Xinference Locally or in a Distributed Cluster.\n", "\n", @@ -47,11 +35,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -67,22 +51,14 @@ }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "The command will return a model UID for you to use." 
] }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "## Prepare the Documents" ] @@ -90,11 +66,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "from langchain.document_loaders import TextLoader\n", @@ -114,11 +86,7 @@ }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "## Set Up an Embedding Model" ] @@ -126,11 +94,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "from langchain.embeddings import XinferenceEmbeddings\n", @@ -143,22 +107,14 @@ }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "## Connect to the Vector Database" ] }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "For vector store, we use the Milvus vector database. [Milvus](https://milvus.io/docs/overview.md) is a database that stores, indexes, and manages massive embedding vectors generated by deep neural networks and other machine learning models. To run, you should first [Install Milvus Standalone with Docker Compose](https://milvus.io/docs/install_standalone-docker.md)." 
] @@ -169,9 +125,6 @@ "metadata": { "vscode": { "languageId": "bat" - }, - "pycharm": { - "name": "#%%\n" } }, "outputs": [], @@ -181,11 +134,7 @@ }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "In the same directory as the docker-compose.yml file, start up Milvus and connect to Milvus by running:" ] @@ -196,9 +145,6 @@ "metadata": { "vscode": { "languageId": "bat" - }, - "pycharm": { - "name": "#%%\n" } }, "outputs": [], @@ -210,11 +156,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "from langchain.vectorstores import Milvus\n", @@ -228,11 +170,7 @@ }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "## Query about the Document" ] @@ -240,11 +178,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -265,11 +199,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "from langchain.llms import Xinference\n", @@ -282,11 +212,7 @@ }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "We can now create a memory object to track the chat history." ] @@ -294,11 +220,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "from langchain.memory import ConversationBufferMemory\n", @@ -307,11 +229,7 @@ }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "Now we create ConversationalRetrievalChain with chat model and the vectorstore." 
] @@ -319,11 +237,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [ "from langchain.chains import ConversationalRetrievalChain\n", @@ -336,11 +250,7 @@ }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "Now, we can query information from the document. Instead of simply returning identical sentences from the document, the model generates responses by summarizing relevant content. Furthermore, it can relate a new query to the chat history, creating a chain of responses that build upon each other. " ] @@ -348,11 +258,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -374,11 +280,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -400,11 +302,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [ { "data": { @@ -425,11 +323,7 @@ }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "\n", "From the second query, we can see that LLM accurately recognizes that \"he\" refers to \"the president\", and \"she\" refers to \"Ketanji Brown Jackson\" mentioned in the previous query. Moreover, even though the name of the President is not mentioned anywhere in the entire article, LLM is able to identify that the speaker of this article is President Joe Biden. Moreover, the LLM summarizes President's opinion on COVID-19 in a concise way. We can see the impressive capabilities of LLM, and LangChain's \"chaining\" feature also allows for more coherent and context-aware interactions with the model." 
@@ -437,11 +331,7 @@ }, { "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "To stop Milvus and delete data after stopping Milvus, run:" ] @@ -452,9 +342,6 @@ "metadata": { "vscode": { "languageId": "bat" - }, - "pycharm": { - "name": "#%%\n" } }, "outputs": [], @@ -487,4 +374,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} From 5b76916e11a818d103a229418456d41eaf96f79f Mon Sep 17 00:00:00 2001 From: Bojun-Feng Date: Mon, 14 Aug 2023 16:52:18 +0800 Subject: [PATCH 8/8] fix docs typo --- doc/source/examples/arena.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/examples/arena.rst b/doc/source/examples/arena.rst index c11b789e26..f5318928dd 100644 --- a/doc/source/examples/arena.rst +++ b/doc/source/examples/arena.rst @@ -8,8 +8,9 @@ Example: Chatbot Arena 🤼️ Experience the thrill of conversing with two AI models simultaneously! This interface allows you to launch two models side by side and engage them in a chat. Enter a prompt, and watch as both models respond to your inquiry 🎙️ -**Friendly Warning**: - Please do not try to open both models together. Patiently wait until one is finished launching before starting the next. Similarly, wait until you've received responses from both models before moving on to the next question. Thank you for understanding! 🚦 +**Notice**: + +Please do not try to open both models together. Patiently wait until one is finished launching before starting the next. Similarly, wait until you've received responses from both models before moving on to the next question. Thank you for understanding! 🚦 **Used Technology**: @@ -30,4 +31,4 @@ Experience the thrill of conversing with two AI models simultaneously! This inte 5. Show the outputs of both models in the interface. Respective chat histories serve as context for upcoming rounds. **Source Code** : - * `dual_chat `_ + * `arena `_