From 2b186f2703b3f3102c05abe2123f004094d5262e Mon Sep 17 00:00:00 2001
From: Ofir Zafrir
Date: Fri, 15 Mar 2024 22:11:02 +0200
Subject: [PATCH] Add Phi-2 on Intel's MTL iGPU demo notebook (#606)

* Add phi-2 notebook

* notebook update

* remove stateful setting use default instead
---
 notebooks/openvino/phi-2_on_mtl.ipynb | 583 ++++++++++++++++++++++++++
 1 file changed, 583 insertions(+)
 create mode 100644 notebooks/openvino/phi-2_on_mtl.ipynb

diff --git a/notebooks/openvino/phi-2_on_mtl.ipynb b/notebooks/openvino/phi-2_on_mtl.ipynb
new file mode 100644
index 0000000000..88f0387f05
--- /dev/null
+++ b/notebooks/openvino/phi-2_on_mtl.ipynb
@@ -0,0 +1,583 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "aeb16663-be53-4260-b62d-44611b6771ec",
+   "metadata": {},
+   "source": [
+    "# Chat and Code with Phi-2 using OpenVINO and 🤗 Optimum on Intel Meteor Lake iGPU\n",
+    "In this notebook we will show how to export Phi-2 and apply 4-bit weight-only quantization to it.\n",
+    "Then, using the quantized model running on an Intel Meteor Lake iGPU, we will generate code completions, demonstrating a good experience of running GenAI locally on an Intel PC and marking the start of the AI PC era!\n",
+    "Finally, we will show how to talk with Phi-2 in a chatbot demo running entirely locally on your laptop!\n",
+    "\n",
+    "[Phi-2](https://huggingface.co/microsoft/phi-2) is a 2.7 billion-parameter language model trained by Microsoft. In the model's release [blog post](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/), Microsoft states that Phi-2:\n",
+    "> demonstrates outstanding reasoning and language understanding capabilities, showcasing state-of-the-art performance among base language models with less than 13 billion parameters. On complex benchmarks Phi-2 matches or outperforms models up to 25x larger, thanks to new innovations in model scaling and training data curation."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "03cb49cf-bc6f-4702-a61f-227b352404cb",
+   "metadata": {},
+   "source": [
+    "## Install dependencies\n",
+    "Make sure you have the latest GPU drivers installed on your machine: https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html.\n",
+    "\n",
+    "We will start by installing the dependencies; this can be done by uncommenting the following cell and running it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "96d8203c-34c9-41a2-95bd-3891533840a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ! pip install optimum[openvino,nncf] torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5980ce40-0be1-48c1-941a-92c484d4da31",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "from transformers import AutoTokenizer\n",
+    "from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "48b81857-a095-43a3-8c8d-4c880b743a6e",
+   "metadata": {},
+   "source": [
+    "## Configuration\n",
+    "Here we will configure which model to load and other attributes. We will explain every one of them 😄\n",
+    "* `model_name`: the name or path of the model we want to export and quantize; it can be either a model on the 🤗 Hub or a local directory on your laptop.\n",
+    "* `save_name`: the directory where the exported & quantized model will be saved.\n",
+    "* `precision`: the compute data type used for inference of the model, either `f32` or `f16`. We use FP32 precision due to Phi-2 overflow issues in FP16.\n",
+    "* `quantization_config`: here we set the attributes of the weight-only quantization algorithm:\n",
+    "  * `bits`: the number of bits to use for quantization, either `8` or `4`.\n",
+    "  * `sym`: whether to use symmetric quantization, either `True` or `False`.\n",
+    "  * `group_size`: the number of weights to group together for quantization. We use groups of 128 to ensure no accuracy degradation.\n",
+    "  * `ratio`: the fraction of the model's weights to quantize to `bits` bits. The rest will be quantized to the default of `8` bits.\n",
+    "* `device`: the device to use for inference, either `cpu` or `gpu`.\n",
+    "* `stateful`: optimize the model by keeping the KV cache as part of the model's state instead of passing it as an input (we keep the default here).\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "800cd7a3-a21d-4a0a-9d73-2a2d08646f99",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_name = 'microsoft/phi-2'\n",
+    "save_name = './phi-2-woq4'\n",
+    "precision = 'f32'\n",
+    "quantization_config = OVWeightQuantizationConfig(\n",
+    "    bits=4,\n",
+    "    sym=False,\n",
+    "    group_size=128,\n",
+    "    ratio=0.8,\n",
+    ")\n",
+    "device = 'gpu'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1f398868-93d7-4c2d-9591-9bac8e9b701c",
+   "metadata": {},
+   "source": [
+    "With this configuration we expect the model size to shrink to around 1.62GB: $0.8 \\times 2.7{\\times}10^9 \\times \\frac{1}{2}\\,\\text{B} + 0.2 \\times 2.7{\\times}10^9 \\times 1\\,\\text{B} = 1.62{\\times}10^9\\,\\text{B} = 1.62\\,\\text{GB}$ (the model has 2.7 billion weights; 80% of them are stored in 4 bits = half a byte each, and the remaining 20% in 8 bits = 1 byte each)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d994997d-344c-4d6c-ab08-f78ecb7f56ec",
+   "metadata": {},
+   "source": [
+    "## Export & quantize\n",
+    "OpenVINO together with 🤗 Optimum enables you to load, export and quantize a model in a single `from_pretrained` call, making the process as simple as possible.\n",
+    "We will then save the exported & quantized model locally on our laptop. If the model was already exported and saved before, we will load the locally saved model instead."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03a308c6-27e7-4926-8ac4-4fa0c1ca68d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load kwargs\n",
+    "load_kwargs = {\n",
+    "    'device': device,\n",
+    "    'ov_config': {\n",
+    "        \"PERFORMANCE_HINT\": \"LATENCY\",\n",
+    "        \"INFERENCE_PRECISION_HINT\": precision,\n",
+    "        \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n",
+    "    },\n",
+    "    'compile': False,\n",
+    "    'quantization_config': quantization_config\n",
+    "}\n",
+    "\n",
+    "# Check whether the model was already exported\n",
+    "saved = os.path.exists(save_name)\n",
+    "\n",
+    "model = OVModelForCausalLM.from_pretrained(\n",
+    "    model_name if not saved else save_name,\n",
+    "    export=not saved,\n",
+    "    **load_kwargs,\n",
+    ")\n",
+    "\n",
+    "# Load tokenizer to be used with the model\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name if not saved else save_name)\n",
+    "\n",
+    "# Save the exported model locally\n",
+    "if not saved:\n",
+    "    model.save_pretrained(save_name)\n",
+    "    tokenizer.save_pretrained(save_name)\n",
+    "\n",
+    "# TODO Optional: export to huggingface/hub\n",
+    "\n",
+    "model_size = os.stat(os.path.join(save_name, 'openvino_model.bin')).st_size / 1024 ** 3\n",
+    "print(f'Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "592e118d-e8bb-491f-92b2-d0418e19158c",
+   "metadata": {},
+   "source": [
+    "We can see the model size was reduced to about 1.7GB, close to our estimate. After loading the model, we can switch it between devices, for example with `model.to('gpu')`.\n",
+    "Once we have finished configuring everything, we can compile the model by calling `model.compile()`, and the model will be ready for use."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3cef4dc0-191e-4755-a639-c3e8adbd18a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.compile()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dd3c467e-3bbb-4265-9075-1c6688af2f92",
+   "metadata": {},
+   "source": [
+    "## Generate using the exported model\n",
+    "We will now show an example of using our quantized Phi-2 to generate Python code. \n",
+    "Phi-2 can perform code completion: given a function's signature and docstring, the model generates the implementation of the function.\n",
+    "\n",
+    "In our example we use one of the samples from the test set of the HumanEval dataset. \n",
+    "HumanEval is a dataset used to benchmark models on code completion in Python.\n",
+    "Phi-2 has scored a remarkable result on the HumanEval dataset and is an excellent model to use for code completion.\n",
+    "\n",
+    "Note: the first run of the model might take more time due to model loading and the compilation overhead of the first inference."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4b4ea738-7db5-490e-9338-d6420b77796c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample = \"\"\"from typing import List\n",
+    "\n",
+    "\n",
+    "def has_close_elements(numbers: List[float], threshold: float) -> bool:\n",
+    "    \\\"\\\"\\\" Check if in given list of numbers, are any two numbers closer to each other than\n",
+    "    given threshold.\n",
+    "    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n",
+    "    False\n",
+    "    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n",
+    "    True\n",
+    "    \\\"\\\"\\\"\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14ffe7f9-7d93-4a49-95d8-5f2a4e400cfe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import TextStreamer\n",
+    "\n",
+    "# Tokenize the sample\n",
+    "inputs = tokenizer([sample], return_tensors='pt')\n",
+    "\n",
+    "# Call generate on the inputs\n",
+    "out = model.generate(\n",
+    "    **inputs,\n",
+    "    max_new_tokens=128,\n",
+    "    streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n",
+    "    pad_token_id=tokenizer.eos_token_id,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3f8aa25c-de59-4e79-9a1f-c03ec76d206a",
+   "metadata": {},
+   "source": [
+    "## Chatbot demo\n",
+    "We will now build a chatbot demo with Gradio using the model we just exported and quantized.\n",
+    "The chatbot will be rather simple: the user inputs a message and the model replies by generating text, using the entire chat history as the model's input.\n",
+    "\n",
+    "Many models trained for the chatbot use case use special tokens that tell the model who the current speaker is, as well as a special system message. \n",
+    "Phi-2 wasn't trained specifically for the chatbot use case and doesn't have any special tokens either; however, it has seen chats in its training data and is therefore well suited for this use case.\n",
+    "\n",
+    "The chat template we will use is rather simple (a concrete rendered prompt is shown at the end of this section):\n",
+    "```\n",
+    "User: <user message>\n",
+    "Assistant: <assistant reply>\n",
+    "User: <next user message>\n",
+    "...\n",
+    "```\n",
+    "\n",
+    "We will start by writing the core function of the chatbot, which receives the entire history of the chat and generates the assistant's response.\n",
+    "To support this core function we will build a few helper functions to prepare the input for the model and to stop generation at the right time.\n",
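+    "\n",
+    "For example, with a history containing the single user message \"What is OpenVINO?\" (one of the example prompts used further below), the template renders the following prompt, where the trailing `Assistant:` cues the model to reply:\n",
+    "```\n",
+    "User: What is OpenVINO?\n",
+    "Assistant:\n",
+    "```"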
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7e81d125-ff47-4122-853d-11a2763db146",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "from threading import Thread\n",
+    "\n",
+    "from transformers import (\n",
+    "    TextIteratorStreamer,\n",
+    "    StoppingCriteria,\n",
+    "    StoppingCriteriaList,\n",
+    "    GenerationConfig,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# Copied and modified from https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/generation.py#L13\n",
+    "class SuffixCriteria(StoppingCriteria):\n",
+    "    def __init__(self, start_length, eof_strings, tokenizer, check_fn=None):\n",
+    "        self.start_length = start_length\n",
+    "        self.eof_strings = eof_strings\n",
+    "        self.tokenizer = tokenizer\n",
+    "        if check_fn is None:\n",
+    "            check_fn = lambda decoded_generation: any(\n",
+    "                [decoded_generation.endswith(stop_string) for stop_string in self.eof_strings]\n",
+    "            )\n",
+    "        self.check_fn = check_fn\n",
+    "\n",
+    "    def __call__(self, input_ids, scores, **kwargs):\n",
+    "        \"\"\"Returns True if the generated sequence ends with any of the stop strings\"\"\"\n",
+    "        decoded_generations = self.tokenizer.batch_decode(input_ids[:, self.start_length :])\n",
+    "        return all([self.check_fn(decoded_generation) for decoded_generation in decoded_generations])\n",
+    "\n",
+    "\n",
+    "def is_partial_stop(output, stop_str):\n",
+    "    \"\"\"Check whether the output contains a partial stop str.\"\"\"\n",
+    "    for i in range(0, min(len(output), len(stop_str))):\n",
+    "        if stop_str.startswith(output[-i:]):\n",
+    "            return True\n",
+    "    return False\n",
+    "\n",
+    "\n",
+    "\n",
+    "# Set the chat template to the tokenizer. The chat template implements the simple template of\n",
+    "# User: content\n",
+    "# Assistant: content\n",
+    "# ...\n",
+    "# Read more about chat templates here https://huggingface.co/docs/transformers/main/en/chat_templating\n",
+    "tokenizer.chat_template = \"{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}\"\n",
+    "\n",
+    "\n",
+    "def prepare_history_for_model(history):\n",
+    "    \"\"\"\n",
+    "    Converts the history to a tokenized prompt in the format expected by the model.\n",
+    "    Params:\n",
+    "      history: dialogue history\n",
+    "    Returns:\n",
+    "      Tokenized prompt\n",
+    "    \"\"\"\n",
+    "    messages = []\n",
+    "    for idx, (user_msg, model_msg) in enumerate(history):\n",
+    "        # Skip the last assistant message if it's empty; the tokenizer will do the formatting\n",
+    "        if idx == len(history) - 1 and not model_msg:\n",
+    "            messages.append({'role': 'User', 'content': user_msg})\n",
+    "            break\n",
+    "        if user_msg:\n",
+    "            messages.append({'role': 'User', 'content': user_msg})\n",
+    "        if model_msg:\n",
+    "            messages.append({'role': 'Assistant', 'content': model_msg})\n",
+    "    input_token = tokenizer.apply_chat_template(\n",
+    "        messages,\n",
+    "        add_generation_prompt=True,\n",
+    "        tokenize=True,\n",
+    "        return_tensors=\"pt\",\n",
+    "        return_dict=True\n",
+    "    )\n",
+    "    return input_token\n",
+    "\n",
+    "\n",
+    "def generate(history, temperature, max_new_tokens, top_p, repetition_penalty):\n",
+    "    \"\"\"\n",
+    "    Generates the assistant's response given the chatbot history and generation parameters.\n",
+    "\n",
+    "    Params:\n",
+    "      history: conversation history formatted in pairs of user and assistant messages `[user_message, assistant_message]`\n",
+    "      temperature: parameter controlling the level of creativity in AI-generated text.\n",
+    "                   By adjusting `temperature`, you can influence the model's probability distribution, making the text more focused or diverse.\n",
+    "      max_new_tokens: the maximum number of tokens we allow the model to generate as a response.\n",
+    "      top_p: parameter controlling the range of tokens considered by the model based on their cumulative probability.\n",
+    "      repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text.\n",
+    "    Yields:\n",
+    "      Updated history and generation status.\n",
+    "    \"\"\"\n",
+    "    start = time.perf_counter()\n",
+    "    # Construct the input prompt for the model from the conversation history\n",
+    "    # and tokenize it\n",
+    "    inputs = prepare_history_for_model(history)\n",
+    "    input_length = inputs['input_ids'].shape[1]\n",
+    "    # Truncate input in case it is too long.\n",
+    "    # TODO improve this\n",
+    "    if input_length > 2000:\n",
+    "        history = [history[-1]]\n",
+    "        inputs = prepare_history_for_model(history)\n",
+    "        input_length = inputs['input_ids'].shape[1]\n",
+    "\n",
+    "    prompt_char = '▌'\n",
+    "    history[-1][1] = prompt_char\n",
+    "    yield (history, 'Status: Generating...')\n",
+    "\n",
+    "    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
+    "\n",
+    "    # Create a stopping criterion to prevent the model from playing the role of the user as well.\n",
+    "    stop_str = f'\\nUser:'\n",
+    "    stopping_criteria = StoppingCriteriaList([SuffixCriteria(input_length, [stop_str], tokenizer)])\n",
+    "    # Prepare input for generate\n",
+    "    generation_config = GenerationConfig(\n",
+    "        max_new_tokens=max_new_tokens,\n",
+    "        do_sample=temperature > 0.0,\n",
+    "        temperature=temperature if temperature > 0.0 else 1.0,\n",
+    "        repetition_penalty=repetition_penalty,\n",
+    "        top_p=top_p,\n",
+    "        eos_token_id=[tokenizer.eos_token_id],\n",
+    "        pad_token_id=tokenizer.eos_token_id,\n",
+    "    )\n",
+    "    generate_kwargs = dict(\n",
+    "        streamer=streamer,\n",
+    "        generation_config=generation_config,\n",
+    "        stopping_criteria=stopping_criteria,\n",
+    "    ) | inputs\n",
+    "\n",
+    "    t1 = Thread(target=model.generate, kwargs=generate_kwargs)\n",
+    "    t1.start()\n",
+    "\n",
+    "    # Initialize an empty string to store the generated text.\n",
+    "    partial_text = \"\"\n",
+    "    for new_text in streamer:\n",
+    "        partial_text += new_text\n",
+    "        history[-1][1] = partial_text + prompt_char\n",
+    "        # We don't yield the generated text until we are sure it is not the stop string\n",
+    "        pos = partial_text.rfind(stop_str)\n",
+    "        if pos != -1:\n",
+    "            partial_text = partial_text[:pos]\n",
+    "            break\n",
+    "        elif is_partial_stop(partial_text, stop_str):\n",
+    "            continue\n",
+    "        yield (history, 'Status: Generating...')\n",
+    "    history[-1][1] = partial_text\n",
+    "    generation_time = time.perf_counter() - start\n",
+    "    yield (history, f'Generation time: {generation_time:.2f} sec')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "29fe1ae5-9929-4789-9293-612b2062e2a8",
+   "metadata": {},
+   "source": [
+    "Next we will create the actual demo using Gradio. The layout will be very simple: a chatbot window followed by a text prompt and some controls.\n",
+    "We will also include sliders to adjust generation parameters such as the temperature and the length of the response we allow the model to generate.\n",
+    "\n",
+    "To install the Gradio dependency, please uncomment the following cell and run it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b61a9a9f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ! 
pip install gradio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ae1aa4e-3539-49a1-8f32-62b818ee1002", + "metadata": {}, + "outputs": [], + "source": [ + "import gradio as gr\n", + "\n", + "\n", + "EXAMPLES = [\n", + " [\"What is OpenVINO?\"],\n", + " [\"Can you explain to me briefly what is Python programming language?\"],\n", + " [\"Explain the plot of Cinderella in a sentence.\"],\n", + " [\"Write a Python function to perform binary search over a sorted list. Use markdown to write code\"],\n", + " [\"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?\"],\n", + "]\n", + "\n", + "\n", + "def add_user_text(message, history):\n", + " \"\"\"\n", + " Add user's message to chatbot history\n", + "\n", + " Params:\n", + " message: current user message\n", + " history: conversation history\n", + " Returns:\n", + " Updated history, clears user message and status\n", + " \"\"\"\n", + " # Append current user message to history with a blank assistant message which will be generated by the model\n", + " history.append([message, None])\n", + " return ('', history)\n", + "\n", + "\n", + "with gr.Blocks(theme=gr.themes.Soft()) as demo:\n", + " gr.Markdown('
Chat with Phi-2 on Meteor Lake iGPU
')\n", + " chatbot = gr.Chatbot()\n", + " with gr.Row():\n", + " msg = gr.Textbox(placeholder=\"Enter message here...\", show_label=False, autofocus=True, scale=75)\n", + " status = gr.Textbox(\"Status: Idle\", show_label=False, max_lines=1, scale=25)\n", + " with gr.Row():\n", + " submit = gr.Button(\"Submit\", variant='primary')\n", + " clear = gr.Button(\"Clear\")\n", + " with gr.Accordion(\"Advanced Options:\", open=False):\n", + " with gr.Row():\n", + " with gr.Column():\n", + " temperature = gr.Slider(\n", + " label=\"Temperature\",\n", + " value=0.0,\n", + " minimum=0.0,\n", + " maximum=1.0,\n", + " step=0.05,\n", + " interactive=True,\n", + " )\n", + " max_new_tokens = gr.Slider(\n", + " label=\"Max new tokens\",\n", + " value=128,\n", + " minimum=0,\n", + " maximum=512,\n", + " step=32,\n", + " interactive=True,\n", + " )\n", + " with gr.Column():\n", + " top_p = gr.Slider(\n", + " label=\"Top-p (nucleus sampling)\",\n", + " value=1.0,\n", + " minimum=0.0,\n", + " maximum=1.0,\n", + " step=0.05,\n", + " interactive=True,\n", + " )\n", + " repetition_penalty = gr.Slider(\n", + " label=\"Repetition penalty\",\n", + " value=1.0,\n", + " minimum=1.0,\n", + " maximum=2.0,\n", + " step=0.1,\n", + " interactive=True,\n", + " )\n", + " gr.Examples(\n", + " EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\"\n", + " )\n", + "\n", + " # Sets generate function to be triggered when the user submit a new message\n", + " gr.on(\n", + " triggers=[submit.click, msg.submit],\n", + " fn=add_user_text,\n", + " inputs=[msg, chatbot],\n", + " outputs=[msg, chatbot],\n", + " queue=False,\n", + " ).then(\n", + " fn=generate,\n", + " inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty],\n", + " outputs=[chatbot, status],\n", + " concurrency_limit=1,\n", + " queue=True\n", + " )\n", + " \n", + " clear.click(fn=lambda: (None, 'Status: Idle'), inputs=None, outputs=[chatbot, status], queue=False)" + ] + }, + { + "cell_type": "markdown", + "id": "1d1baf09-26f1-40ab-896c-3468b5e89fec", + "metadata": {}, + "source": [ + "That's it, all that is left is to start the demo!\n", + "\n", + "When you're done you can use `demo.close()` to close the demo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b73962d-f977-45b7-be3a-32b65e546737", + "metadata": {}, + "outputs": [], + "source": [ + "demo.launch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e26a0bc-6a78-4185-8b0c-7e9450ba5868", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# demo.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}