From 566e9ba16441b9a40ce8572569f45831ecbd6fe0 Mon Sep 17 00:00:00 2001 From: Yuwen Hu <54161268+Oscilloscope98@users.noreply.github.com> Date: Mon, 2 Sep 2024 20:49:08 +0800 Subject: [PATCH] community: add Intel GPU support to `ipex-llm` llm integration (#22458) **Description:** [IPEX-LLM](https://github.com/intel-analytics/ipex-llm) is a PyTorch library for running LLM on Intel CPU and GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max) with very low latency. This PR adds Intel GPU support to `ipex-llm` llm integration. **Dependencies:** `ipex-llm` **Contribution maintainer**: @ivy-lv11 @Oscilloscope98 **tests and docs**: - Add: langchain/docs/docs/integrations/llms/ipex_llm_gpu.ipynb - Update: langchain/docs/docs/integrations/llms/ipex_llm_gpu.ipynb - Update: langchain/libs/community/tests/llms/test_ipex_llm.py --------- Co-authored-by: ivy-lv11 --- docs/docs/integrations/llms/ipex_llm.ipynb | 313 ++++++++++++++++-- .../text_embedding/ipex_llm_gpu.ipynb | 6 +- .../langchain_community/llms/ipex_llm.py | 14 + .../integration_tests/llms/test_ipex_llm.py | 8 +- 4 files changed, 309 insertions(+), 32 deletions(-) diff --git a/docs/docs/integrations/llms/ipex_llm.ipynb b/docs/docs/integrations/llms/ipex_llm.ipynb index ba456b7608b47..40c366fd1b4ad 100644 --- a/docs/docs/integrations/llms/ipex_llm.ipynb +++ b/docs/docs/integrations/llms/ipex_llm.ipynb @@ -6,16 +6,45 @@ "source": [ "# IPEX-LLM\n", "\n", - "> [IPEX-LLM](https://github.com/intel-analytics/ipex-llm/) is a PyTorch library for running LLM on Intel CPU and GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max) with very low latency. \n", + "> [IPEX-LLM](https://github.com/intel-analytics/ipex-llm) is a PyTorch library for running LLM on Intel CPU and GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max) with very low latency.\n", "\n", - "This example goes over how to use LangChain to interact with `ipex-llm` for text generation. 
\n" + "- [IPEX-LLM on Intel GPU](#ipex-llm-on-intel-gpu)\n", + "- [IPEX-LLM on Intel CPU](#ipex-llm-on-intel-cpu)\n", + "\n", + "## IPEX-LLM on Intel GPU\n", + "\n", + "This example goes over how to use LangChain to interact with `ipex-llm` for text generation on Intel GPU. \n", + "\n", + "> **Note**\n", + ">\n", + "> It is recommended that only Windows users with Intel Arc A-Series GPU (except for Intel Arc A300-Series or Pro A60) run Jupyter notebook directly for section \"IPEX-LLM on Intel GPU\". For other cases (e.g. Linux users, Intel iGPU, etc.), it is recommended to run the code with Python scripts in terminal for best experiences.\n", + "\n", + "### Install Prerequisites\n", + "To benefit from IPEX-LLM on Intel GPUs, there are several prerequisite steps for tools installation and environment preparation.\n", + "\n", + "If you are a Windows user, visit the [Install IPEX-LLM on Windows with Intel GPU Guide](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Quickstart/install_windows_gpu.md), and follow [Install Prerequisites](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Quickstart/install_windows_gpu.md#install-prerequisites) to update GPU driver (optional) and install Conda.\n", + "\n", + "If you are a Linux user, visit the [Install IPEX-LLM on Linux with Intel GPU](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Quickstart/install_linux_gpu.md), and follow [**Install Prerequisites**](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Quickstart/install_linux_gpu.md#install-prerequisites) to install GPU driver, Intel® oneAPI Base Toolkit 2024.0, and Conda.\n", + "\n", + "### Setup\n", + "\n", + "After the prerequisites installation, you should have created a conda environment with all prerequisites installed. 
**Start the jupyter service in this conda environment**:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU langchain langchain-community" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup" + "Install IPEX-LLM for running LLMs locally on Intel GPU." ] }, { @@ -24,16 +53,41 @@ "metadata": {}, "outputs": [], "source": [ - "# Update Langchain\n", + "%pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> **Note**\n", + ">\n", + "> You can also use `https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/` as the extra-index-url.\n", "\n", - "%pip install -qU langchain langchain-community" + "### Runtime Configuration\n", + "\n", + "For optimal performance, it is recommended to set several environment variables based on your device:\n", + "\n", + "#### For Windows Users with Intel Core Ultra integrated GPU" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"SYCL_CACHE_PERSISTENT\"] = \"1\"\n", + "os.environ[\"BIGDL_LLM_XMX_DISABLED\"] = \"1\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Install IEPX-LLM for running LLMs locally on Intel CPU." 
+ "#### For Windows Users with Intel Arc A-Series GPU" ] }, { @@ -42,14 +96,23 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --pre --upgrade ipex-llm[all]" + "import os\n", + "\n", + "os.environ[\"SYCL_CACHE_PERSISTENT\"] = \"1\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Basic Usage" + "> **Note**\n", + ">\n", + "> For the first time that each model runs on Intel iGPU/Intel Arc A300-Series or Pro A60, it may take several minutes to compile.\n", + ">\n", + "> For other GPU type, please refer to [here](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Overview/install_gpu.md#runtime-configuration) for Windows users, and [here](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Overview/install_gpu.md#runtime-configuration-1) for Linux users.\n", + "\n", + "\n", + "### Basic Usage\n" ] }, { @@ -88,7 +151,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Load the model locally using IpexLLM using `IpexLLM.from_model_id`. It will load the model directly in its Huggingface format and convert it automatically to low-bit format for inference." + "Load the model locally using IpexLLM using `IpexLLM.from_model_id`. It will load the model directly in its Huggingface format and convert it automatically to low-bit format for inference. Set `device` to `\"xpu\"` in `model_kwargs` when initializing IpexLLM in order to load the LLM model to Intel GPU." 
] }, { @@ -99,7 +162,12 @@ "source": [ "llm = IpexLLM.from_model_id(\n", " model_id=\"lmsys/vicuna-7b-v1.5\",\n", - " model_kwargs={\"temperature\": 0, \"max_length\": 64, \"trust_remote_code\": True},\n", + " model_kwargs={\n", + " \"temperature\": 0,\n", + " \"max_length\": 64,\n", + " \"trust_remote_code\": True,\n", + " \"device\": \"xpu\",\n", + " },\n", ")" ] }, @@ -107,7 +175,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Use it in Chains:" + "Use it in Chains" ] }, { @@ -126,8 +194,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Save/Load Low-bit Model\n", - "Alternatively, you might save the low-bit model to disk once and use `from_model_id_low_bit` instead of `from_model_id` to reload it for later use - even across different machines. It is space-efficient, as the low-bit model demands significantly less disk space than the original model. And `from_model_id_low_bit` is also more efficient than `from_model_id` in terms of speed and memory usage, as it skips the model conversion step." + "### Save/Load Low-bit Model\n", + "Alternatively, you might save the low-bit model to disk once and use `from_model_id_low_bit` instead of `from_model_id` to reload it for later use - even across different machines. It is space-efficient, as the low-bit model demands significantly less disk space than the original model. And `from_model_id_low_bit` is also more efficient than `from_model_id` in terms of speed and memory usage, as it skips the model conversion step. You can similarly set `device` to `\"xpu\"` in `model_kwargs` in order to load the LLM model to Intel GPU. " ] }, { @@ -156,6 +224,209 @@ "> Note that the saved path for the low-bit model only includes the model itself but not the tokenizers. If you wish to have everything in one place, you will need to manually download or copy the tokenizer files from the original model's directory to the location where the low-bit model is saved." 
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm_lowbit = IpexLLM.from_model_id_low_bit(\n", + " model_id=saved_lowbit_model_path,\n", + " tokenizer_id=\"lmsys/vicuna-7b-v1.5\",\n", + " # tokenizer_id=saved_lowbit_model_path, # copy the tokenizers to saved path if you want to use it this way\n", + " model_kwargs={\n", + " \"temperature\": 0,\n", + " \"max_length\": 64,\n", + " \"trust_remote_code\": True,\n", + " \"device\": \"xpu\",\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the loaded model in Chains:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm_chain = prompt | llm_lowbit\n", + "\n", + "\n", + "question = \"What is AI?\"\n", + "output = llm_chain.invoke(question)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## IPEX-LLM on Intel CPU\n", + "\n", + "This example goes over how to use LangChain to interact with `ipex-llm` for text generation on Intel CPU.\n", + "\n", + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Update Langchain\n", + "\n", + "%pip install -qU langchain langchain-community" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install IPEX-LLM for running LLMs locally on Intel CPU:\n", + "\n", + "#### For Windows users:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --pre --upgrade ipex-llm[all]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### For Linux users:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --pre --upgrade ipex-llm[all] --extra-index-url https://download.pytorch.org/whl/cpu" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "### Basic Usage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "from langchain.chains import LLMChain\n", + "from langchain_community.llms import IpexLLM\n", + "from langchain_core.prompts import PromptTemplate\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning, message=\".*padding_mask.*\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify the prompt template for your model. In this example, we use the [vicuna-1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) model. If you're working with a different model, choose a proper template accordingly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "template = \"USER: {question}\\nASSISTANT:\"\n", + "prompt = PromptTemplate(template=template, input_variables=[\"question\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the model locally using IpexLLM using `IpexLLM.from_model_id`. It will load the model directly in its Huggingface format and convert it automatically to low-bit format for inference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm = IpexLLM.from_model_id(\n", + " model_id=\"lmsys/vicuna-7b-v1.5\",\n", + " model_kwargs={\"temperature\": 0, \"max_length\": 64, \"trust_remote_code\": True},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use it in Chains:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm_chain = prompt | llm\n", + "\n", + "question = \"What is AI?\"\n", + "output = llm_chain.invoke(question)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save/Load Low-bit Model\n", + "\n", + "Alternatively, you might save the low-bit model to disk once and use `from_model_id_low_bit` instead of `from_model_id` to reload it for later use - even across different machines. It is space-efficient, as the low-bit model demands significantly less disk space than the original model. And `from_model_id_low_bit` is also more efficient than `from_model_id` in terms of speed and memory usage, as it skips the model conversion step.\n", + "\n", + "To save the low-bit model, use `save_low_bit` as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "saved_lowbit_model_path = \"./vicuna-7b-1.5-low-bit\" # path to save low-bit model\n", + "llm.model.save_low_bit(saved_lowbit_model_path)\n", + "del llm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the model from saved lowbit model path as follows.\n", + "\n", + "> Note that the saved path for the low-bit model only includes the model itself but not the tokenizers. If you wish to have everything in one place, you will need to manually download or copy the tokenizer files from the original model's directory to the location where the low-bit model is saved." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -192,22 +463,8 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" + "name": "python" } }, "nbformat": 4, diff --git a/docs/docs/integrations/text_embedding/ipex_llm_gpu.ipynb b/docs/docs/integrations/text_embedding/ipex_llm_gpu.ipynb index 1e051e71aa665..cca30c6745eea 100644 --- a/docs/docs/integrations/text_embedding/ipex_llm_gpu.ipynb +++ b/docs/docs/integrations/text_embedding/ipex_llm_gpu.ipynb @@ -17,9 +17,9 @@ "## Install Prerequisites\n", "To benefit from IPEX-LLM on Intel GPUs, there are several prerequisite steps for tools installation and environment preparation.\n", "\n", - "If you are a Windows user, visit the [Install IPEX-LLM on Windows with Intel GPU Guide](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/install_windows_gpu.html), and follow [Install Prerequisites](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/install_windows_gpu.html#install-prerequisites) to update GPU driver (optional) and install Conda.\n", + "If you are a Windows user, visit the [Install IPEX-LLM on Windows with Intel GPU Guide](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Quickstart/install_windows_gpu.md), and follow [Install Prerequisites](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Quickstart/install_windows_gpu.md#install-prerequisites) to update GPU driver (optional) and install Conda.\n", "\n", - "If you are a Linux user, visit the [Install IPEX-LLM on Linux with Intel GPU](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/install_linux_gpu.html), and follow [**Install 
Prerequisites**](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Quickstart/install_linux_gpu.html#install-prerequisites) to install GPU driver, Intel® oneAPI Base Toolkit 2024.0, and Conda.\n", + "If you are a Linux user, visit the [Install IPEX-LLM on Linux with Intel GPU](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Quickstart/install_linux_gpu.md), and follow [**Install Prerequisites**](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Quickstart/install_linux_gpu.md#install-prerequisites) to install GPU driver, Intel® oneAPI Base Toolkit 2024.0, and Conda.\n", "\n", "## Setup\n", "\n", @@ -105,7 +105,7 @@ ">\n", "> For the first time that each model runs on Intel iGPU/Intel Arc A300-Series or Pro A60, it may take several minutes to compile.\n", ">\n", - "> For other GPU type, please refer to [here](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html#runtime-configuration) for Windows users, and [here](https://ipex-llm.readthedocs.io/en/latest/doc/LLM/Overview/install_gpu.html#id5) for Linux users.\n", + "> For other GPU type, please refer to [here](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Overview/install_gpu.md#runtime-configuration) for Windows users, and [here](https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Overview/install_gpu.md#runtime-configuration-1) for Linux users.\n", "\n", "\n", "## Basic Usage\n", diff --git a/libs/community/langchain_community/llms/ipex_llm.py b/libs/community/langchain_community/llms/ipex_llm.py index 3831e3e89922d..c1f7f40c4e887 100644 --- a/libs/community/langchain_community/llms/ipex_llm.py +++ b/libs/community/langchain_community/llms/ipex_llm.py @@ -139,6 +139,16 @@ def _load_model( kwargs = kwargs or {} _tokenizer_id = tokenizer_id or model_id + # Set "cpu" as default device + if "device" not in _model_kwargs: + _model_kwargs["device"] = "cpu" + + if _model_kwargs["device"] not in ["cpu", "xpu"]: + raise ValueError( + 
"IpexLLMBgeEmbeddings currently only supports device to be " + f"'cpu' or 'xpu', but you have: {_model_kwargs['device']}." + ) + device = _model_kwargs.pop("device") try: tokenizer = AutoTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs) @@ -186,6 +196,8 @@ def _load_model( model_kwargs=_model_kwargs, ) + model.to(device) + return cls( model_id=model_id, model=model, @@ -235,6 +247,7 @@ def _call( from transformers import TextStreamer input_ids = self.tokenizer.encode(prompt, return_tensors="pt") + input_ids = input_ids.to(self.model.device) streamer = TextStreamer( self.tokenizer, skip_prompt=True, skip_special_tokens=True ) @@ -261,6 +274,7 @@ def _call( return text else: input_ids = self.tokenizer.encode(prompt, return_tensors="pt") + input_ids = input_ids.to(self.model.device) if stop is not None: from transformers.generation.stopping_criteria import ( StoppingCriteriaList, diff --git a/libs/community/tests/integration_tests/llms/test_ipex_llm.py b/libs/community/tests/integration_tests/llms/test_ipex_llm.py index 89c1d6ee07ecc..0fc2b5caa5331 100644 --- a/libs/community/tests/integration_tests/llms/test_ipex_llm.py +++ b/libs/community/tests/integration_tests/llms/test_ipex_llm.py @@ -13,12 +13,18 @@ not model_ids_to_test, reason="TEST_IPEXLLM_MODEL_IDS environment variable not set." ) model_ids_to_test = [model_id.strip() for model_id in model_ids_to_test.split(",")] # type: ignore +device = os.getenv("TEST_IPEXLLM_MODEL_DEVICE") or "cpu" def load_model(model_id: str) -> Any: llm = IpexLLM.from_model_id( model_id=model_id, - model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True}, + model_kwargs={ + "temperature": 0, + "max_length": 16, + "trust_remote_code": True, + "device": device, + }, ) return llm