diff --git a/.github/actions/ollama-setup/action.yml b/.github/actions/ollama-setup/action.yml new file mode 100644 index 0000000..6d328db --- /dev/null +++ b/.github/actions/ollama-setup/action.yml @@ -0,0 +1,15 @@ +name: 'Additional Test Setup' +description: 'Additional Test Setup for Ollama' +runs: + using: "composite" + steps: + - name: Install Ollama and start server + shell: bash + run: | + curl -fsSL https://ollama.com/install.sh | sudo -E sh + + - name: Wait for Ollama server + shell: bash + run: | + sleep 10 + time curl -i http://localhost:11434 diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 4de764d..136be42 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -7,5 +7,6 @@ updates: - package-ecosystem: "github-actions" directories: - "/" + - ".github/actions/ollama-setup" schedule: interval: "daily" diff --git a/.github/notebook_lists/ollama_notebooks.txt b/.github/notebook_lists/ollama_notebooks.txt new file mode 100644 index 0000000..8d7ac91 --- /dev/null +++ b/.github/notebook_lists/ollama_notebooks.txt @@ -0,0 +1 @@ +recipes/Intrinsics/Granite_RAG_LoRA.ipynb diff --git a/.github/workflows/notebooks.yaml b/.github/workflows/notebooks.yaml index 641e7e3..45eb3f0 100644 --- a/.github/workflows/notebooks.yaml +++ b/.github/workflows/notebooks.yaml @@ -113,11 +113,27 @@ jobs: ref: ${{ needs.test_workflow_ready.outputs.ref }} all: ${{ fromJSON(needs.test_workflow_ready.outputs.all) }} + ollama: + needs: + - test_workflow_ready + permissions: + contents: read + uses: ibm-granite-community/utils/.github/workflows/test_notebook.yaml@main + with: + notebook-lists: | + .github/notebook_lists/ollama_notebooks.txt + python-versions: >- + 3.11 + ref: ${{ needs.test_workflow_ready.outputs.ref }} + all: ${{ fromJSON(needs.test_workflow_ready.outputs.all) }} + action: .github/actions/ollama-setup + test_workflow_complete: needs: - test_workflow_ready - vanilla - pdl + - ollama permissions: pull-requests: write statuses: write diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 77f0917..69bdba2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ # Run `pre-commit install` to activate pre-commit hooks repos: - repo: https://github.com/kynan/nbstripout - rev: 0.7.1 + rev: 0.8.1 hooks: - id: nbstripout args: [ diff --git a/README.md b/README.md index 45d8ea8..8ec7635 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,12 @@ The "Recipes" in the Granite Snack Cookbook showcase the essential capabilities Open In Colab --> (Not yet available in Colab, because the latter uses Python 3.10) +### Intrinsic Functions + +1. Retrieval Augmented Generation (RAG) with Granite RAG 3.0 8b + 1. [Notebook that uses Ollama](recipes/Intrinsics/Granite_RAG_LoRA.ipynb) (Not available in Colab, because it requires a local Ollama server.) + 1. [Notebook that uses Hugging Face Transformers and PEFT Libraries](recipes/Intrinsics/Granite_RAG_LoRA_HF.ipynb) (Not available in Colab, because it requires more than 16 GB of memory.) + ### Tuning 1. 
[Fine Tuning Granite](recipes/Fine_Tuning/Finetuning_Granite_Pirate_Style.ipynb) @@ -89,4 +95,4 @@ All content in these repositories including code has been provided by IBM under [CoC]: https://github.com/ibm-granite-cookbooks/community/blob/main/CODE_OF_CONDUCT.md [CG]: https://github.com/ibm-granite-cookbooks/community/blob/main/CONTRIBUTING.md [CG-legal]: https://github.com/ibm-granite-cookbooks/community/blob/main/CONTRIBUTING.md#legal -[CG-signing]: https://github.com/ibm-granite-cookbooks/community/blob/main/CONTRIBUTING.md#signing-commits \ No newline at end of file +[CG-signing]: https://github.com/ibm-granite-cookbooks/community/blob/main/CONTRIBUTING.md#signing-commits diff --git a/recipes/Intrinsics/.gitignore b/recipes/Intrinsics/.gitignore new file mode 100644 index 0000000..a601498 --- /dev/null +++ b/recipes/Intrinsics/.gitignore @@ -0,0 +1,3 @@ +llama.cpp/ +*.gguf +Modelfile diff --git a/recipes/Intrinsics/Granite_RAG_LoRA.ipynb b/recipes/Intrinsics/Granite_RAG_LoRA.ipynb new file mode 100644 index 0000000..c75331a --- /dev/null +++ b/recipes/Intrinsics/Granite_RAG_LoRA.ipynb @@ -0,0 +1,676 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Retrieval Augmented Generation (RAG) with Granite RAG 3.0 8b using Ollama\n", + "\n", + "*Using IBM Granite Models*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## In this notebook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook contains instructions for performing Retrieval Augmented Generation (RAG) using the Granite RAG 3.0 8b LoRA adapter using Ollama.\n", + "\n", + "RAG is an architectural pattern that can be used to augment the performance of language models by recalling factual information from a knowledge base, and adding that information to the model query. The most common approach in RAG is to create dense vector representations of the knowledge base in order to retrieve text chunks that are semantically similar to a given user query.\n", + "\n", + "The Granite RAG 3.0 8b LoRA adapter adds hallucination detection and citation generation capabilities.\n", + "\n", + "RAG use cases include:\n", + "- Customer service: Answering questions about a product or service using facts from the product documentation.\n", + "- Domain knowledge: Exploring a specialized domain (e.g., finance) using facts from papers or articles in the knowledge base.\n", + "- News chat: Chatting about current events by calling up relevant recent news articles.\n", + "\n", + "In its simplest form, RAG requires 3 steps:\n", + "\n", + "- Initial setup:\n", + " - Index knowledge-base passages for efficient retrieval. In this recipe, we take embeddings of the passages using an embeddings model, and store them in a vector database.\n", + "- Upon each user query:\n", + " - Retrieve relevant passages from the database. In this recipe, we use an embedding of the query to retrieve semantically similar passages.\n", + " - Generate a response by feeding the retrieved passages into a large language model, along with the user query." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Setting up the environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ensure you are running python 3.10 or 3.11 in a freshly-created virtual environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "assert sys.version_info >= (3, 10) and sys.version_info < (3, 12), \"Use Python 3.10 or 3.11 to run this notebook.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Install dependencies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The LoRA adapter for Granite RAG 3.0 8b is available from [Hugging Face](https://huggingface.co/ibm-granite/granite-rag-3.0-8b-lora). In this recipe, we need to download it and convert the format to GGUF for use with Ollama.\n", + "\n", + "Granite utils includes some helpful functions. We also use the llama.cpp project for GGUF file conversion." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "! git clone --depth 1 https://github.com/ggerganov/llama.cpp.git\n", + "! pip install \"git+https://github.com/ibm-granite-community/utils.git\" \\\n", + " \"huggingface_hub\" \\\n", + " \"langchain_community\" \\\n", + " \"langchain_ollama\" \\\n", + " langchain-milvus \\\n", + " docling \\\n", + " -r llama.cpp/requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Selecting System Components" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Setup Granite with the Granite RAG LoRA adapter" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Granite RAG 3.0 8b LoRA adapter is built for the Granite 3.0 8b Instruct model.\n", + "\n", + "We will use Ollama to configure Granite 3.0 8b Instruct as the base model and Granite RAG 3.0 8b as the LoRA adapter.\n", + "\n", + "First make sure you have Ollama installed and `ollama serve` is running. Then we will download the Granite 3.0 8b Instruct model to use as the base model. The `granite3-dense:8b-instruct-fp16` model is in `fp16` format and is about 16 GB in size. The `granite3-dense:8b` model is quantized down to 4-bit format and is about 5 GB in size.\n", + "So you can use either model depending on how much memory you have available on your system." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Use fp16 or quantized model\n", + "fp16 = False\n", + "if fp16:\n", + " granite3_model = 'granite3-dense:8b-instruct-fp16'\n", + " granite3_rag_model = 'granite3-rag:8b-fp16'\n", + "else:\n", + " granite3_model = 'granite3-dense:8b'\n", + " granite3_rag_model = 'granite3-rag:8b'\n", + "! ollama pull {granite3_model}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also need to convert the LoRA adapter to fp16 in the GGUF format from safetensors. 
So we will download the safetensors of the LoRA adapter and the safetensors configuration of the base model using the `huggingface_hub` API." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import huggingface_hub\n", + "\n", + "lora_folder = huggingface_hub.snapshot_download(repo_id=\"ibm-granite/granite-rag-3.0-8b-lora\")\n", + "base_folder = huggingface_hub.snapshot_download(repo_id=\"ibm-granite/granite-3.0-8b-instruct\", allow_patterns=\"*.json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "The download commands will display the folders into which the safetensors were downloaded. These folder names are needed for the conversion command. We will use the `convert_lora_to_gguf.py` command from the `llama.cpp` project to convert the LoRA adapter.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lora_gguf = \"granite-rag-3.0-8b-lora-fp16.gguf\"\n", + "!python3 llama.cpp/convert_lora_to_gguf.py --outtype f16 --outfile {lora_gguf} --base {base_folder} -- {lora_folder}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Create a `Modelfile` for Ollama to use the base model and the LoRA adapter together. This will use the base model we previously pulled along with the GGUF version of the LoRA adapter we just created.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"Modelfile\", \"w\") as modelfile:\n", + " modelfile.write(f\"\"\"\\\n", + "FROM {granite3_model}\n", + "ADAPTER {lora_gguf}\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Finally, we create a model in Ollama." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! ollama create {granite3_rag_model} -f Modelfile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Now we can use Granite RAG 3.0 8b!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Choose your Embeddings Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify the model to use for generating embedding vectors from text.\n", + "\n", + "You will need to download the embeddings model. First make sure you have Ollama installed and `ollama serve` is running." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = \"nomic-embed-text\"\n", + "! ollama pull {embeddings}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "To use a model from another provider, replace this code cell with one from [this Embeddings Model recipe](https://github.com/ibm-granite-community/granite-kitchen/blob/main/recipes/Components/Langchain_Embeddings_Models.ipynb)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_ollama.embeddings import OllamaEmbeddings\n", + "\n", + "embeddings_model = OllamaEmbeddings(model=embeddings)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Choose your Vector Database" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify the database to use for storing and retrieving embedding vectors.\n", + "\n", + "To connect to a vector database other than Milvus, replace this code cell with one from [this Vector Store recipe](https://github.com/ibm-granite-community/granite-kitchen/blob/main/recipes/Components/Langchain_Vector_Stores.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_milvus import Milvus\n", + "import tempfile\n", + "\n", + "db_file = tempfile.NamedTemporaryFile(prefix=\"milvus_\", suffix=\".db\", delete=False).name\n", + "print(f\"The vector database will be saved to {db_file}\")\n", + "\n", + "vector_db = Milvus(\n", + " embedding_function=embeddings_model,\n", + " connection_args={\"uri\": db_file},\n", + " auto_id=True,\n", + " enable_dynamic_field=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use the Granite RAG 3.0 8b model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a model object for the Granite RAG 3.0 8b model in Ollama." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_ollama.llms import OllamaLLM\n", + "\n", + "model = OllamaLLM(model=granite3_rag_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Building the Vector Database" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, from a set of source documents, we use [Docling](https://github.com/DS4SD/docling) to convert the documents into text and then split the text into chunks, derive embedding vectors using the embedding model, and load it into the vector database for querying." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use Docling to download the documents, convert to text, and split into chunks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Here we use a set of web pages about IBM and the US Open. For each source web page, we convert the web page into a DoclingDocument and then chunk the DoclingDocument. Finally LangChain Documents are created for all the chunks labeled text or paragraph. The Documents are annotated with metadata to define a unique document id and the source of the document." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from docling.document_converter import DocumentConverter\n", + "from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker\n", + "from docling_core.types.doc.labels import DocItemLabel\n", + "from langchain_core.documents import Document\n", + "\n", + "sources = [\n", + " \"https://www.ibm.com/case-studies/us-open\",\n", + " \"https://www.ibm.com/sports/usopen\",\n", + " \"https://newsroom.ibm.com/US-Open-AI-Tennis-Fan-Engagement\",\n", + " \"https://newsroom.ibm.com/2024-08-15-ibm-and-the-usta-serve-up-new-and-enhanced-generative-ai-features-for-2024-us-open-digital-platforms\",\n", + "]\n", + "\n", + "converter = DocumentConverter()\n", + "i = 0\n", + "texts: list[Document] = [\n", + " Document(page_content=chunk.text, metadata={\"doc_id\": (i:=i+1), \"source\": source})\n", + " for source in sources\n", + " for chunk in HierarchicalChunker().chunk(converter.convert(source=source).document)\n", + " if any(filter(lambda c: c.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH], iter(chunk.meta.doc_items)))\n", + "]\n", + "\n", + "print(f\"{len(texts)} documents created\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Populate the vector database\n", + "\n", + "NOTE: Population of the vector database may take over a minute depending on your embedding model and service." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ids = vector_db.add_documents(texts)\n", + "print(f\"{len(ids)} documents added to the vector database\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Querying the Vector Database" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define the query to use for the RAG operation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"How did IBM use watsonx at the 2024 US Open Tennis Championship?\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conduct a similarity search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Search the database for similar documents by proximity of the embedded vector in vector space to demonstrate the similarity search used during the RAG operation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs = vector_db.similarity_search(query)\n", + "print(f\"{len(docs)} documents returned\")\n", + "for d in docs:\n", + " print(f\"doc_id={d.metadata['doc_id']} {d.page_content}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Answering Questions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the prompt for Granite RAG 3.0 8b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For Granite RAG 3.0 8b, we construct the prompt in a specific JSON format which includes the retrieved documents and metadata about the information to be included in the response. The values for `input` (the question), `hallucination_tags`, and `citations` are supplied when the chain is invoked." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.prompts import PromptTemplate\n", + "\n", + "# Create a prompt template for question-answering with the retrieved context.\n", + "prompt_template = PromptTemplate.from_template(template=\"\"\"<|start_of_role|>system<|end_of_role|>\\\n", + "{{\n", + " \"instruction\": \"Respond to the user's latest question based solely on the information provided in the documents.\n", + "Ensure that your response is strictly aligned with the facts in the provided documents.\n", + "If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.\n", + "Make sure that your response follows the attributes mentioned in the 'meta' field.\",\n", + " \"documents\": [{context}],\n", + " \"meta\": {{\n", + " \"hallucination_tags\": {hallucination_tags},\n", + " \"citations\": {citations}\n", + " }}\n", + "}}<|end_of_text|>\n", + "<|start_of_role|>user<|end_of_role|>{input}\"\"\")\n", + "\n", + "# Create a document prompt template to wrap each retrieved document\n", + "document_prompt_template = PromptTemplate.from_template(template=\"\"\"\\\n", + "{{\"doc_id\": {doc_id}, \"text\": \"{page_content}\"}}\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Automate the RAG pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "We now build a RAG chain with the model, the document retriever, and the prompts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains.retrieval import create_retrieval_chain\n", + "from langchain.chains.combine_documents import create_stuff_documents_chain\n", + "\n", + "# Assemble the retrieval-augmented generation chain.\n", + "combine_docs_chain = create_stuff_documents_chain(model, prompt_template, document_prompt=document_prompt_template, document_separator=\",\")\n", + "rag_chain = create_retrieval_chain(vector_db.as_retriever(), combine_docs_chain)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Generate a retrieval-augmented response to a question" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Use the RAG chain to process a question. The document chunks relevant to that question are retrieved and used as context. The response from Granite RAG 3.0 8b is a JSON document. This cell then parses the JSON document to retrieve the sentences of the response along with metadata about each sentence, which can be used to guide the displayed output." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import json\n", + "from langchain_core.utils.json import parse_json_markdown\n", + "\n", + "output = rag_chain.invoke({\"input\": query, \"hallucination_tags\": \"true\", \"citations\": \"false\"})\n", + "\n", + "print(f\"Question:\\n{output['input']}\")\n", + "print(\"\\nAnswer:\")\n", + "try:\n", + " responses = parse_json_markdown(output['answer'])\n", + " need_footnote = False\n", + " for response in responses:\n", + " sentence = response.get(\"sentence\")\n", + " meta = response.get(\"meta\", {})\n", + " hallucination_level = meta.get(\"hallucination_level\", \"low\")\n", + " match hallucination_level:\n", + " case \"low\" | \"unanswerable\":\n", + " print(sentence)\n", + " case \"high\" | _:\n", + " need_footnote = True\n", + " print(sentence, \"¹\", sep=\"\")\n", + " if need_footnote:\n", + " print(\"\\n¹ Warning: the sentence was not generated using the retrieved documents.\")\n", + " print(\"\\nOriginal response in JSON format:\")\n", + " print(json.dumps(responses, indent=2))\n", + "except json.JSONDecodeError:\n", + " print(\"\\nOriginal response which was unable to be parsed as JSON:\")\n", + " print(output['answer'])" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/recipes/Intrinsics/Granite_RAG_LoRA_HF.ipynb b/recipes/Intrinsics/Granite_RAG_LoRA_HF.ipynb new file mode 100644 index 0000000..12d96a3 --- /dev/null +++ b/recipes/Intrinsics/Granite_RAG_LoRA_HF.ipynb @@ -0,0 +1,515 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "# Retrieval Augmented Generation (RAG) with Granite RAG 3.0 8b using Hugging Face Transformers and PEFT Libraries\n", + "\n", + "*Using IBM Granite Models*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## In this notebook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook contains instructions for performing Retrieval Augmented Generation (RAG) using the [Granite RAG 3.0 8b LoRA adapter](https://huggingface.co/ibm-granite/granite-rag-3.0-8b-lora) using the Hugging Face Transformers and PEFT libraries.\n", + "\n", + "RAG is an architectural pattern that can be used to augment the performance of language models by recalling factual information from a knowledge base, and adding that information to the model query. 
The most common approach in RAG is to create dense vector representations of the knowledge base in order to retrieve text chunks that are semantically similar to a given user query.\n", + "\n", + "The Granite RAG 3.0 8b LoRA adapter adds hallucination detection and citation generation capabilities.\n", + "\n", + "RAG use cases include:\n", + "- Customer service: Answering questions about a product or service using facts from the product documentation.\n", + "- Domain knowledge: Exploring a specialized domain (e.g., finance) using facts from papers or articles in the knowledge base.\n", + "- News chat: Chatting about current events by calling up relevant recent news articles.\n", + "\n", + "In its simplest form, RAG requires 3 steps:\n", + "\n", + "- Initial setup:\n", + " - Index knowledge-base passages for efficient retrieval. In this recipe, we take embeddings of the passages using an embeddings model, and store them in a vector database.\n", + "- Upon each user query:\n", + " - Retrieve relevant passages from the database. In this recipe, we use an embedding of the query to retrieve semantically similar passages.\n", + " - Generate a response by feeding the retrieved passages into a large language model, along with the user query." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Setting up the environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ensure you are running python 3.10 or 3.11 in a freshly-created virtual environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "assert sys.version_info >= (3, 10) and sys.version_info < (3, 12), \"Use Python 3.10 or 3.11 to run this notebook.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Install dependencies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Granite utils includes some helpful functions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "! pip install \"git+https://github.com/ibm-granite-community/utils.git\" \\\n", + " transformers \\\n", + " peft \\\n", + " \"langchain_community\" \\\n", + " langchain-huggingface \\\n", + " langchain-milvus \\\n", + " docling" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Selecting System Components" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Choose your Embeddings Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify the model to use for generating embedding vectors from text.\n", + "\n", + "To use a model from another provider, replace this code cell with one from [this Embeddings Model recipe](https://github.com/ibm-granite-community/granite-kitchen/blob/main/recipes/Components/Langchain_Embeddings_Models.ipynb)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_huggingface import HuggingFaceEmbeddings\n", + "\n", + "embeddings_model = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Choose your Vector Database" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify the database to use for storing and retrieving embedding vectors.\n", + "\n", + "To connect to a vector database other than Milvus, replace this code cell with one from [this Vector Store recipe](https://github.com/ibm-granite-community/granite-kitchen/blob/main/recipes/Components/Langchain_Vector_Stores.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_milvus import Milvus\n", + "import tempfile\n", + "\n", + "db_file = tempfile.NamedTemporaryFile(prefix=\"milvus_\", suffix=\".db\", delete=False).name\n", + "print(f\"The vector database will be saved to {db_file}\")\n", + "\n", + "vector_db = Milvus(\n", + " embedding_function=embeddings_model,\n", + " connection_args={\"uri\": db_file},\n", + " auto_id=True,\n", + " enable_dynamic_field=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use the Granite RAG 3.0 8b model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a model object for the Granite RAG 3.0 8b model on your workstation. This can take quite a bit of memory (> 16 GB)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "from peft.peft_model import PeftModel\n", + "import torch\n", + "\n", + "device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained('ibm-granite/granite-3.0-8b-instruct', padding_side='left', trust_remote_code=True)\n", + "\n", + "model_base = AutoModelForCausalLM.from_pretrained('ibm-granite/granite-3.0-8b-instruct')\n", + "model_lora = PeftModel.from_pretrained(model_base, 'ibm-granite/granite-rag-3.0-8b-lora')\n", + "model = model_lora.to(device)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Building the Vector Database" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, from a set of source documents, we use [Docling](https://github.com/DS4SD/docling) to convert the documents into text and then split the text into chunks, derive embedding vectors using the embedding model, and load it into the vector database for querying." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use Docling to download the documents, convert to text, and split into chunks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Here we use a set of web pages about IBM and the US Open. For each source web page, we convert the web page into a DoclingDocument and then chunk the DoclingDocument. Finally LangChain Documents are created for all the chunks labeled text or paragraph. The Documents are annotated with metadata to define a unique document id and the source of the document." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from docling.document_converter import DocumentConverter\n", + "from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker\n", + "from docling_core.types.doc.labels import DocItemLabel\n", + "from langchain_core.documents import Document\n", + "\n", + "sources = [\n", + " \"https://www.ibm.com/case-studies/us-open\",\n", + " \"https://www.ibm.com/sports/usopen\",\n", + " \"https://newsroom.ibm.com/US-Open-AI-Tennis-Fan-Engagement\",\n", + " \"https://newsroom.ibm.com/2024-08-15-ibm-and-the-usta-serve-up-new-and-enhanced-generative-ai-features-for-2024-us-open-digital-platforms\",\n", + "]\n", + "\n", + "converter = DocumentConverter()\n", + "i = 0\n", + "texts: list[Document] = [\n", + " Document(page_content=chunk.text, metadata={\"doc_id\": (i:=i+1), \"source\": source})\n", + " for source in sources\n", + " for chunk in HierarchicalChunker().chunk(converter.convert(source=source).document)\n", + " if any(filter(lambda c: c.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH], iter(chunk.meta.doc_items)))\n", + "]\n", + "\n", + "print(f\"{len(texts)} documents created\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Populate the vector database\n", + "\n", + "NOTE: Population of the vector database may take over a minute depending on your embedding model and service." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ids = vector_db.add_documents(texts)\n", + "print(f\"{len(ids)} documents added to the vector database\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Querying the Vector Database" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define the query to use for the RAG operation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"How did IBM use watsonx at the 2024 US Open Tennis Championship?\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conduct a similarity search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Search the database for similar documents by proximity of the embedded vector in vector space to demonstrate the similarity search used during the RAG operation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs = vector_db.similarity_search(query)\n", + "print(f\"{len(docs)} documents returned\")\n", + "for d in docs:\n", + " print(f\"doc_id={d.metadata['doc_id']} {d.page_content}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Answering Questions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the prompt for Granite RAG 3.0 8b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For Granite RAG 3.0 8b, we construct the prompt in a specific JSON format which includes the retrieved documents and metadata about the information to be included in the response." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "hallucination_tags = \"true\"\n", + "citations = \"false\"\n", + "instruction = {\n", + " \"instruction\": \"\"\"Respond to the user's latest question based solely on the information provided in the documents.\n", + "Ensure that your response is strictly aligned with the facts in the provided documents.\n", + "If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.\n", + "Make sure that your response follows the attributes mentioned in the 'meta' field.\"\"\",\n", + " \"documents\": [\n", + " {\n", + " \"doc_id\": d.metadata['doc_id'],\n", + " \"text\": d.page_content,\n", + " }\n", + " for d in docs\n", + " ],\n", + " \"meta\": {\n", + " \"hallucination_tags\": hallucination_tags,\n", + " \"citations\": citations,\n", + " },\n", + "}\n", + "conversation = [\n", + " {\"role\": \"system\", \"content\": json.dumps(instruction)},\n", + " {\"role\": \"user\", \"content\": query},\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Generate a retrieval-augmented response to a question" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "The documents from the similarity search are used as context. The response from Granite RAG 3.0 8b is a JSON document. This cell then parses the JSON document to retrieve the sentences of the response along with metadata about each sentence, which can be used to guide the displayed output." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "from langchain_core.utils.json import parse_json_markdown\n", + "\n", + "prompt = tokenizer.apply_chat_template(conversation=conversation, tokenize=False, add_generation_prompt=True)\n", + "inputs = tokenizer(prompt, return_tensors=\"pt\")\n", + "output = model.generate(inputs[\"input_ids\"].to(device), attention_mask=inputs[\"attention_mask\"].to(device), max_new_tokens=1000)\n", + "answer = tokenizer.decode(output[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n", + "\n", + "print(f\"Question:\\n{query}\")\n", + "print(\"\\nAnswer:\")\n", + "try:\n", + " responses = parse_json_markdown(answer)\n", + " need_footnote = False\n", + " for response in responses:\n", + " sentence = response.get(\"sentence\")\n", + " meta = response.get(\"meta\", {})\n", + " hallucination_level = meta.get(\"hallucination_level\", \"low\")\n", + " match hallucination_level:\n", + " case \"low\" | \"unanswerable\":\n", + " print(sentence)\n", + " case \"high\" | _:\n", + " need_footnote = True\n", + " print(sentence, \"¹\", sep=\"\")\n", + " if need_footnote:\n", + " print(\"\\n¹ Warning: the sentence was not generated using the retrieved documents.\")\n", + " print(\"\\nOriginal response in JSON format:\")\n", + " print(json.dumps(responses, indent=2))\n", + "except json.JSONDecodeError:\n", + " print(\"\\nOriginal response which was unable to be parsed as JSON:\")\n", + " print(answer)" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}
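
Both the CI composite action above and the Ollama notebook assume a local Ollama server is already listening on http://localhost:11434 before `ollama pull` or `ollama create` runs. The following Python sketch mirrors the action's `sleep 10; time curl -i http://localhost:11434` wait step programmatically; it is an illustrative snippet rather than part of the diff, and the `wait_for_ollama` helper name, timeout, and polling interval are assumptions to adjust for your environment.

```python
# Minimal sketch (assumption: Ollama serves on its default address) to wait for
# a local Ollama server before running recipes/Intrinsics/Granite_RAG_LoRA.ipynb.
import time
import urllib.error
import urllib.request

OLLAMA_URL = "http://localhost:11434"  # default Ollama listen address used by the action and notebook

def wait_for_ollama(url: str = OLLAMA_URL, timeout_s: float = 60.0, interval_s: float = 2.0) -> bool:
    """Poll the Ollama root endpoint until it responds or the timeout elapses."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as response:
                if response.status == 200:  # the root URL answers once the server is up
                    return True
        except (urllib.error.URLError, OSError):
            pass  # server not accepting connections yet; retry after a short pause
        time.sleep(interval_s)
    return False

if __name__ == "__main__":
    if not wait_for_ollama():
        raise SystemExit("Ollama server is not reachable; start it with `ollama serve`.")
    print("Ollama server is up.")
```

Running a check like this (or an equivalent shell loop) before the first `ollama pull` cell avoids failures when the server is still starting, for example right after the CI action installs Ollama.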