diff --git a/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py
index 69eca7add1..16ffe45642 100644
--- a/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py
+++ b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py
@@ -144,7 +144,7 @@ def _get_dataloader():
         input_data = self.optData
         return [
             (
-                0,
+                BATCH_SIZE_DEFAULT,
                 (
                     input_data,
                     {"show_progress_bar": False, "batch_size": self.getBatchSize()},
diff --git a/docs/Explore Algorithms/AI Services/Quickstart - Distributed Question - Answering with LLM on GPU.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Distributed Question - Answering with LLM on GPU.ipynb
new file mode 100644
index 0000000000..bb9575d250
--- /dev/null
+++ b/docs/Explore Algorithms/AI Services/Quickstart - Distributed Question - Answering with LLM on GPU.ipynb
@@ -0,0 +1,812 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "6b31dee8-67e3-4bb7-a501-269c69c80d3f",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "# A Guide to Q&A using Retrieval-Augmented Generation (RAG) with distributed local LLM embedding and generation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "b4000620-9ea1-45aa-be4f-ddb971cc708e",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "## Introduction\n",
+    "In this notebook, we demonstrate how to build a context-aware question-answering framework with distributed local LLM embedding and answer generation, based on the Hugging Face models [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) and [NV-Embed-v1](https://huggingface.co/bzantium/NV-Embed-v1). This notebook extends the document Question and Answering demo to use only local models, for scalability and acceleration. The question-answering context is based on NASA's Earth and Earth at Night e-books.\n",
+    "\n",
+    "We'll cover the following key stages:\n",
+    "\n",
+    "1. Load PDF documents using the PyMuPDF library.\n",
+    "2. Use SynapseML to split the documents into chunks.\n",
+    "3. Generate chunk and user-question embeddings with the NV-Embed-v1 embedder.\n",
+    "4. Use NVIDIA RAPIDS KNN to find the chunks related to the user question and assemble the context for the LLM answer.\n",
+    "5. Answer the user question with Microsoft's Phi-3 LLM and the TensorRT GPU accelerator, using the retrieved context.\n",
+    "\n",
+    "The demo was tested on an NVIDIA A100-based Azure Databricks cluster with two Standard_NC24ads_A100_v4 workers, running the 13.3 LTS ML Databricks Runtime (includes Apache Spark 3.4.1, GPU, Scala 2.12).\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "db0faebe-2cca-4bd8-ae28-645e69a21bb7",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "### Step 1: Define the notebook environment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "530e6ef4-b620-443e-a051-4164aedc43cd",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import fitz\n",
+    "import pyspark.sql.functions as F\n",
+    "from pyspark.sql.types import ArrayType, FloatType, StringType\n",
+    "from pyspark.sql.functions import (\n",
+    "    explode,\n",
+    "    col,\n",
+    "    monotonically_increasing_id,\n",
+    "    concat_ws,\n",
+    "    collect_list,\n",
+    "    udf,\n",
+    ")\n",
+    "from pyspark.ml.functions import predict_batch_udf\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "from synapse.ml.featurize.text import PageSplitter\n",
+    "from spark_rapids_ml.knn import (\n",
+    "    ApproximateNearestNeighbors,\n",
+    "    ApproximateNearestNeighborsModel,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "97f056e7-9f88-45b9-b6b2-95be8c7fccac",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "### Step 2: Load the documents into a Spark DataFrame."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "eb6519d4-f03a-4359-8a6f-4922bfeedbf5",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "For this tutorial, we will be using NASA's [Earth](https://www.nasa.gov/sites/default/files/atoms/files/earth_book_2019_tagged.pdf) and [Earth at Night](https://www.nasa.gov/sites/default/files/atoms/files/earth_at_night_508.pdf) e-books. To load PDF documents into a Spark DataFrame, you can use the ```spark.read.format(\"binaryFile\")``` method provided by Apache Spark."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "fb39f605-39f8-46d1-a9d3-28b854586852",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "document_path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/NASAEarth\"  # path to your document\n",
+    "df = spark.read.format(\"binaryFile\").load(document_path).cache()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "34e06daf-e9e7-4144-b956-e57bde8fab77",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "### Step 3: Read the document content and convert it from PDF to text using the PyMuPDF library."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "304ed77d-a032-4620-a74d-65a277caeaf7",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "We use the PyMuPDF library (fitz) for the PDF-to-text conversion."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "04b58ec9-8a8e-4575-9df9-c8e84c6c4a64",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Define the function to extract text from binary PDF data\n",
+    "def extract_text_from_binary_pdf(binary_content):\n",
+    "    try:\n",
+    "        # Create a PyMuPDF document from the binary data\n",
+    "        doc = fitz.open(stream=binary_content, filetype=\"pdf\")\n",
+    "        text = \"\"\n",
+    "        for page in doc:\n",
+    "            text += page.get_text()\n",
+    "        return text\n",
+    "    except Exception as e:\n",
+    "        return str(e)\n",
+    "\n",
+    "\n",
+    "# Register the function as a UDF\n",
+    "extract_text_udf = udf(extract_text_from_binary_pdf, StringType())\n",
+    "\n",
+    "\n",
+    "# Apply the UDF to extract text from the binary content\n",
+    "analyzed_df = df.withColumn(\"output_content\", extract_text_udf(df[\"content\"]))"
+   ]
+  },
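+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optionally, you can preview the extracted text before chunking to confirm the conversion worked. This minimal check uses only the DataFrame created above and the standard PySpark `substring` function.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check: show the first 300 characters extracted from each PDF\n",
+    "analyzed_df.select(\n",
+    "    \"path\", F.substring(\"output_content\", 1, 300).alias(\"preview\")\n",
+    ").show(truncate=80)"
+   ]
+  },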
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "d26e4217-ac87-4583-9500-af65d969c199",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "We can split the Spark DataFrame ```analyzed_df``` into chunks of 3000-4000 characters, keeping the analyzed book context small, using the following code."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "1b471060-8175-492e-bbb3-5b3529480b33",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "ps = (\n",
+    "    PageSplitter()\n",
+    "    .setInputCol(\"output_content\")\n",
+    "    .setMaximumPageLength(4000)\n",
+    "    .setMinimumPageLength(3000)\n",
+    "    .setOutputCol(\"chunks\")\n",
+    ")\n",
+    "\n",
+    "splitted_df = ps.transform(analyzed_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "d51caf1d-322e-480b-8391-d266aed6401e",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Each row contains all chunks of one document as a vector.\n",
+    "# Explode will distribute and replicate the content of the vector across multiple rows.\n",
+    "# Also add an id column.\n",
+    "\n",
+    "exploded_df = (\n",
+    "    splitted_df.select(\"path\", explode(col(\"chunks\")).alias(\"chunk\"))\n",
+    "    .select(\"path\", \"chunk\")\n",
+    "    .withColumn(\"id\", monotonically_increasing_id())\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "1e5b0f56-0a64-4e4a-86f2-b647e82b41ce",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "### Step 4: Generate Embeddings."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "ebba439c-9503-46d7-bafb-f7fa790974a8",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "To produce embeddings for each chunk, we use the NVIDIA NV-Embed-v1 embedder from Hugging Face."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "1f41cd67-1a27-4e69-959a-e5002b4fbbaf",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Define a function that creates the encode_udf with a custom query_prefix\n",
+    "def create_encode_udf(query_prefix):\n",
+    "    # Load the model lazily inside the function, once per executor\n",
+    "    def encode_text_batch():\n",
+    "        model = SentenceTransformer(\"bzantium/NV-Embed-v1\", trust_remote_code=True)\n",
+    "        model.max_seq_length = 4096\n",
+    "        model.tokenizer.padding_side = \"right\"\n",
+    "\n",
+    "        def predict(inputs):\n",
+    "            output = model.encode(\n",
+    "                inputs.tolist(), prompt=query_prefix, normalize_embeddings=True\n",
+    "            )\n",
+    "            return output\n",
+    "\n",
+    "        return predict\n",
+    "\n",
+    "    # Define the predict_batch_udf with the above function\n",
+    "    return predict_batch_udf(\n",
+    "        encode_text_batch, return_type=ArrayType(FloatType()), batch_size=1\n",
+    "    )"
+   ]
+  },
"execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "defe1c52-1637-4b55-aae5-00174057f1e4", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Use it withhout query_prefix in this case\n", + "query_prefix = \"\"\n", + "encode_udf = create_encode_udf(query_prefix)\n", + "\n", + "# Applying the UDF to a DataFrame chunk column\n", + "embeddings = exploded_df.withColumn(\"embeddings\", encode_udf(col(\"chunk\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "183c4c45-03bf-42d0-9c10-24e5fe9842da", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 5: Use chunk embeddings to create KNN search model to find chunks related to user query " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5d725803-3475-4b97-aebc-24ae909eebbc", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "rapids_knn_model = (\n", + " ApproximateNearestNeighbors(k=2)\n", + " .setInputCol(\"embeddings\")\n", + " .setIdCol(\"id\")\n", + " .fit(embeddings)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "17b3890f-4163-443c-929b-252d62a6c736", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 6: Compose a Question." 
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "183c4c45-03bf-42d0-9c10-24e5fe9842da",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "### Step 5: Use the chunk embeddings to build a KNN search model that finds chunks related to a user query"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "5d725803-3475-4b97-aebc-24ae909eebbc",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "rapids_knn_model = (\n",
+    "    ApproximateNearestNeighbors(k=2)\n",
+    "    .setInputCol(\"embeddings\")\n",
+    "    .setIdCol(\"id\")\n",
+    "    .fit(embeddings)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "17b3890f-4163-443c-929b-252d62a6c736",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "### Step 6: Compose a Question."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "8826a0fb-7b41-47a9-8d65-8885dcb1248d",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n",
+    "\n",
+    "task_name_to_instruct = {\n",
+    "    \"example\": \"Given a question, retrieve passages from the provided context that answer the question\",\n",
+    "}\n",
+    "\n",
+    "query_prefix = \"Instruct: \" + task_name_to_instruct[\"example\"] + \"\\nQuery: \"\n",
+    "\n",
+    "encode_udf = create_encode_udf(query_prefix)\n",
+    "\n",
+    "user_question = \"What did the astronaut Edgar Mitchell call Earth?\"\n",
+    "# Define the schema explicitly\n",
+    "schema = StructType(\n",
+    "    [StructField(\"id\", IntegerType(), True), StructField(\"query\", StringType(), True)]\n",
+    ")\n",
+    "\n",
+    "# Create a DataFrame with id = 1 and the user query\n",
+    "temp_df = spark.createDataFrame([(1, user_question)], schema).cache()\n",
+    "\n",
+    "# Apply the UDF to generate the embeddings\n",
+    "query_embeddings = temp_df.withColumn(\"embeddings\", encode_udf(col(\"query\")))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "45f79485-be0f-4b89-9c11-79f9102436e7",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "### Step 7: Find the chunks whose context is closest to the question using embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "45f7b558-4c32-4e08-807b-9e568dcde8df",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "(_, _, knn_df) = rapids_knn_model.kneighbors(\n",
+    "    query_embeddings.select(\"id\", \"embeddings\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "da023b2f-d7d7-4937-8139-6ec999a77cc6",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Join the chunk text back onto the KNN results\n",
+    "result_df = (\n",
+    "    knn_df.withColumn(\n",
+    "        \"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\")))\n",
+    "    )\n",
+    "    .select(\n",
+    "        F.col(\"query_id\"),\n",
+    "        F.col(\"zipped.indices\").alias(\"id\"),\n",
+    "        F.col(\"zipped.distances\").alias(\"distance\"),\n",
+    "    )\n",
+    "    .join(embeddings, on=\"id\", how=\"inner\")\n",
+    "    .select(\"query_id\", \"id\", \"chunk\", \"distance\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "0180ef1c-3d59-4922-b918-80eaf7badd9d",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Concatenate the text of all question-related chunks into a single context string\n",
+    "concatenated_text = result_df.agg(\n",
+    "    concat_ws(\" \", collect_list(\"chunk\")).alias(\"concatenated_text\")\n",
+    ").collect()[0][\"concatenated_text\"]"
+   ]
+  },
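+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optionally, preview the retrieved context before generation; the concatenated chunks are exactly what the LLM will see. This check uses only the `concatenated_text` string built above.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 0,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: preview the context that will be passed to the LLM\n",
+    "print(f\"Context length: {len(concatenated_text)} characters\")\n",
+    "print(concatenated_text[:500])"
+   ]
+  },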
"metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "79356cff-a236-4ef3-91f7-a601ee38d5f9", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 8: Respond to a User’s Question using microsoft/Phi-3-mini-4k-instruct LLM from Hugging Face" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0b8b0fda-bca7-4cd1-ae0f-8438ca2cbf3b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from tensorrt_llm import LLM, SamplingParams, BuildConfig\n", + "\n", + "# Put model in global if we want to reuse it\n", + "global llm\n", + "\n", + "if \"llm\" in globals() and llm is not None:\n", + " print(\"Model is already loaded.\")\n", + "else:\n", + " print(\"Model is not loaded.\")\n", + "\n", + " # Extend model input sizes\n", + " build_config = BuildConfig()\n", + " build_config.plugin_config.context_fmha = True\n", + " build_config.max_input_len = 5120\n", + " build_config.max_seq_len = 5632\n", + "\n", + " llm = LLM(model=\"microsoft/Phi-3-mini-4k-instruct\", build_config=build_config)\n", + "\n", + "sampling_params = SamplingParams(temperature=0.8, top_p=0.95)\n", + "\n", + "context = concatenated_text\n", + "query = \"What did the astronaut Edgar Mitchell call Earth?\"\n", + "\n", + "prompt = f\"\"\"\n", + "context: {context}\n", + "Answer the question based only on the context above. Without multiple choices. If the\n", + "information to answer the question is not present in the given context then reply \"I don't know\".\n", + "My Question: {query}\n", + "What is your Answer? \"\"\"\n", + "\n", + "outputs = llm.generate(prompt, sampling_params)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2c65275c-17dc-4a30-83eb-ee5b6695a540", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 9: Print LLM results" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b7fb3394-289f-4949-835e-3520323a770d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "output_text = outputs.outputs[0].text\n", + "\n", + "# Split the text by '\\n'\n", + "split_text = output_text.split(\"\\n\")\n", + "\n", + "for item in split_text:\n", + " if len(item) > 10:\n", + " # Split the item at the colon and take the part after it\n", + " result = item.split(\":\", 1)[-1].strip()\n", + " print(\"Answer: \" + result)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "412d83cc-4fe9-455e-ad3d-7780ed262dac", + "showTitle": false, + "title": "" + } + }, + "source": [ + "We can now wrap up the Q&A journey by asking a question and checking the answer. You will see that Edgar Mitchell called Earth \"a sparkling blue and white jewel\"!" 
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "412d83cc-4fe9-455e-ad3d-7780ed262dac",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "We can now wrap up the Q&A journey by asking a question and checking the answer. You will see that Edgar Mitchell called Earth \"a sparkling blue and white jewel\"!"
+   ]
+  }
+ ],
+ "metadata": {
+  "application/vnd.databricks.v1+notebook": {
+   "dashboards": [],
+   "environmentMetadata": {
+    "base_environment": "",
+    "client": "1"
+   },
+   "language": "python",
+   "notebookMetadata": {
+    "pythonIndentUnit": 4
+   },
+   "notebookName": "QuickStart - Distributed Question - Answering with LLM on GPU",
+   "widgets": {}
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb
index e979110a30..1d499027d5 100644
--- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb
+++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb
@@ -4,10 +4,7 @@
    "cell_type": "markdown",
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
+     "cellMetadata": {},
      "inputWidgets": {},
      "nuid": "6166efcb-b7f8-424b-8015-cb646a764271",
      "showTitle": false,
@@ -15,9 +12,10 @@
     }
    },
    "source": [
-    "# Embedding Text with local (per node) NVIDIA TensorRT accelerator and GPU based Aproximate Nearest Neighbor (ANN)\n",
+    "# Embedding with local (per node) NVIDIA TensorRT accelerator and GPU based Approximate Nearest Neighbor (ANN)\n",
     "\n",
-    "The demo extending existing [Azure OpenAI based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding%20and%20GPU%20based%20KNN.ipynb) when encoding is processed by OpenAI requests and KNN was using GPU based brute force search. This tutorial shows how to perform fast local embeddings using [multilingual E5 text embeddings](https://arxiv.org/abs/2402.05672) and fast aproximate Nearest Neighbor search using IVFFlat alcorithm. All tutorial stages accelerated by NVIDIA GPU using [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) and [Spark Rapids ML](https://github.com/NVIDIA/spark-rapids-ml). The tutorial folder contains two benchmark notebooks to demonstrate advantages of the presented GPU based approach compare to [previos CPU based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding.ipynb)\n",
+    "This demo extends the existing [Azure OpenAI based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding%20and%20GPU%20based%20KNN.ipynb). Encoding is now processed by local embedders from Hugging Face, and KNN uses a GPU-accelerated approximate method based on the IVFFlat algorithm.\n",
+    " All tutorial stages are accelerated by NVIDIA GPU using [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) and [Spark Rapids ML](https://github.com/NVIDIA/spark-rapids-ml). The tutorial folder contains two benchmark notebooks that demonstrate the advantages of the presented GPU based approach compared to the [previous CPU based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding.ipynb)\n",
     "\n",
     "The key prerequisites for this quickstart include a working Azure OpenAI resource, and an Apache Spark cluster with SynapseML installed. We suggest creating a Synapse workspace, but currently the notebook was running on Databricks GPU based cluster using Standard_NC24ads_A100_v4 with 6 workers. Databricks Runtime was 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) with related [init_script](https://github.com/microsoft/SynapseML/tree/master/tools/init_scripts) to install all required packages.\n"
    ]
@@ -26,10 +24,7 @@
    "cell_type": "markdown",
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
+     "cellMetadata": {},
      "inputWidgets": {},
      "nuid": "0444a03d-a701-4f59-b1a1-c4addb797d07",
      "showTitle": false,
@@ -77,10 +72,7 @@
    "cell_type": "markdown",
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
+     "cellMetadata": {},
      "inputWidgets": {},
      "nuid": "42117315-a245-491a-b330-f8257d6fb35c",
      "showTitle": false,
@@ -162,10 +154,7 @@
    "cell_type": "markdown",
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
+     "cellMetadata": {},
      "inputWidgets": {},
      "nuid": "0c69ee56-172f-413b-a335-d15482fda55e",
      "showTitle": false,
@@ -211,10 +200,7 @@
    "cell_type": "markdown",
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
+     "cellMetadata": {},
      "inputWidgets": {},
      "nuid": "6885033f-6eea-4338-a632-2837582d91a1",
      "showTitle": false,
@@ -266,10 +252,7 @@
    "cell_type": "markdown",
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
+     "cellMetadata": {},
      "inputWidgets": {},
      "nuid": "0154ce06-5875-4236-8178-030d45091445",
      "showTitle": false,
@@ -345,10 +328,7 @@
    "cell_type": "markdown",
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
+     "cellMetadata": {},
      "inputWidgets": {},
      "nuid": "521c9c8e-6422-49c7-95f3-6bca44a90cbb",
      "showTitle": false,
@@ -392,10 +372,7 @@
    "cell_type": "markdown",
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
+     "cellMetadata": {},
      "inputWidgets": {},
      "nuid": "9f30473c-ff6e-438a-bbce-11f1b0080a48",
      "showTitle": false,
@@ -451,10 +428,7 @@
    "cell_type": "markdown",
    "metadata": {
     "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
+     "cellMetadata": {},
      "inputWidgets": {},
      "nuid": "7b4c5a10-efd1-4d2d-b141-33e486943862",
      "showTitle": false,
@@ -466,7 +440,7 @@
     "\n",
     "The goal of this demo is to showcase two acceleration techniques: local (per node) embedding generation and approximate KNN. Compared to the original method, which relies on HTTP requests to the OpenAI model and CPU-based KNN. The new approach is significantly more scalable and provides substantial acceleration, especially for large input datasets.\n",
     "\n",
-    "This is the comparison dureation results on 10 T4 GPU nodes for both approaches:\n",
+    "These are the duration comparison results on 10 T4 GPU nodes for both approaches:\n",
     "\n",
     "![KNN Comparison](https://mmlspark.blob.core.windows.net/graphics/Documentation/knn_comparison.png)\n",
     "\n",
diff --git a/pipeline.yaml b/pipeline.yaml
index 22c581587e..d34bce6a8f 100644
--- a/pipeline.yaml
+++ b/pipeline.yaml
@@ -494,14 +494,6 @@ jobs:
     condition: succeededOrFailed()
   - template: templates/codecov.yml
 
-- job: BuildAndCacheCondaEnv
-  cancelTimeoutInMinutes: 0
-  condition: eq(variables.runTests, 'True')
-  pool:
-    vmImage: ubuntu-20.04
-  steps:
-  - template: templates/conda.yml
-  - bash: df -H
 
 - job: WebsiteSamplesTests
   cancelTimeoutInMinutes: 0
diff --git a/tools/init_scripts/init-rapidsml-cuda-11.8.sh b/tools/init_scripts/init-rapidsml-cuda-11.8.sh
index bcb8fdc93e..f8dd710ce1 100644
--- a/tools/init_scripts/init-rapidsml-cuda-11.8.sh
+++ b/tools/init_scripts/init-rapidsml-cuda-11.8.sh
@@ -16,7 +16,7 @@
 # IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10
 # also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0)
 # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2)
-RAPIDS_VERSION=24.4.0
+RAPIDS_VERSION=24.6.0
 SPARK_RAPIDS_VERSION=23.10.0
 SPARK_RAPIDSML_VERSION=24.6.0
 
@@ -46,3 +46,16 @@ ln -s /usr/local/cuda-11.8 /usr/local/cuda
 
 # install spark-rapids-ml
 /databricks/python/bin/pip install spark-rapids-ml~=${SPARK_RAPIDSML_VERSION}
+
+# install TRT-LLM
+/databricks/python/bin/pip install --upgrade cython
+/databricks/python/bin/pip install --pre --no-build-isolation --extra-index-url https://pypi.nvidia.com mpi4py
+# /databricks/python/bin/pip install --pre --extra-index-url https://pypi.nvidia.com tensorrt-llm==0.12.0.dev2024073000
+/databricks/python/bin/pip install --pre --extra-index-url https://pypi.nvidia.com tensorrt-llm
+
+# Required by NV-Embed
+/databricks/python/bin/pip install --upgrade sentence-transformers
+/databricks/python/bin/pip install transformers
+
+# To work with PDF
+/databricks/python/bin/pip install PyMuPDF
diff --git a/website/sidebars.js b/website/sidebars.js
index 5ef56a0f78..fec9339954 100644
--- a/website/sidebars.js
+++ b/website/sidebars.js
@@ -40,6 +40,7 @@ module.exports = {
         "Explore Algorithms/AI Services/Quickstart - Create a Visual Search Engine",
         "Explore Algorithms/AI Services/Quickstart - Create Audiobooks",
         "Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs",
+        "Explore Algorithms/AI Services/Quickstart - Distributed Question - Answering with LLM on GPU",
         "Explore Algorithms/AI Services/Quickstart - Flooding Risk",
         "Explore Algorithms/AI Services/Quickstart - Predictive Maintenance",
       ],
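As a quick, illustrative smoke test for the new init-script dependencies (a sketch, not part of this change set), a notebook cell along these lines can confirm the installed packages import cleanly on a cluster node; the module names map to the pip installs above, with PyMuPDF importing as fitz:

    # Hypothetical import check for the packages installed by init-rapidsml-cuda-11.8.sh
    import importlib

    for module in ["fitz", "sentence_transformers", "tensorrt_llm", "spark_rapids_ml"]:
        importlib.import_module(module)
        print(f"{module}: OK")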