From 81a36bd2457ccca5a526071c1cc7d06cafa7f807 Mon Sep 17 00:00:00 2001
From: Mark Hamilton
Date: Thu, 20 Jul 2023 16:20:52 +0100
Subject: [PATCH 01/23] docs: continue fixing broken links (#2026)

---
 .../Advanced Usage - Async, Batching, and Multi-Key.ipynb  | 2 +-
 .../Anomaly Detection/Quickstart - Isolation Forests.ipynb | 2 +-
 .../Hyperparameter Tuning/HyperOpt.ipynb                   | 2 +-
 .../OpenAI/Quickstart - Understand and Search Forms.ipynb  | 7 ++++---
 website/src/pages/index.js                                 | 2 +-
 5 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb b/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb
index 0fb643556c9..a49fc5841b5 100644
--- a/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb
+++ b/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb
@@ -393,7 +393,7 @@
     "cell_type": "markdown",
     "source": [
       "## Learn More\n",
-      "- [Explore other cognitive services](https://microsoft.github.io/SynapseML/docs/features/cognitive_services/CognitiveServices%20-%20Overview/)\n",
+      "- [Explore other cognitive services](https://microsoft.github.io/SynapseML/docs/Explore%20Algorithms/CognitiveServices%20-%20Overview/)\n",
       "- [Read our paper \"Large-Scale Intelligent Microservices\"](https://arxiv.org/abs/2009.08044)"
     ],
     "metadata": {
diff --git a/docs/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.ipynb b/docs/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.ipynb
index 580fe839725..f2a359a224a 100644
--- a/docs/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.ipynb
+++ b/docs/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.ipynb
@@ -24,7 +24,7 @@
    "metadata": {},
    "source": [
     "## Prerequisites\n",
-    " - If you are running it on Synapse, you'll need to [create an AML workspace and set up linked Service](../../../Use%20with%20MLFLow/Overview/).\n"
+    " - If you are running it on Synapse, you'll need to [create an AML workspace and set up linked Service](../../../Use%20with%20MLFlow/Overview/).\n"
    ]
   },
  {
diff --git a/docs/Explore Algorithms/Hyperparameter Tuning/HyperOpt.ipynb b/docs/Explore Algorithms/Hyperparameter Tuning/HyperOpt.ipynb
index 56fb0433d57..808f3c14884 100644
--- a/docs/Explore Algorithms/Hyperparameter Tuning/HyperOpt.ipynb
+++ b/docs/Explore Algorithms/Hyperparameter Tuning/HyperOpt.ipynb
@@ -19,7 +19,7 @@
 "* Running distributed training with SynapseML without hyperparameter tuning.\n",
 "* Using Hyperopt to tune hyperparameters in the distributed training workflow.\n",
 "## Prerequisites\n",
-" - If you are running it on Synapse, you'll need to [create an AML workspace and set up linked Service](../../../Use%20with%20MLFLow/Overview/).\n",
+" - If you are running it on Synapse, you'll need to [create an AML workspace and set up linked Service](../../../Use%20with%20MLFlow/Overview/).\n",
 "\n",
 "## Requirements\n",
 " - Install HyperOpt"
diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.ipynb
index e397a382b86..d5e91d28258 100644
--- a/docs/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.ipynb
+++ b/docs/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.ipynb
@@ -210,7 +210,7 @@
    "source": [
     "## 3 - Apply form recognition\n",
     "\n",
-    "This code loads the [AnalyzeInvoices 
transformer](https://microsoft.github.io/SynapseML/docs/documentation/transformers/transformers_cognitive/#analyzeinvoices) and passes a reference to the data frame containing the invoices. It calls the pre-built invoice model of Azure Forms Analyzer." + "This code loads the AnalyzeInvoices transformer and passes a reference to the data frame containing the invoices. It calls the pre-built invoice model of Azure Forms Analyzer." ] }, { @@ -226,7 +226,8 @@ "nuid": "c38db874-a1a5-49ae-913e-d55e3593c794", "showTitle": false, "title": "" - } + }, + "is_executing": true }, "outputs": [], "source": [ @@ -368,7 +369,7 @@ "source": [ "## 5 - Add translations\n", "\n", - "This code loads [Translate](https://microsoft.github.io/SynapseML/docs/documentation/transformers/transformers_cognitive/#translate), a transformer that calls the Azure Translator service in Cognitive Services. The original text, which is in English in the \"Description\" column, is machine-translated into various languages. All of the output is consolidated into \"output.translations\" array." + "This code loads Translate, a transformer that calls the Azure Translator service in Cognitive Services. The original text, which is in English in the \"Description\" column, is machine-translated into various languages. All of the output is consolidated into \"output.translations\" array." ] }, { diff --git a/website/src/pages/index.js b/website/src/pages/index.js index c5175f24158..4159858fc06 100644 --- a/website/src/pages/index.js +++ b/website/src/pages/index.js @@ -458,7 +458,7 @@ dotnet add package SynapseML.Vw --version 0.11.2`} lang="bash" > For detailed installation, please refer this{" "} - instruction. + instruction. From df2712afeb211003bd13e622239c9ae09c15615a Mon Sep 17 00:00:00 2001 From: Mark Hamilton Date: Thu, 20 Jul 2023 17:25:27 +0100 Subject: [PATCH 02/23] docs: fix broken links (#2027) * docs: fix broken links * Update docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb --- ...age - Async, Batching, and Multi-Key.ipynb | 318 +++++++++--------- 1 file changed, 159 insertions(+), 159 deletions(-) diff --git a/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb b/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb index a49fc5841b5..7b64006026a 100644 --- a/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb +++ b/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb @@ -2,88 +2,99 @@ "cells": [ { "cell_type": "markdown", - "source": [ - "# Cognitive Services Advanced Guide: Asynchrony, Batching, Multi-Key" - ], "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "1a39046d-a692-44c3-b673-78dfc1f97e08", "inputWidgets": {}, + "nuid": "1a39046d-a692-44c3-b673-78dfc1f97e08", + "showTitle": false, "title": "" } - } + }, + "source": [ + "# Cognitive Services Advanced Guide: Asynchrony, Batching, Multi-Key" + ] }, { "cell_type": "markdown", - "source": [ - "## Step 1: Imports and Keys" - ], "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "0f8d5274-46d5-4604-be41-1f3f5d481d9a", "inputWidgets": {}, + "nuid": "0f8d5274-46d5-4604-be41-1f3f5d481d9a", + "showTitle": false, "title": "" } - } + }, + "source": [ + "## Step 1: Imports and Keys" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "84829dd7-0e7d-4ee3-aa9e-c3aa6ef96c8d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], "source": [ "from synapse.ml.core.platform import find_secret\n", "\n", "service_key = find_secret(\"cognitive-api-key\")\n", "service_loc = \"eastus\"" - ], + ] + }, + { + "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "84829dd7-0e7d-4ee3-aa9e-c3aa6ef96c8d", "inputWidgets": {}, + "nuid": "8a49dfe2-f00d-4db5-95d5-f119fc09e2ee", + "showTitle": false, "title": "" } }, - "outputs": [], - "execution_count": 0 + "source": [ + "## Step 2: Basic Usage" + ] }, { "cell_type": "markdown", - "source": [ - "## Step 2: Basic Usage" - ], "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "8a49dfe2-f00d-4db5-95d5-f119fc09e2ee", "inputWidgets": {}, + "nuid": "93d1a1d0-96b5-48a2-9248-0d9facdae679", + "showTitle": false, "title": "" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "Image 1 | Image 2 | Image 3 \n", ":-------------------------:|:-------------------------:|:----------------------:|\n", "! | | " - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "93d1a1d0-96b5-48a2-9248-0d9facdae679", "inputWidgets": {}, + "nuid": "9e1933f3-06b3-4dfd-a6a2-30d33d7da845", + "showTitle": false, "title": "" } - } - }, - { - "cell_type": "code", + }, + "outputs": [], "source": [ "from synapse.ml.cognitive.vision import AnalyzeImage\n", "\n", @@ -108,85 +119,83 @@ ")\n", "\n", "image_results = analyzer.transform(image_df).cache()" - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "showTitle": false, - "cellMetadata": {}, - "nuid": "9e1933f3-06b3-4dfd-a6a2-30d33d7da845", - "inputWidgets": {}, - "title": "" - } - }, - "outputs": [], - "execution_count": 0 + ] }, { "cell_type": "markdown", - "source": [ - "#### First we'll look at the full response objects:" - ], "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "8f759bfa-4b88-4659-a535-d768ddee9e4f", "inputWidgets": {}, + "nuid": "8f759bfa-4b88-4659-a535-d768ddee9e4f", + "showTitle": false, "title": "" } - } + }, + "source": [ + "#### First we'll look at the full response objects:" + ] }, { "cell_type": "code", - "source": [ - "display(image_results)" - ], + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "9fae2ca5-f16a-460b-94ec-e433f24f7fb4", "inputWidgets": {}, + "nuid": "9fae2ca5-f16a-460b-94ec-e433f24f7fb4", + "showTitle": false, "title": "" } }, "outputs": [], - "execution_count": 0 + "source": [ + "display(image_results)" + ] }, { "cell_type": "markdown", - "source": [ - "#### We can select out just what we need:" - ], "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "7b08b439-a505-4de3-a71e-63af30453163", "inputWidgets": {}, + "nuid": "7b08b439-a505-4de3-a71e-63af30453163", + "showTitle": false, "title": "" } - } + }, + "source": [ + "#### We can select out just what we need:" + ] }, { "cell_type": "code", - "source": [ - "display(image_results.select(\"analysis_results.description.captions.text\"))" - ], + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { - 
"showTitle": false, "cellMetadata": {}, - "nuid": "88e738a6-f1bf-4077-8436-984aac858b1b", "inputWidgets": {}, + "nuid": "88e738a6-f1bf-4077-8436-984aac858b1b", + "showTitle": false, "title": "" } }, "outputs": [], - "execution_count": 0 + "source": [ + "display(image_results.select(\"analysis_results.description.captions.text\"))" + ] }, { "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c6c2fd12-8c26-4f96-b0a5-0c55c04c182a", + "showTitle": false, + "title": "" + } + }, "source": [ "#### What's going on under the hood\n", "\n", @@ -194,99 +203,101 @@ "\n", "When we call the cognitive service transformer, we start cognitive service clients on each of your spark workers.\n", "These clients send requests to the cloud, and turn the JSON responses into Spark Struct Types so that you can access any field that the service returns." - ], + ] + }, + { + "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "c6c2fd12-8c26-4f96-b0a5-0c55c04c182a", "inputWidgets": {}, + "nuid": "31618622-57db-4973-8ab8-1bab6d7efd2e", + "showTitle": false, "title": "" } - } + }, + "source": [ + "## Step 3: Asynchronous Usage" + ] }, { "cell_type": "markdown", - "source": [ - "## Step 3: Asynchronous Usage" - ], "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "31618622-57db-4973-8ab8-1bab6d7efd2e", "inputWidgets": {}, + "nuid": "8e7e5ace-71c2-4170-8b5d-350297b907db", + "showTitle": false, "title": "" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "\n", "\n", "Apache Spark ordinarily parallelizes a computation to all of it's worker threads. When working with services however this parallelism doesent fully maximize throughput because workers sit idle as requests are processed on the server. The `concurrency` parameter makes sure that each worker can stay busy as they wait for requests to complete." 
- ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "showTitle": false, - "cellMetadata": {}, - "nuid": "8e7e5ace-71c2-4170-8b5d-350297b907db", - "inputWidgets": {}, - "title": "" - } - } + ] }, { "cell_type": "code", - "source": [ - "display(analyzer.setConcurrency(3).transform(image_df))" - ], + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "f874a63e-f22e-4c6f-9d54-83f93d140721", "inputWidgets": {}, + "nuid": "f874a63e-f22e-4c6f-9d54-83f93d140721", + "showTitle": false, "title": "" } }, "outputs": [], - "execution_count": 0 + "source": [ + "display(analyzer.setConcurrency(3).transform(image_df))" + ] }, { "cell_type": "markdown", - "source": [ - "#### Faster without extra hardware:\n", - "" - ], "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "f82c9d17-77db-44fa-8d1c-b0b7905c0e31", "inputWidgets": {}, + "nuid": "f82c9d17-77db-44fa-8d1c-b0b7905c0e31", + "showTitle": false, "title": "" } - } + }, + "source": [ + "#### Faster without extra hardware:\n", + "" + ] }, { "cell_type": "markdown", - "source": [ - "## Step 4: Batching" - ], "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "d54b3f5e-8d44-486f-97a3-0b8528934e73", "inputWidgets": {}, + "nuid": "d54b3f5e-8d44-486f-97a3-0b8528934e73", + "showTitle": false, "title": "" } - } + }, + "source": [ + "## Step 4: Batching" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c3092f7b-105b-4171-9649-f04b189d76a0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], "source": [ "from synapse.ml.cognitive.text import TextSentiment\n", "\n", @@ -312,36 +323,36 @@ "\n", "# Show the results of your text query\n", "display(sentiment.transform(text_df).select(\"text\", \"sentiment.document.sentiment\"))" - ], + ] + }, + { + "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "c3092f7b-105b-4171-9649-f04b189d76a0", "inputWidgets": {}, + "nuid": "ee4a9f18-d845-4059-9edd-9bd625a75a1a", + "showTitle": false, "title": "" } }, - "outputs": [], - "execution_count": 0 - }, - { - "cell_type": "markdown", "source": [ "## Step 5: Multi-Key" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "ee4a9f18-d845-4059-9edd-9bd625a75a1a", "inputWidgets": {}, + "nuid": "a6f89d8b-7cd1-42be-8310-62989c80deb2", + "showTitle": false, "title": "" } - } - }, - { - "cell_type": "code", + }, + "outputs": [], "source": [ "from synapse.ml.cognitive.text import TextSentiment\n", "from pyspark.sql.functions import udf\n", @@ -359,71 +370,60 @@ "image_df2 = image_df.withColumn(\"key\", random_key())\n", "\n", "results = analyzer.setSubscriptionKeyCol(\"key\").transform(image_df2)" - ], - "metadata": { - "application/vnd.databricks.v1+cell": { - "showTitle": false, - "cellMetadata": {}, - "nuid": "a6f89d8b-7cd1-42be-8310-62989c80deb2", - "inputWidgets": {}, - "title": "" - } - }, - "outputs": [], - "execution_count": 0 + ] }, { "cell_type": "code", - "source": [ - "display(results.select(\"key\", \"analysis_results.description.captions.text\"))" - ], + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, 
"cellMetadata": {}, - "nuid": "c2f0ff6f-688e-4ca0-88eb-9eb8bda66786", "inputWidgets": {}, + "nuid": "c2f0ff6f-688e-4ca0-88eb-9eb8bda66786", + "showTitle": false, "title": "" } }, "outputs": [], - "execution_count": 0 + "source": [ + "display(results.select(\"key\", \"analysis_results.description.captions.text\"))" + ] }, { "cell_type": "markdown", - "source": [ - "## Learn More\n", - "- [Explore other cogntive services](https://microsoft.github.io/SynapseML/docs/Explore%20Algorithms/CognitiveServices%20-%20Overview/)\n", - "- [Read our paper \"Large-Scale Intelligent Microservices\"](https://arxiv.org/abs/2009.08044)" - ], "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "1ed7401d-28f7-4133-93e3-08e145772502", "inputWidgets": {}, + "nuid": "1ed7401d-28f7-4133-93e3-08e145772502", + "showTitle": false, "title": "" } - } + }, + "source": [ + "## Learn More\n", + "- [Explore other cogntive services](./Overview)\n", + "- [Read our paper \"Large-Scale Intelligent Microservices\"](https://arxiv.org/abs/2009.08044)" + ] } ], "metadata": { "application/vnd.databricks.v1+notebook": { - "notebookName": "CognitiveServices - Advanced Usage: Async, Batching, and Multi-Key", "dashboards": [], + "language": "python", "notebookMetadata": { "pythonIndentUnit": 2 }, - "language": "python", - "widgets": {}, - "notebookOrigID": 3743502060540796 + "notebookName": "CognitiveServices - Advanced Usage: Async, Batching, and Multi-Key", + "notebookOrigID": 3743502060540796, + "widgets": {} }, "kernelspec": { - "name": "python3", + "display_name": "Python 3 (ipykernel)", "language": "python", - "display_name": "Python 3 (ipykernel)" + "name": "python3" } }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From d9b5d03e60f4e40323cace5a947732f43fa4454c Mon Sep 17 00:00:00 2001 From: Mark Hamilton Date: Mon, 24 Jul 2023 14:41:03 +0100 Subject: [PATCH 03/23] docs: fix broken link (#2032) --- .../Advanced Usage - Async, Batching, and Multi-Key.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb b/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb index 7b64006026a..f0a7b158fae 100644 --- a/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb +++ b/docs/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.ipynb @@ -402,7 +402,7 @@ }, "source": [ "## Learn More\n", - "- [Explore other cogntive services](./Overview)\n", + "- [Explore other cogntive services](../Overview)\n", "- [Read our paper \"Large-Scale Intelligent Microservices\"](https://arxiv.org/abs/2009.08044)" ] } From 4841ddb70cae5e6b8cf76d77bd2fed9d486b5471 Mon Sep 17 00:00:00 2001 From: aydan-at-microsoft <51974608+aydan-at-microsoft@users.noreply.github.com> Date: Mon, 24 Jul 2023 07:56:16 -0700 Subject: [PATCH 04/23] docs: add QandA notebook. (#2029) * add QandA notebook. Co-authored-by: Kartavya Neema Co-authored-by: Amir Jafari * address comments. 
Co-authored-by: Kartavya Neema
Co-authored-by: Amir Jafari

* rename cog services to ai services

* add test to exclude, wait for synapse fix

---------

Co-authored-by: Kartavya Neema
Co-authored-by: Amir Jafari
Co-authored-by: Mark Hamilton
---
 .../synapse/ml/nbtest/SynapseTests.scala      |    5 +-
 ...ent Question and Answering with PDFs.ipynb | 1016 +++++++++++++++++
 website/sidebars.js                           |    1 +
 3 files changed, 1020 insertions(+), 2 deletions(-)
 create mode 100644 docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb

diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala
index d7916b8eafc..195cd978001 100644
--- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala
+++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseTests.scala
@@ -46,8 +46,9 @@ class SynapseTests extends TestBase {
     .filter(_.getAbsolutePath.endsWith(".py"))
     .filterNot(_.getAbsolutePath.contains("Finetune")) // Excluded by design task 1829306
     .filterNot(_.getAbsolutePath.contains("VWnativeFormat"))
-    .filterNot(_.getAbsolutePath.contains("VowpalWabbitMulticlassclassification")) // Wait for Synpase fix
-    .filterNot(_.getAbsolutePath.contains("Langchain")) // Wait for Synpase fix
+    .filterNot(_.getAbsolutePath.contains("VowpalWabbitMulticlassclassification")) // Wait for Synapse fix
+    .filterNot(_.getAbsolutePath.contains("Langchain")) // Wait for Synapse fix
+    .filterNot(_.getAbsolutePath.contains("DocumentQuestionandAnsweringwithPDFs")) // Wait for Synapse fix
     .filterNot(_.getAbsolutePath.contains("SetupCognitive")) // No code to run
     .filterNot(_.getAbsolutePath.contains("CreateaSparkCluster")) // No code to run
     .sortBy(_.getAbsolutePath)
diff --git a/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb
new file mode 100644
index 00000000000..e8e4a938e23
--- /dev/null
+++ b/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb
@@ -0,0 +1,1016 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "6b31dee8-67e3-4bb7-a501-269c69c80d3f",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "# A Guide to Q&A on PDF Documents"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "b4000620-9ea1-45aa-be4f-ddb971cc708e",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "## Introduction\n",
+    "In this notebook, we'll demonstrate how to develop a context-aware question answering framework for any form of a document using [OpenAI models](https://azure.microsoft.com/en-us/products/ai-services/openai-service), [SynapseML](https://microsoft.github.io/SynapseML/docs/about/) and [Azure AI Services](https://azure.microsoft.com/en-us/products/cognitive-services/). In this notebook, we assume that PDF documents are the source of data; however, the same framework can be easily extended to other document formats too. 
\n", + "\n", + "We’ll cover the following key steps:\n", + "\n", + "1. Preprocessing PDF Documents: Learn how to load the PDF documents into a Spark DataFrame, read the documents using the [Form Recognizer Service](https://azure.microsoft.com/en-us/products/form-recognizer/) in Azure AI Services, and use SynapseML to split the documents into chunks.\n", + "2. Embedding Generation and Storage: Learn how to generate embeddings for the chunks using SynapseML and [Azure OpenAI Services](https://azure.microsoft.com/en-us/products/cognitive-services/openai-service), store the embeddings in a vector store using [Azure Cognitive Search](https://azure.microsoft.com/en-us/products/search), and search the vector store to answer the user’s question.\n", + "3. Question Answering Pipeline: Learn how to retrieve relevant document based on the user’s question and provide the answer using [Langchain](https://python.langchain.com/en/latest/index.html#)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b7c8afaa-8298-4d48-9db0-867b6307963a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "We start by installing the necessary python libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install langchain openai" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "be4f7d31-48e0-4d71-af5c-645883891567", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 1: Provide the keys for Azure AI Services and Azure OpenAI to authenticate the applications." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "87b58b64-49a4-4a78-a915-7c2478c22c7d", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "To authenticate Azure AI Services and Azure OpenAI applications, you need to provide the respective API keys. Here is an example of how you can provide the keys in Python code. `find_secret()` function uses Azure Keyvault to get the API keys, however you can directly paste your own keys there." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "from synapse.ml.core.platform import find_secret\n", + "\n", + "ai_services_key = find_secret(\"cognitive-api-key\")\n", + "ai_services_location = \"eastus\"\n", + "\n", + "# Fill in the following lines with your Azure service information\n", + "aoai_service_name = \"synapseml-openai\"\n", + "aoai_endpoint = f\"https://{aoai_service_name}.openai.azure.com/\"\n", + "aoai_key = find_secret(\"openai-api-key\")\n", + "aoai_deployment_name_embeddings = \"text-embedding-ada-002\"\n", + "aoai_deployment_name_query = \"text-davinci-003\"\n", + "aoai_model_name_query = \"text-davinci-003\"\n", + "\n", + "# Azure Cognitive Search\n", + "cogsearch_name = \"mmlspark-azure-search\"\n", + "cogsearch_index_name = \"exampleindex\"\n", + "cogsearch_api_key = find_secret(\"azure-search-key\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "97f056e7-9f88-45b9-b6b2-95be8c7fccac", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 2: Load the PDF documents into a Spark DataFrame." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "eb6519d4-f03a-4359-8a6f-4922bfeedbf5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "For this tutorial, we will be using NASA's [Earth](https://www.nasa.gov/sites/default/files/atoms/files/earth_book_2019_tagged.pdf) and [Earth at Night](https://www.nasa.gov/sites/default/files/atoms/files/earth_at_night_508.pdf) e-books. To load PDF documents into a Spark DataFrame, you can use the ```spark.read.format(\"binaryFile\")``` method provided by Apache Spark." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import StringType\n", + "\n", + "document_path = \"wasbs://public@synapseaisolutionsa.blob.core.windows.net/NASAEarth\" # path to your document\n", + "df = spark.read.format(\"binaryFile\").load(document_path).limit(10).cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "189a84ca-ac81-4130-9143-75883b2633ba", + "showTitle": false, + "title": "" + } + }, + "source": [ + "This code will read the PDF documents and create a Spark DataFrame named df with the contents of the PDFs. The DataFrame will have a schema that represents the structure of the PDF documents, including their textual content." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "90f708b9-9ef2-4de5-b555-a2aa32fd0cfc", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Let's take a glimpse at the contents of the e-books we are working with. 
Below are some screenshots that showcase the essence of the books; as you can see, they contain information about the Earth.\n",
+    "\n",
+    ""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "8119ea95-aa60-4f81-8189-04009fb4aac0",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "##### Display the raw data from the PDF documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show the dataframe without the content\n",
+    "display(df.drop(\"content\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "34e06daf-e9e7-4144-b956-e57bde8fab77",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "### Step 3: Read the documents using Azure AI Services Form Recognizer."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "304ed77d-a032-4620-a74d-65a277caeaf7",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "We utilize [SynapseML](https://microsoft.github.io/SynapseML/), an ecosystem of tools designed to enhance the distributed computing framework [Apache Spark](https://github.com/apache/spark). SynapseML introduces advanced networking capabilities to the Spark ecosystem and offers user-friendly SparkML transformers for various [Azure AI Services](https://azure.microsoft.com/en-us/products/ai-services).\n",
+    "\n",
+    "Additionally, we employ AnalyzeDocument from Azure AI Services to extract the complete document content and present it in the designated columns called \"output_content\" and \"paragraph.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from synapse.ml.cognitive import AnalyzeDocument\n",
+    "from pyspark.sql.functions import col\n",
+    "\n",
+    "analyzeDocument = (\n",
+    "    AnalyzeDocument()\n",
+    "    .setPrebuiltModelId(\"prebuilt-layout\")\n",
+    "    .setSubscriptionKey(ai_services_key)\n",
+    "    .setLocation(ai_services_location)\n",
+    "    .setImageBytesCol(\"content\")\n",
+    "    .setOutputCol(\"result\")\n",
+    "    .setPages(\n",
+    "        \"1-15\"\n",
+    "    )  # Here we are reading the first 15 pages of the documents for demo purposes\n",
+    ")\n",
+    "\n",
+    "analyzed_df = (\n",
+    "    analyzeDocument.transform(df)\n",
+    "    .withColumn(\"output_content\", col(\"result.analyzeResult.content\"))\n",
+    "    .withColumn(\"paragraphs\", col(\"result.analyzeResult.paragraphs\"))\n",
+    ").cache()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "d26e4217-ac87-4583-9500-af65d969c199",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "We can observe the analyzed Spark DataFrame named ```analyzed_df``` using the following code. Note that we drop the \"content\" column as it is not needed anymore.\n",
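+    "\n",
+    "As an editorial aside (a hedged sketch, not original notebook output), you can also peek at a short text preview and the paragraph count per document:\n",
+    "\n",
+    "```python\n",
+    "from pyspark.sql.functions import col, size, substring\n",
+    "\n",
+    "# Assumes analyzed_df from the cell above\n",
+    "display(\n",
+    "    analyzed_df.select(\n",
+    "        \"path\",\n",
+    "        substring(col(\"output_content\"), 1, 100).alias(\"preview\"),\n",
+    "        size(col(\"paragraphs\")).alias(\"n_paragraphs\"),\n",
+    "    )\n",
+    ")\n",
+    "```"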
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "analyzed_df = analyzed_df.drop(\"content\")\n",
+    "display(analyzed_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "59188b7a-32fa-406d-8562-09ad69400b28",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "### Step 4: Split the documents into chunks."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "d682af37-faa8-4830-acd0-96aa348815d3",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "After analyzing the document, we leverage SynapseML’s PageSplitter to divide the documents into smaller sections, which are subsequently stored in the “chunks” column. This allows for more granular representation and processing of the document content."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from synapse.ml.featurize.text import PageSplitter\n",
+    "\n",
+    "ps = (\n",
+    "    PageSplitter()\n",
+    "    .setInputCol(\"output_content\")\n",
+    "    .setMaximumPageLength(4000)\n",
+    "    .setMinimumPageLength(3000)\n",
+    "    .setOutputCol(\"chunks\")\n",
+    ")\n",
+    "\n",
+    "splitted_df = ps.transform(analyzed_df)\n",
+    "display(splitted_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "ce75e0fc-c036-488f-acba-57a44924d55e",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "Note that the chunks for each document are presented in a single row inside an array. In order to embed all the chunks in the following cells, we need to have each chunk in a separate row. To accomplish that, we first explode these arrays so there is only one chunk in each row, then filter the Spark DataFrame in order to only keep the path to the document and the chunk in a single row."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Each column contains many chunks for the same document as a vector.\n",
+    "# Explode will distribute and replicate the content of a vector across multiple rows\n",
+    "from pyspark.sql.functions import explode, col\n",
+    "\n",
+    "exploded_df = splitted_df.select(\"path\", explode(col(\"chunks\")).alias(\"chunk\")).select(\n",
+    "    \"path\", \"chunk\"\n",
+    ")\n",
+    "display(exploded_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "1e5b0f56-0a64-4e4a-86f2-b647e82b41ce",
+     "showTitle": false,
+     "title": ""
+    },
+    "nteract": {
+     "transient": {
+      "deleting": false
+     }
+    }
+   },
+   "source": [
+    "### Step 5: Generate Embeddings.\n",
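+    "\n",
+    "(Editorial note: an embedding is a fixed-length vector of floats; `text-embedding-ada-002` returns 1536-dimensional vectors, which is why `EMBEDDING_LENGTH = 1536` appears later. A quick, illustrative sanity check once `df_embeddings` exists below:)\n",
+    "\n",
+    "```python\n",
+    "# Illustrative only -- run after the embedding cell below\n",
+    "first_row = df_embeddings.select(\"embeddings\").first()\n",
+    "print(len(first_row.embeddings))  # expected: 1536\n",
+    "```"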
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ebba439c-9503-46d7-bafb-f7fa790974a8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "To produce embeddings for each chunk, we utilize both SynapseML and Azure OpenAI Service. By integrating the Azure OpenAI service with SynapseML, we can leverage the power of the Apache Spark distributed computing framework to process numerous prompts using the OpenAI service. This integration enables the SynapseML embedding client to generate embeddings in a distributed manner, enabling efficient processing of large volumes of data. If you're interested in applying large language models at a distributed scale using Azure OpenAI and Azure Synapse Analytics, you can refer to [this approach](https://microsoft.github.io/SynapseML/docs/features/cognitive_services/CognitiveServices%20-%20OpenAI/). For more detailed information on generating embeddings with Azure OpenAI, you can look [here]( https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/embeddings?tabs=console)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from synapse.ml.cognitive import OpenAIEmbedding\n", + "\n", + "embedding = (\n", + " OpenAIEmbedding()\n", + " .setSubscriptionKey(aoai_key)\n", + " .setDeploymentName(aoai_deployment_name_embeddings)\n", + " .setCustomServiceName(aoai_service_name)\n", + " .setTextCol(\"chunk\")\n", + " .setErrorCol(\"error\")\n", + " .setOutputCol(\"embeddings\")\n", + ")\n", + "\n", + "df_embeddings = embedding.transform(exploded_df)\n", + "\n", + "display(df_embeddings)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e7d8e559-92bb-44bc-aee0-93b2490f38e2", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 6: Store the embeddings in Azure Cognitive Search Vector Store." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6d3aaa47-818c-4eb2-b131-8d316380a0ab", + "showTitle": false, + "title": "" + } + }, + "source": [ + "[Azure Cognitive Search](https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search) offers a user-friendly interface for creating a vector database, as well as storing and retrieving data using vector search. If you're interested in learning more about vector search, you can look [here](https://github.com/Azure/cognitive-search-vector-pr/tree/main).\n", + "\n", + "\n", + "Storing data in the AzureCogSearch vector database involves two main steps:\n", + "\n", + "Creating the Index: The first step is to establish the index or schema of the vector database. This entails defining the structure and properties of the data that will be stored and indexed in the vector database.\n", + "\n", + "Adding Chunked Documents and Embeddings: The second step involves adding the chunked documents, along with their corresponding embeddings, to the vector datastore. 
This allows for efficient storage and retrieval of the data using vector search capabilities.\n", + "\n", + "By following these steps, you can effectively store your chunked documents and their associated embeddings in the AzureCogSearch vector database, enabling seamless retrieval of relevant information through vector search functionality." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary packages\n", + "import requests\n", + "import json\n", + "\n", + "EMBEDDING_LENGTH = (\n", + " 1536 # length of the embedding vector (OpenAI generates embeddings of length 1536)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create Index for Cog Search with fields as id, content, and contentVector\n", + "# Note the datatypes for each field below\n", + "\n", + "url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}?api-version=2023-07-01-Preview\"\n", + "payload = json.dumps(\n", + " {\n", + " \"name\": cogsearch_index_name,\n", + " \"fields\": [\n", + " {\"name\": \"id\", \"type\": \"Edm.String\", \"key\": True, \"filterable\": True},\n", + " {\n", + " \"name\": \"content\",\n", + " \"type\": \"Edm.String\",\n", + " \"searchable\": True,\n", + " \"retrievable\": True,\n", + " },\n", + " {\n", + " \"name\": \"contentVector\",\n", + " \"type\": \"Collection(Edm.Single)\",\n", + " \"searchable\": True,\n", + " \"retrievable\": True,\n", + " \"dimensions\": EMBEDDING_LENGTH,\n", + " \"vectorSearchConfiguration\": \"vectorConfig\",\n", + " },\n", + " ],\n", + " \"vectorSearch\": {\n", + " \"algorithmConfigurations\": [\n", + " {\n", + " \"name\": \"vectorConfig\",\n", + " \"kind\": \"hnsw\",\n", + " }\n", + " ]\n", + " },\n", + " }\n", + ")\n", + "headers = {\"Content-Type\": \"application/json\", \"api-key\": cogsearch_api_key}\n", + "\n", + "response = requests.request(\"PUT\", url, headers=headers, data=payload)\n", + "print(response.status_code)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "07396763-74c3-4299-8976-e15e6d510d47", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "We need to use User Defined Function (UDF) through the udf() method in order to apply functions directly to the DataFrames and SQL databases in Python, without any need to individually register them." 
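+    "\n",
+    "A minimal, illustrative sketch of the decorator pattern used below (a toy example added editorially, not part of the original notebook):\n",
+    "\n",
+    "```python\n",
+    "from pyspark.sql.functions import udf\n",
+    "from pyspark.sql.types import StringType\n",
+    "\n",
+    "# The decorated function can be applied directly to DataFrame columns\n",
+    "@udf(returnType=StringType())\n",
+    "def shout(text):\n",
+    "    return text.upper() if text is not None else None\n",
+    "\n",
+    "display(df.withColumn(\"loud_path\", shout(df[\"path\"])).select(\"loud_path\"))\n",
+    "```"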
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Use Spark's UDF to insert entries to Cognitive Search\n", + "# This allows to run the code in a distributed fashion\n", + "\n", + "# Define a UDF using the @udf decorator\n", + "@udf(returnType=StringType())\n", + "def insertToCogSearch(idx, content, contentVector):\n", + " url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/index?api-version=2023-07-01-Preview\"\n", + "\n", + " payload = json.dumps(\n", + " {\n", + " \"value\": [\n", + " {\n", + " \"id\": str(idx),\n", + " \"content\": content,\n", + " \"contentVector\": contentVector.tolist(),\n", + " \"@search.action\": \"upload\",\n", + " },\n", + " ]\n", + " }\n", + " )\n", + " headers = {\n", + " \"Content-Type\": \"application/json\",\n", + " \"api-key\": cogsearch_api_key,\n", + " }\n", + "\n", + " response = requests.request(\"POST\", url, headers=headers, data=payload)\n", + " # response.text\n", + "\n", + " if response.status_code == 200 or response.status_code == 201:\n", + " return \"Success\"\n", + " else:\n", + " return \"Failure\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "42688e00-98fb-406e-9f19-c89fed3248ef", + "showTitle": false, + "title": "" + } + }, + "source": [ + "In the following, we apply UDF to different columns. Note that UDF also helps to add new columns to the DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Apply the UDF on the different columns\n", + "from pyspark.sql.functions import monotonically_increasing_id\n", + "\n", + "df_embeddings = df_embeddings.withColumn(\n", + " \"idx\", monotonically_increasing_id()\n", + ") ## adding a column with id\n", + "df_embeddings = df_embeddings.withColumn(\n", + " \"errorCogSearch\",\n", + " insertToCogSearch(\n", + " df_embeddings[\"idx\"], df_embeddings[\"chunk\"], df_embeddings[\"embeddings\"]\n", + " ),\n", + ")\n", + "\n", + "# Show the transformed DataFrame\n", + "df_embeddings.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "17b3890f-4163-443c-929b-252d62a6c736", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 7: Ask a Question" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "749a6ec7-d6c9-4945-bc72-2deed94e712b", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "After processing the document, we can proceed to pose a question. We will use [SynapseML](https://microsoft.github.io/SynapseML/docs/features/cognitive_services/CognitiveServices%20-%20OpenAI%20Embedding/) to convert the user's question into an embedding and then utilize cosine similarity to retrieve the top K document chunks that closely match the user's question. It's worth mentioning that alternative similarity metrics can also be employed." 
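+    "\n",
+    "For reference, a hedged sketch of the cosine-similarity computation this retrieval relies on (illustrative; Azure Cognitive Search performs the actual vector search on the service side):\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "\n",
+    "def cosine_similarity(a, b):\n",
+    "    a, b = np.asarray(a), np.asarray(b)\n",
+    "    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))\n",
+    "```"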
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "userQuestion = \"What did the astronaut Edgar Mitchell call Earth?\"\n", + "retrieve_k = 2 # Retrieve the top 2 documents from vector database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Ask a question and convert to embeddings\n", + "\n", + "\n", + "def genQuestionEmbedding(userQuestion):\n", + " # Convert question to embedding using synapseML\n", + " from synapse.ml.cognitive import OpenAIEmbedding\n", + "\n", + " df_ques = spark.createDataFrame([(userQuestion, 1)], [\"questions\", \"dummy\"])\n", + " embedding = (\n", + " OpenAIEmbedding()\n", + " .setSubscriptionKey(aoai_key)\n", + " .setDeploymentName(aoai_deployment_name_embeddings)\n", + " .setCustomServiceName(aoai_service_name)\n", + " .setTextCol(\"questions\")\n", + " .setErrorCol(\"errorQ\")\n", + " .setOutputCol(\"embeddings\")\n", + " )\n", + " df_ques_embeddings = embedding.transform(df_ques)\n", + " row = df_ques_embeddings.collect()[0]\n", + " questionEmbedding = row.embeddings.tolist()\n", + " return questionEmbedding\n", + "\n", + "\n", + "def retrieve_k_chunk(k, questionEmbedding):\n", + " # Retrieve the top K entries\n", + " url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/search?api-version=2023-07-01-Preview\"\n", + "\n", + " payload = json.dumps(\n", + " {\"vector\": {\"value\": questionEmbedding, \"fields\": \"contentVector\", \"k\": 2}}\n", + " )\n", + " headers = {\n", + " \"Content-Type\": \"application/json\",\n", + " \"api-key\": cogsearch_api_key,\n", + " }\n", + "\n", + " response = requests.request(\"POST\", url, headers=headers, data=payload)\n", + " output = json.loads(response.text)\n", + " print(response.status_code)\n", + " return output\n", + "\n", + "\n", + "# Generate embeddings for the question and retrieve the top k document chunks\n", + "questionEmbedding = genQuestionEmbedding(userQuestion)\n", + "output = retrieve_k_chunk(retrieve_k, questionEmbedding)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "79356cff-a236-4ef3-91f7-a601ee38d5f9", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 8: Respond to a User’s Question" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "06778fa1-303f-4a3b-814b-c0375df855c2", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "To provide a response to the user's question, we will utilize the [LangChain](https://python.langchain.com/en/latest/index.html) framework. With the LangChain framework we will augment the retrieved documents with respect to the user's question. Following this, we can request a response to the user's question from our framework." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import necessary libraries and set up OpenAI\n",
+    "from langchain.llms import AzureOpenAI\n",
+    "from langchain import PromptTemplate\n",
+    "from langchain.chains import LLMChain\n",
+    "import openai\n",
+    "\n",
+    "openai.api_type = \"azure\"\n",
+    "openai.api_base = aoai_endpoint\n",
+    "openai.api_version = \"2022-12-01\"\n",
+    "openai.api_key = aoai_key"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "412d83cc-4fe9-455e-ad3d-7780ed262dac",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "We can now wrap up the Q&A journey by asking a question and checking the answer. You will see that Edgar Mitchell called Earth \"a sparkling blue and white jewel\"!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define a Question Answering chain function using LangChain\n",
+    "def QA_chain_func():\n",
+    "\n",
+    "    # Define llm model\n",
+    "    llm = AzureOpenAI(\n",
+    "        deployment_name=aoai_deployment_name_query,\n",
+    "        model_name=aoai_model_name_query,\n",
+    "        openai_api_key=aoai_key,\n",
+    "        openai_api_version=\"2022-12-01\",\n",
+    "    )\n",
+    "\n",
+    "    # Write a preprompt with context and query as variables\n",
+    "    template = \"\"\"\n",
+    "    context :{context}\n",
+    "    Answer the question based on the context above. If the\n",
+    "    information to answer the question is not present in the given context then reply \"I don't know\".\n",
+    "    Question: {query}\n",
+    "    Answer: \"\"\"\n",
+    "\n",
+    "    # Define a prompt template\n",
+    "    prompt_template = PromptTemplate(\n",
+    "        input_variables=[\"context\", \"query\"], template=template\n",
+    "    )\n",
+    "    # Define a chain\n",
+    "    qa_chain = LLMChain(llm=llm, prompt=prompt_template)\n",
+    "    return qa_chain\n",
+    "\n",
+    "\n",
+    "# Concatenate the content of retrieved documents\n",
+    "context = [i[\"content\"] for i in output[\"value\"]]\n",
+    "\n",
+    "# Make a Quesion Answer chain function and pass\n",
+    "qa_chain = QA_chain_func()\n",
+    "answer = qa_chain.run({\"context\": context, \"query\": userQuestion})\n",
+    "\n",
+    "print(answer)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/website/sidebars.js b/website/sidebars.js
index 22bbbab2434..c3ea2476817 100644
--- a/website/sidebars.js
+++ b/website/sidebars.js
@@ -39,6 +39,7 @@ module.exports = {
         "Explore Algorithms/AI Services/Quickstart - Analyze Text",
         "Explore Algorithms/AI Services/Quickstart - Creare a Visual Search Engine",
         "Explore Algorithms/AI Services/Quickstart - Create Audiobooks",
+        "Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs",
         "Explore Algorithms/AI Services/Quickstart - Flooding Risk",
         "Explore Algorithms/AI Services/Quickstart - Predictive Maintenance",
       ],

From 0d8e613bc6e79f81795ab2b6f7cb9931cf66a864 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 24 Jul 2023 17:01:14 +0100
Subject: [PATCH 05/23] build: bump actions/checkout from 2 to 3 (#2030)

Bumps [actions/checkout](https://github.com/actions/checkout) from 2 to 3.
- [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Mark Hamilton --- .github/workflows/check-dead-links.yml | 2 +- .github/workflows/scorecards.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check-dead-links.yml b/.github/workflows/check-dead-links.yml index 97eaf6fb58e..faaff7be8bd 100644 --- a/.github/workflows/check-dead-links.yml +++ b/.github/workflows/check-dead-links.yml @@ -15,7 +15,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Install dependencies run: | diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 9aa466912d5..d641547701e 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -32,7 +32,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + uses: actions/checkout@v3 # v3.1.0 with: persist-credentials: false From 495c9a9f297a31f05645324014e1ae738336d7eb Mon Sep 17 00:00:00 2001 From: Mark Hamilton Date: Mon, 24 Jul 2023 18:49:06 +0100 Subject: [PATCH 06/23] chore: remove build exclusions from pipeline.yaml --- pipeline.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pipeline.yaml b/pipeline.yaml index a6d93ae801c..b3d5886e87e 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -10,9 +10,7 @@ trigger: - README.md - CONTRIBUTORS.md - SECURITY.md - - docs/* - CODEOWNERS - - .github pr: branches: From 072c9c9631d2c8c498677356dd2c152efd3cf51c Mon Sep 17 00:00:00 2001 From: Mark Hamilton Date: Mon, 24 Jul 2023 18:50:55 +0100 Subject: [PATCH 07/23] chore: remove exclusions from pipeline.yml --- pipeline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline.yaml b/pipeline.yaml index b3d5886e87e..83c24f60ffe 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -19,9 +19,9 @@ pr: paths: exclude: - README.md - - docs/* + - CONTRIBUTORS.md + - SECURITY.md - CODEOWNERS - - .github schedules: - cron: "0 0 * * *" From 8be8fe3e61837dbe96a28189fd436e0b684f8ed2 Mon Sep 17 00:00:00 2001 From: aydan-at-microsoft <51974608+aydan-at-microsoft@users.noreply.github.com> Date: Mon, 24 Jul 2023 13:58:51 -0700 Subject: [PATCH 08/23] docs: fix variable formatting for QandA nb (#2033) --- ...ent Question and Answering with PDFs.ipynb | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb index e8e4a938e23..b65689c4b1d 100644 --- a/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb +++ b/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb @@ -48,7 +48,7 @@ "\n", "We’ll cover the following key steps:\n", "\n", - "1. 
Preprocessing PDF Documents: Learn how to load the PDF documents into a Spark DataFrame, read the documents using the [Form Recognizer Service](https://azure.microsoft.com/en-us/products/form-recognizer/) in Azure AI Services, and use SynapseML to split the documents into chunks.\n", + "1. Preprocessing PDF Documents: Learn how to load the PDF documents into a Spark DataFrame, read the documents using the [Azure AI Document Intelligence](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence) in Azure AI Services, and use SynapseML to split the documents into chunks.\n", "2. Embedding Generation and Storage: Learn how to generate embeddings for the chunks using SynapseML and [Azure OpenAI Services](https://azure.microsoft.com/en-us/products/cognitive-services/openai-service), store the embeddings in a vector store using [Azure Cognitive Search](https://azure.microsoft.com/en-us/products/search), and search the vector store to answer the user’s question.\n", "3. Question Answering Pipeline: Learn how to retrieve relevant document based on the user’s question and provide the answer using [Langchain](https://python.langchain.com/en/latest/index.html#)." ] @@ -298,7 +298,7 @@ } }, "source": [ - "### Step 3: Read the documents using Azure AI Services Form Recognizer." + "### Step 3: Read the documents using Azure AI Document Intelligence." ] }, { @@ -335,7 +335,7 @@ "from synapse.ml.cognitive import AnalyzeDocument\n", "from pyspark.sql.functions import col\n", "\n", - "analyzeDocument = (\n", + "analyze_document = (\n", " AnalyzeDocument()\n", " .setPrebuiltModelId(\"prebuilt-layout\")\n", " .setSubscriptionKey(ai_services_key)\n", @@ -348,7 +348,7 @@ ")\n", "\n", "analyzed_df = (\n", - " analyzeDocument.transform(df)\n", + " analyze_document.transform(df)\n", " .withColumn(\"output_content\", col(\"result.analyzeResult.content\"))\n", " .withColumn(\"paragraphs\", col(\"result.analyzeResult.paragraphs\"))\n", ").cache()" @@ -701,7 +701,7 @@ "\n", "# Define a UDF using the @udf decorator\n", "@udf(returnType=StringType())\n", - "def insertToCogSearch(idx, content, contentVector):\n", + "def insert_to_cog_search(idx, content, contentVector):\n", " url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/index?api-version=2023-07-01-Preview\"\n", "\n", " payload = json.dumps(\n", @@ -762,7 +762,7 @@ ") ## adding a column with id\n", "df_embeddings = df_embeddings.withColumn(\n", " \"errorCogSearch\",\n", - " insertToCogSearch(\n", + " insert_to_cog_search(\n", " df_embeddings[\"idx\"], df_embeddings[\"chunk\"], df_embeddings[\"embeddings\"]\n", " ),\n", ")\n", @@ -791,7 +791,7 @@ } }, "source": [ - "### Step 7: Ask a Question" + "### Step 7: Ask a Question." 
] }, { @@ -823,7 +823,7 @@ "metadata": {}, "outputs": [], "source": [ - "userQuestion = \"What did the astronaut Edgar Mitchell call Earth?\"\n", + "user_question = \"What did the astronaut Edgar Mitchell call Earth?\"\n", "retrieve_k = 2 # Retrieve the top 2 documents from vector database" ] }, @@ -836,11 +836,11 @@ "# Ask a question and convert to embeddings\n", "\n", "\n", - "def genQuestionEmbedding(userQuestion):\n", + "def gen_question_embedding(user_question):\n", " # Convert question to embedding using synapseML\n", " from synapse.ml.cognitive import OpenAIEmbedding\n", "\n", - " df_ques = spark.createDataFrame([(userQuestion, 1)], [\"questions\", \"dummy\"])\n", + " df_ques = spark.createDataFrame([(user_question, 1)], [\"questions\", \"dummy\"])\n", " embedding = (\n", " OpenAIEmbedding()\n", " .setSubscriptionKey(aoai_key)\n", @@ -852,16 +852,16 @@ " )\n", " df_ques_embeddings = embedding.transform(df_ques)\n", " row = df_ques_embeddings.collect()[0]\n", - " questionEmbedding = row.embeddings.tolist()\n", - " return questionEmbedding\n", + " question_embedding = row.embeddings.tolist()\n", + " return question_embedding\n", "\n", "\n", - "def retrieve_k_chunk(k, questionEmbedding):\n", + "def retrieve_k_chunk(k, question_embedding):\n", " # Retrieve the top K entries\n", " url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/search?api-version=2023-07-01-Preview\"\n", "\n", " payload = json.dumps(\n", - " {\"vector\": {\"value\": questionEmbedding, \"fields\": \"contentVector\", \"k\": 2}}\n", + " {\"vector\": {\"value\": question_embedding, \"fields\": \"contentVector\", \"k\": 2}}\n", " )\n", " headers = {\n", " \"Content-Type\": \"application/json\",\n", @@ -875,8 +875,8 @@ "\n", "\n", "# Generate embeddings for the question and retrieve the top k document chunks\n", - "questionEmbedding = genQuestionEmbedding(userQuestion)\n", - "output = retrieve_k_chunk(retrieve_k, questionEmbedding)" + "question_embedding = gen_question_embedding(user_question)\n", + "output = retrieve_k_chunk(retrieve_k, question_embedding)" ] }, { @@ -899,7 +899,7 @@ } }, "source": [ - "### Step 8: Respond to a User’s Question" + "### Step 8: Respond to a User’s Question." ] }, { @@ -968,7 +968,7 @@ "outputs": [], "source": [ "# Define a Question Answering chain function using LangChain\n", - "def QA_chain_func():\n", + "def qa_chain_func():\n", "\n", " # Define llm model\n", " llm = AzureOpenAI(\n", @@ -999,8 +999,8 @@ "context = [i[\"content\"] for i in output[\"value\"]]\n", "\n", "# Make a Quesion Answer chain function and pass\n", - "qa_chain = QA_chain_func()\n", - "answer = qa_chain.run({\"context\": context, \"query\": userQuestion})\n", + "qa_chain = qa_chain_func()\n", + "answer = qa_chain.run({\"context\": context, \"query\": user_question})\n", "\n", "print(answer)" ] From db6386c6d6a133eb55fbec640012ad82bb526c94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?= Date: Fri, 28 Jul 2023 01:56:39 +0200 Subject: [PATCH 09/23] fix: Fix ONNX link (#2035) --- docs/Explore Algorithms/Deep Learning/ONNX.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Explore Algorithms/Deep Learning/ONNX.md b/docs/Explore Algorithms/Deep Learning/ONNX.md index 8f844caaba9..5d45e386794 100644 --- a/docs/Explore Algorithms/Deep Learning/ONNX.md +++ b/docs/Explore Algorithms/Deep Learning/ONNX.md @@ -71,7 +71,7 @@ available models, optionally filtering by name or tags. 
| softMaxDict | A map between output DataFrame columns, where the value column will be computed from taking the softmax of the key column. If the 'rawPrediction' column contains logits outputs, then one can set softMaxDict to `Map("rawPrediction" -> "probability")` to obtain the probability outputs. | None | | argMaxDict | A map between output DataFrame columns, where the value column will be computed from taking the argmax of the key column. This parameter can be used to convert probability or logits output to the predicted label. | None | | deviceType | Specify a device type the model inference runs on. Supported types are: CPU or CUDA. If not specified, auto detection will be used. | None | - | optimizationLevel | Specify the [optimization level](https://onnxruntime.ai/docs/resources/graph-optimizations.html#graph-optimization-levels) for the ONNX graph optimizations. Supported values are: `NO_OPT`, `BASIC_OPT`, `EXTENDED_OPT`, `ALL_OPT`. | `ALL_OPT` | + | optimizationLevel | Specify the [optimization level](https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html#graph-optimization-levels) for the ONNX graph optimizations. Supported values are: `NO_OPT`, `BASIC_OPT`, `EXTENDED_OPT`, `ALL_OPT`. | `ALL_OPT` | 4. Call `transform` method to run inference on the input DataFrame. From cde68347a44b6556ed4e6f6ab57bf6b4968cc6b2 Mon Sep 17 00:00:00 2001 From: Scott Votaw Date: Wed, 2 Aug 2023 08:55:27 -0700 Subject: [PATCH 10/23] fix: Improve LGBM exception and logging (#2037) * Improve LGBM exception and logging * added log --- .../azure/synapse/ml/lightgbm/NetworkManager.scala | 13 ++++++++----- .../ml/lightgbm/StreamingPartitionTask.scala | 10 +--------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/NetworkManager.scala b/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/NetworkManager.scala index 197a8e42083..4644d49e2b5 100644 --- a/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/NetworkManager.scala +++ b/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/NetworkManager.scala @@ -163,22 +163,25 @@ object NetworkManager { // and a list of partition ids in this executor. 
val lightGbmMachineList = driverInput.readLine() val partitionsByExecutorStr = driverInput.readLine() - val executorPartitionIds: Array[Int] = - parseExecutorPartitionList(partitionsByExecutorStr, taskStatus.executorId) - log.info(s"task $taskId, partition $partitionId received nodes for network init: '$lightGbmMachineList'") log.info(s"task $taskId, partition $partitionId received partition topology: '$partitionsByExecutorStr'") + log.info(s"task $taskId, partition $partitionId received nodes for network init: '$lightGbmMachineList'") + val executorPartitionIds: Array[Int] = + parseExecutorPartitionList(partitionsByExecutorStr, taskStatus.executorId, log) NetworkTopologyInfo(lightGbmMachineList, executorPartitionIds, localListenPort) }.get }.get } - private def parseExecutorPartitionList(partitionsByExecutorStr: String, executorId: String): Array[Int] = { + private def parseExecutorPartitionList(partitionsByExecutorStr: String, + executorId: String, + log: Logger): Array[Int] = { // extract this executors partition ids as an array, from a string that is formatter like this: // executor1=partition1,partition2:executor2=partition3,partition4 val partitionsByExecutor = partitionsByExecutorStr.split(":") val executorListStr = partitionsByExecutor.find(line => line.startsWith(executorId + "=")) if (executorListStr.isEmpty) - throw new Exception(s"Could not find partitions for executor $executorListStr. List: $partitionsByExecutorStr") + throw new Exception(s"Could not find partitions for executor $executorId. List: $partitionsByExecutorStr") + log.info(s"executor $executorId received partitions: '$executorListStr'") val partitionList = executorListStr.get.split("=")(1) partitionList.split(",").map(str => str.toInt).sorted } diff --git a/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/StreamingPartitionTask.scala b/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/StreamingPartitionTask.scala index f04517f6e7e..c0b7b7812c1 100644 --- a/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/StreamingPartitionTask.scala +++ b/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/StreamingPartitionTask.scala @@ -174,15 +174,12 @@ class StreamingPartitionTask extends BasePartitionTask { val partitionRowCount = ctx.trainingCtx.partitionCounts.get(ctx.partitionId).toInt val partitionRowOffset = ctx.streamingPartitionOffset val isSparse = ctx.sharedState.isSparse.get - log.info(s"Inserting rows into training Dataset from partition ${ctx.partitionId}, " + + log.debug(s"Inserting rows into training Dataset from partition ${ctx.partitionId}, " + s"size $partitionRowCount, offset: $partitionRowOffset, sparse: $isSparse, threadId: ${ctx.threadIndex}") val dataset = ctx.sharedState.datasetState.streamingDataset.get val stopIndex = partitionRowOffset + partitionRowCount insertRowsIntoDataset(ctx, dataset, inputRows, partitionRowOffset, stopIndex, ctx.threadIndex) - - log.info(s"Part ${ctx.partitionId}: inserted $partitionRowCount partition ${ctx.partitionId} " + - s"rows into shared training dataset at offset $partitionRowOffset") } private def insertRowsIntoDataset(ctx: PartitionTaskContext, @@ -213,9 +210,7 @@ class StreamingPartitionTask extends BasePartitionTask { if (maxBatchSize == 0) 0 else loadOneDenseMicroBatchBuffer(state, inputRows, 0, maxBatchSize) if (count > 0) { - log.info(s"Part ${state.ctx.partitionId}: Pushing $count dense rows at $startIndex, will stop at $stopIndex") if (state.hasInitialScores && state.microBatchSize != count && 
state.numInitScoreClasses > 1) { - log.info(s"Part ${state.ctx.partitionId}: Adjusting $count initial scores") (1 until state.numInitScoreClasses).foreach { i => (0 until count).foreach { j => { val score = state.initScoreBuffer.getItem(i * state.microBatchSize + j) @@ -253,7 +248,6 @@ class StreamingPartitionTask extends BasePartitionTask { if (microBatchRowCount > 0) { // If we have only a partial micro-batch, and we have multi-class initial scores (i.e. numClass > 1), // we need to re-coalesce the data since it was stored column-wise based on original microBatchSize - log.info(s"Part ${state.ctx.partitionId}: Pushing $microBatchRowCount sparse rows at $startIndex") if (state.hasInitialScores && state.microBatchSize != microBatchRowCount && state.numInitScoreClasses > 1) { (1 until state.numInitScoreClasses).foreach { i => // TODO make this shared (0 until microBatchRowCount).foreach { j => { @@ -279,8 +273,6 @@ class StreamingPartitionTask extends BasePartitionTask { // might be more rows, so continue with tail recursion at next index pushSparseMicroBatches(state, inputRows, startIndex + microBatchRowCount, stopIndex) - } else { - log.info(s"LightGBM pushed $startIndex in partition ${state.ctx.partitionId}") } } From fa497f09b58a462a4ca47c14cbe4bfd12231f1b2 Mon Sep 17 00:00:00 2001 From: JessicaXYWang <108437381+JessicaXYWang@users.noreply.github.com> Date: Fri, 4 Aug 2023 05:55:41 -0700 Subject: [PATCH 11/23] docs: fix broken links (#2042) * fix broken links * fix broken links --- README.md | 8 ++++---- ...Document Question and Answering with PDFs.ipynb | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 4c805c9bf8e..098b4ead27a 100644 --- a/README.md +++ b/README.md @@ -238,25 +238,25 @@ docker run -it -p 8888:8888 -e ACCEPT_EULA=yes mcr.microsoft.com/mmlspark/releas ``` Navigate to in your web browser to run the sample -notebooks. See the [documentation](website/docs/reference/docker.md) for more on Docker use. +notebooks. See the [documentation](https://microsoft.github.io/SynapseML/docs/Reference/Docker%20Setup/) for more on Docker use. > To read the EULA for using the docker image, run `docker run -it -p 8888:8888 mcr.microsoft.com/mmlspark/release eula` ### R To try out SynapseML using the R autogenerated wrappers [see our -instructions](website/docs/reference/R-setup.md). Note: This feature is still under development +instructions](https://microsoft.github.io/SynapseML/docs/Reference/R%20Setup/). Note: This feature is still under development and some necessary custom wrappers may be missing. ### C# (.NET) -To try out SynapseML with .NET, please follow the [.NET Installation Guide](website/docs/reference/dotnet-setup.md). +To try out SynapseML with .NET, please follow the [.NET Installation Guide](https://microsoft.github.io/SynapseML/docs/Reference/Dotnet%20Setup/). Please note that some classes including the `AzureSearchWriter`, `DiagnosticInfo`, `UDPyFParam`, `ParamSpaceParam`, `BallTreeParam`, `ConditionalBallTreeParam`, `LightGBMBoosterParam` are still under development and not exposed in .NET yet. ### Building from source SynapseML has recently transitioned to a new build infrastructure. 
-For detailed developer docs please see the [Developer Readme](website/docs/reference/developer-readme.md)
+For detailed developer docs please see the [Developer Readme](https://microsoft.github.io/SynapseML/docs/Reference/Developer%20Setup/)
 
 If you are an existing synapsemldeveloper, you will need to reconfigure your
 development setup. We now support platform independent development and
diff --git a/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb
index b65689c4b1d..35be3a8dd3e 100644
--- a/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb
+++ b/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb
@@ -44,12 +44,12 @@
   },
   "source": [
    "## Introduction\n",
-    "In this notebook, we'll demonstrate how to develop a context-aware question answering framework for any form of a document using [OpenAI models](https://azure.microsoft.com/en-us/products/ai-services/openai-service), [SynapseML](https://microsoft.github.io/SynapseML/docs/about/) and [Azure AI Services](https://azure.microsoft.com/en-us/products/cognitive-services/). In this notebook, we assume that PDF documents are the source of data, however, the same framework can be easiy extended to other document formats too. \n",
+    "In this notebook, we'll demonstrate how to develop a context-aware question answering framework for any form of a document using [OpenAI models](https://azure.microsoft.com/products/ai-services/openai-service), [SynapseML](https://microsoft.github.io/SynapseML/) and [Azure AI Services](https://azure.microsoft.com/products/cognitive-services/). In this notebook, we assume that PDF documents are the source of data, however, the same framework can be easily extended to other document formats too. \n",
    "\n",
    "We’ll cover the following key steps:\n",
    "\n",
-    "1. Preprocessing PDF Documents: Learn how to load the PDF documents into a Spark DataFrame, read the documents using the [Azure AI Document Intelligence](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence) in Azure AI Services, and use SynapseML to split the documents into chunks.\n",
-    "2. Embedding Generation and Storage: Learn how to generate embeddings for the chunks using SynapseML and [Azure OpenAI Services](https://azure.microsoft.com/en-us/products/cognitive-services/openai-service), store the embeddings in a vector store using [Azure Cognitive Search](https://azure.microsoft.com/en-us/products/search), and search the vector store to answer the user’s question.\n",
+    "1. Preprocessing PDF Documents: Learn how to load the PDF documents into a Spark DataFrame, read the documents using the [Azure AI Document Intelligence](https://azure.microsoft.com/products/ai-services/ai-document-intelligence) in Azure AI Services, and use SynapseML to split the documents into chunks.\n",
+    "2. Embedding Generation and Storage: Learn how to generate embeddings for the chunks using SynapseML and [Azure OpenAI Services](https://azure.microsoft.com/products/cognitive-services/openai-service), store the embeddings in a vector store using [Azure Cognitive Search](https://azure.microsoft.com/products/search), and search the vector store to answer the user’s question.\n",
    "3. 
Question Answering Pipeline: Learn how to retrieve relevant document based on the user’s question and provide the answer using [Langchain](https://python.langchain.com/en/latest/index.html#)." ] }, @@ -321,7 +321,7 @@ } }, "source": [ - "We utilize [SynapseML](https://microsoft.github.io/SynapseML/), an ecosystem of tools designed to enhance the distributed computing framework [Apache Spark](https://github.com/apache/spark). SynapseML introduces advanced networking capabilities to the Spark ecosystem and offers user-friendly SparkML transformers for various [Azure AI Services](https://azure.microsoft.com/en-us/products/ai-services).\n", + "We utilize [SynapseML](https://microsoft.github.io/SynapseML/), an ecosystem of tools designed to enhance the distributed computing framework [Apache Spark](https://github.com/apache/spark). SynapseML introduces advanced networking capabilities to the Spark ecosystem and offers user-friendly SparkML transformers for various [Azure AI Services](https://azure.microsoft.com/products/ai-services).\n", "\n", "Additionally, we employ AnalyzeDocument from Azure AI Services to extract the complete document content and present it in the designated columns called \"output_content\" and \"paragraph.\"" ] @@ -530,7 +530,7 @@ } }, "source": [ - "To produce embeddings for each chunk, we utilize both SynapseML and Azure OpenAI Service. By integrating the Azure OpenAI service with SynapseML, we can leverage the power of the Apache Spark distributed computing framework to process numerous prompts using the OpenAI service. This integration enables the SynapseML embedding client to generate embeddings in a distributed manner, enabling efficient processing of large volumes of data. If you're interested in applying large language models at a distributed scale using Azure OpenAI and Azure Synapse Analytics, you can refer to [this approach](https://microsoft.github.io/SynapseML/docs/features/cognitive_services/CognitiveServices%20-%20OpenAI/). For more detailed information on generating embeddings with Azure OpenAI, you can look [here]( https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/embeddings?tabs=console)." + "To produce embeddings for each chunk, we utilize both SynapseML and Azure OpenAI Service. By integrating the Azure OpenAI service with SynapseML, we can leverage the power of the Apache Spark distributed computing framework to process numerous prompts using the OpenAI service. This integration enables the SynapseML embedding client to generate embeddings in a distributed manner, enabling efficient processing of large volumes of data. If you're interested in applying large language models at a distributed scale using Azure OpenAI and Azure Synapse Analytics, you can refer to [this approach](https://microsoft.github.io/SynapseML/docs/Explore%20Algorithms/OpenAI/). For more detailed information on generating embeddings with Azure OpenAI, you can look [here]( https://learn.microsoft.com/azure/cognitive-services/openai/how-to/embeddings?tabs=console)." ] }, { @@ -594,7 +594,7 @@ } }, "source": [ - "[Azure Cognitive Search](https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search) offers a user-friendly interface for creating a vector database, as well as storing and retrieving data using vector search. 
If you're interested in learning more about vector search, you can look [here](https://github.com/Azure/cognitive-search-vector-pr/tree/main).\n", + "[Azure Cognitive Search](https://learn.microsoft.com/azure/search/search-what-is-azure-search) offers a user-friendly interface for creating a vector database, as well as storing and retrieving data using vector search. If you're interested in learning more about vector search, you can look [here](https://github.com/Azure/cognitive-search-vector-pr/tree/main).\n", "\n", "\n", "Storing data in the AzureCogSearch vector database involves two main steps:\n", @@ -814,7 +814,7 @@ } }, "source": [ - "After processing the document, we can proceed to pose a question. We will use [SynapseML](https://microsoft.github.io/SynapseML/docs/features/cognitive_services/CognitiveServices%20-%20OpenAI%20Embedding/) to convert the user's question into an embedding and then utilize cosine similarity to retrieve the top K document chunks that closely match the user's question. It's worth mentioning that alternative similarity metrics can also be employed." + "After processing the document, we can proceed to pose a question. We will use [SynapseML](https://microsoft.github.io/SynapseML/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding/) to convert the user's question into an embedding and then utilize cosine similarity to retrieve the top K document chunks that closely match the user's question. It's worth mentioning that alternative similarity metrics can also be employed." ] }, { From 9eff35f5465269a16f2e936fdc069c6107536805 Mon Sep 17 00:00:00 2001 From: Mark Hamilton Date: Fri, 4 Aug 2023 13:56:33 +0100 Subject: [PATCH 12/23] docs: initial POC of Jessica's fabric doc generator (#2023) * docs: initial POC of Jessica's fabric doc generator * update fabric channel * update fabric channel - rst file * update fabric channel * update fabric channel * add readme, resolve conflict * add install requires * update fabric channel * format channel * add back WebsiteChannel * formatting docgen * Update tools/docgen/docgen/core.py * Update tools/docgen/docgen/core.py * fix index issue * raise warning for if statementmeta in notebookcell output --------- Co-authored-by: Jessica Wang Co-authored-by: JessicaXYWang <108437381+JessicaXYWang@users.noreply.github.com> --- .gitignore | 1 + environment.yml | 5 + tools/docgen/README.md | 72 +++++++ tools/docgen/docgen/channels.py | 302 +++++++++++++++++++++++++++++- tools/docgen/docgen/core.py | 4 +- tools/docgen/docgen/manifest.yaml | 131 +++++++++++++ tools/docgen/setup.py | 10 +- 7 files changed, 515 insertions(+), 10 deletions(-) create mode 100644 tools/docgen/README.md diff --git a/.gitignore b/.gitignore index d6858515cbd..94c129677f1 100644 --- a/.gitignore +++ b/.gitignore @@ -86,3 +86,4 @@ metastore_db/ **/build/* **/dist/* **/*.egg-info/* + diff --git a/environment.yml b/environment.yml index 0bba06b1f30..9dac854ab73 100644 --- a/environment.yml +++ b/environment.yml @@ -46,3 +46,8 @@ dependencies: - openai==0.27.5 - black==22.3.0 - black[jupyter]==22.3.0 + - mistletoe + - pypandoc + - markdownify + - traitlets + diff --git a/tools/docgen/README.md b/tools/docgen/README.md new file mode 100644 index 00000000000..266ce001422 --- /dev/null +++ b/tools/docgen/README.md @@ -0,0 +1,72 @@ +# Doc generating pipeline onboarding - Fabric channel + +Please edit the rst file to met Fabric doc requirement + +## Set manifest.yaml + +write a manifest file with filename and metadata +``` +channels: + - name: 
channels.FabricChannel + input_dir: path to input folder + output_dir: path to output folder + notebooks: + - path: path/under/input/dir/filename1.rst + metadata: + title: title 1 + description: description 1 + ms.topic: eg:overview + ms.custom: build-2023 + ms.reviewer: reviewers' Microsoft alias + author: authors' github usernames + ms.author: authors' Microsoft alias + - path: path/under/input/dir/filename2.ipynb + metadata: + title: title 2 + description: description 2 + ms.topic: eg:overview + ms.custom: build-2023 + ms.reviewer: reviewers' Microsoft alias + author: authors' github usernames + ms.author: authors' Microsoft alias +``` + +## Modify input file + +### Image alt text + +Please add alt text to all the image to meet Fabric doc requirement +#### rst file +For each image, add alt text. + +eg: + +``` +.. image:: + media/an-example.png +``` + +Change it to +``` +.. image:: + media/an-example.png + :alt: the-alt-text-you-want-for this image +``` + +#### Notebook file +Set image url in Notebook (Markdown format): +``` +![image-alt-text](image_url) +``` + +### Remove Locale information from URLs +Please remove all locale information from urls from https://docs.microsoft.com and https://learn.microsoft.com +eg: + +``` +https://learn.microsoft.com/en-us/fabric/onelake/onelake-overview +``` +Change it to +``` +https://learn.microsoft.com/fabric/onelake/onelake-overview +``` \ No newline at end of file diff --git a/tools/docgen/docgen/channels.py b/tools/docgen/docgen/channels.py index 13aa5e38c1e..a1b4b9fb2f3 100644 --- a/tools/docgen/docgen/channels.py +++ b/tools/docgen/docgen/channels.py @@ -1,12 +1,22 @@ -from docgen.core import ParallelChannel -import pathlib -import shutil import os -from nbformat import NotebookNode, read -from nbconvert import MarkdownExporter +import pathlib import re +import shutil +import warnings +from datetime import datetime +from os.path import basename, dirname from typing import List -from os.path import join, dirname, isdir, basename +from urllib.parse import urlparse + +import markdown +import pypandoc +import requests +from bs4 import BeautifulSoup +from docgen.core import Channel, ParallelChannel +from markdownify import ATX, MarkdownConverter +from nbconvert import MarkdownExporter +from nbformat import read +from traitlets.config import Config class WebsiteChannel(ParallelChannel): @@ -25,7 +35,7 @@ def process(self, input_file: str) -> (): if str(input_file).endswith(".ipynb"): output_file = str(output_file).replace(".ipynb", ".md") parsed = read(input_file, as_version=4) - markdown, _ = MarkdownExporter().from_notebook_node(parsed) + markdown, resources = MarkdownExporter().from_notebook_node(parsed) markdown = re.sub(r"style=\"[\S ]*?\"", "", markdown) markdown = re.sub(r"", "", markdown) @@ -41,3 +51,281 @@ def process(self, input_file: str) -> (): else: os.makedirs(dirname(output_file), exist_ok=True) shutil.copy(input_file, output_file) + + +class FabricChannel(Channel): + def __init__(self, input_dir: str, output_dir: str, notebooks: List[dict]): + self.input_dir = input_dir + self.output_dir = output_dir + self.notebooks = notebooks + self.hide_tag = "hide-synapse-internal" + self.media_dir = os.path.join(self.output_dir, "media") + + def list_input_files(self) -> List[str]: + return [n["path"] for n in self.notebooks] + + def _sentence_to_snake(self, path: str): + return ( + path.lower() + .replace(" - ", "-") + .replace(" ", "-") + .replace(",", "") + .replace(".ipynb", "") + .replace(".rst", "") + ) + + def _is_valid_url(self, url): + 
try: + result = urlparse(url) + return all([result.scheme, result.netloc]) + except: + return False + + def _replace_img_tag(self, img_tag, img_path_rel): + img_tag.replace_with( + f':::image type="content" source="{img_path_rel}" ' + f'alt-text="{img_tag.get("alt", "placeholder alt text")}":::' + ) + + def _download_and_replace_images( + self, + html_soup, + resources, + output_folder, + relative_to, + notebook_path, + get_image_from_local=False, + ): + output_folder = output_folder.replace("/", os.sep) + os.makedirs(output_folder, exist_ok=True) + + if resources: + # resources converted from notebook + resources_img, i = [], 0 + for img_filename, content in resources.get("outputs", {}).items(): + img_path = os.path.join(output_folder, img_filename.replace("_", "-")) + with open(img_path, "wb") as img_file: + img_file.write(content) + img_path_rel = os.path.relpath(img_path, relative_to).replace( + os.sep, "/" + ) + resources_img.append(img_path_rel) + + img_tags = html_soup.find_all("img") + for img_tag in img_tags: + img_loc = img_tag["src"] + + if self._is_valid_url(img_loc): + # downloaded image + response = requests.get(img_loc) + if response.status_code == 200: + img_filename = self._sentence_to_snake(img_loc.split("/")[-1]) + img_path = os.path.join(output_folder, img_filename) + with open(img_path, "wb") as img_file: + img_file.write(response.content) + img_path_rel = os.path.relpath(img_path, relative_to).replace( + os.sep, "/" + ) + img_tag["src"] = img_path_rel + else: + raise ValueError(f"Could not download image from {img_loc}") + + elif get_image_from_local: + # process local images + img_filename = self._sentence_to_snake(img_loc.split("/")[-1]).replace( + "_", "-" + ) + file_folder = "/".join( + notebook_path.split("/")[:-1] + ) # path read from manifest file + img_input_path = os.path.join( + self.input_dir, file_folder, img_loc + ).replace("/", os.sep) + if not os.path.exists(img_input_path): + raise ValueError(f"Could not get image from {img_loc}") + img_path = os.path.join(output_folder, img_filename) + img_path_rel = os.path.relpath(img_path, relative_to).replace( + os.sep, "/" + ) + shutil.copy(img_input_path, img_path) + + else: + # process image got from notebook resources + img_path_rel = resources_img[i] + img_tag["src"] = img_path_rel + i += 1 + + self._replace_img_tag(img_tag, img_path_rel) + + return html_soup + + def _validate_metadata(self, metadata): + required_metadata = [ + "author", + "description", + "ms.author", + "ms.topic", + "title", + ] + for req in required_metadata: + assert ( + req in metadata.keys() + ), f"{req} is required metadata, please add it to manifest file" + + def _generate_metadata_header(self, metadata): + """ + take a file and the authors name, generate metadata + metadata requirements: https://learn.microsoft.com/contribute/metadata + Azure Doc require MS authors and contributors need to make content contributions through the private repository + so the content can be staged and validated by the current validation rules. (Jan 4th, 2023) + """ + if "ms.date" not in metadata: + update_date = datetime.today().strftime("%m/%d/%Y") + metadata["ms.date"] = update_date + else: + warnings.warn( + "ms.date is set in manifest file, the date won't be automatically updated. 
" + "to update date automatically, remove ms.date from manifest file" + ) + formatted_list = ( + ["---"] + + ["{k}: {v}".format(k=k, v=v) for k, v in metadata.items()] + + ["---\n"] + ) + return "\n".join(formatted_list) + + def _remove_content(self, text): + patterns_to_remove = [ + "https://docs.microsoft.com", + "https://learn.microsoft.com", + ] + for pattern in patterns_to_remove: + text = re.sub(pattern, "", text) + return text + + def _read_rst(self, rst_file_path): + try: + extra_args = ["--wrap=none"] + html_string = pypandoc.convert_file( + rst_file_path, "html", format="rst", extra_args=extra_args + ) + return html_string + except Exception as e: + print("Error converting the RST file to Markdown:", e) + return None + + def _convert_to_markdown_links(self, parsed_html): + for link in parsed_html.find_all("a", href=True): + href = link["href"] + if not self._is_valid_url(href) and ".md" not in href: + split_href = href.split("#") + split_href[0] += ".md" + new_href = "#".join(split_href) + link["href"] = new_href + return parsed_html + + def process(self, input_file: str, index: int) -> (): + print(f"Processing {input_file} for fabric") + output_file = os.path.join(self.output_dir, input_file) + output_img_dir = self.media_dir + "/" + self._sentence_to_snake(input_file) + full_input_file = os.path.join(self.input_dir, input_file) + notebook_path = self.notebooks[index]["path"] + metadata = self.notebooks[index]["metadata"] + self._validate_metadata(metadata) + + def callback(el): + if el.contents[0].has_attr("class"): + return ( + el.contents[0]["class"][0].split("-")[-1] + if len(el.contents) >= 1 + else None + ) + else: + return el["class"][0] if el.has_attr("class") else None + + def convert_soup_to_md(soup, **options): + return MarkdownConverter(**options).convert_soup(soup) + + if str(input_file).endswith(".rst"): + output_file = self._sentence_to_snake( + str(output_file).replace(".rst", ".md") + ) + html = self._read_rst(full_input_file) + parsed_html = markdown.markdown( + html, + extensions=[ + "markdown.extensions.tables", + "markdown.extensions.fenced_code", + ], + ) + parsed_html = BeautifulSoup(parsed_html) + parsed_html = self._download_and_replace_images( + parsed_html, + None, + output_img_dir, + os.path.dirname(output_file), + notebook_path, + True, + ) + parsed_html = self._convert_to_markdown_links(parsed_html) + + elif str(input_file).endswith(".ipynb"): + output_file = self._sentence_to_snake( + str(output_file).replace(".ipynb", ".md") + ) + parsed = read(full_input_file, as_version=4) + + c = Config() + c.TagRemovePreprocessor.remove_cell_tags = (self.hide_tag,) + c.TagRemovePreprocessor.enabled = True + c.MarkdownExporter.preprocessors = [ + "nbconvert.preprocessors.TagRemovePreprocessor" + ] + md, resources = MarkdownExporter(config=c).from_notebook_node(parsed) + + html = markdown.markdown( + md, + extensions=[ + "markdown.extensions.tables", + "markdown.extensions.fenced_code", + ], + ) + parsed_html = BeautifulSoup(html) + # Download images and place them in media directory while updating their links + parsed_html = self._download_and_replace_images( + parsed_html, + resources, + output_img_dir, + os.path.dirname(output_file), + None, + False, + ) + + # Remove StatementMeta + for element in parsed_html.find_all( + text=re.compile("StatementMeta\(.*?Available\)") + ): + element.extract() + warnings.warn( + f"Found StatementMeta in {input_file}, please check if you want it in the notebook.", + UserWarning, + ) + + # Remove extra CSS styling info + for 
style_tag in parsed_html.find_all("style"): + style_tag.extract() + + # Convert from HTML to MD + new_md = convert_soup_to_md( + parsed_html, + code_language_callback=callback, + heading_style=ATX, + escape_underscores=False, + ) + # Post processing + new_md = f"{self._generate_metadata_header(metadata)}\n{new_md}" + output_md = self._remove_content(new_md) + + os.makedirs(dirname(output_file), exist_ok=True) + with open(output_file, "w+", encoding="utf-8") as f: + f.write(output_md) diff --git a/tools/docgen/docgen/core.py b/tools/docgen/docgen/core.py index aacb70c5825..452eee25546 100644 --- a/tools/docgen/docgen/core.py +++ b/tools/docgen/docgen/core.py @@ -13,8 +13,8 @@ def list_input_files(self) -> List[str]: pass def run(self) -> (): - for input_file in self.list_input_files(): - self.process(input_file) + for index, input_file in enumerate(self.list_input_files()): + self.process(input_file, index) class ParallelChannel(Channel): diff --git a/tools/docgen/docgen/manifest.yaml b/tools/docgen/docgen/manifest.yaml index 6494e393e0a..96848562c66 100644 --- a/tools/docgen/docgen/manifest.yaml +++ b/tools/docgen/docgen/manifest.yaml @@ -2,3 +2,134 @@ channels: - name: "channels.WebsiteChannel" input_dir: "../../../docs/" output_dir: "../../../website/docs/" + - name: channels.FabricChannel + input_dir: ../../../docs/ + output_dir: ../../../target/fabric-docs-pr/ + notebooks: + - path: Explore Algorithms/AI Services/Multivariate Anomaly Detection.ipynb + metadata: + title: Analyze time series + description: Use SynapseML and Azure Cognitive Services for multivariate anomaly detection. + ms.topic: overview + ms.custom: build-2023 + ms.reviewer: jessiwang + author: JessicaXYWang + ms.author: jessiwang + - path: Explore Algorithms/AI Services/Overview.ipynb + metadata: + title: Cognitive Services in Azure Synapse Analytics + description: Enrich your data with artificial intelligence (AI) in Azure Synapse Analytics using pretrained models from Azure Cognitive Services. + ms.topic: overview + ms.reviewer: jessiwang + author: JessicaXYWang + ms.author: jessiwang + - path: Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.ipynb + metadata: + title: Outlier and Anomaly Detection + description: Use SynapseML on Apache Spark for multivariate anomaly detection with Isolation Forest model. + ms.topic: overview + ms.custom: build-2023 + ms.reviewer: jessiwang + author: JessicaXYWang + ms.author: jessiwang + - path: Explore Algorithms/Causal Inference/Quickstart - Measure Causal Effects.ipynb + metadata: + title: Causal Structure + description: Causal Structure + ms.topic: overview + ms.custom: build-2023 + ms.reviewer: jessiwang + author: JessicaXYWang + ms.author: jessiwang + - path: Explore Algorithms/Classification/Quickstart - SparkML vs SynapseML.ipynb + filename: classification-before-and-after-synapseml + metadata: + title: Classification - before and after SynapseML + description: Perform the same classification task with and without SynapseML. 
+ ms.topic: how-to + ms.custom: build-2023 + ms.reviewer: jessiwang + author: JessicaXYWang + ms.author: jessiwang + - path: Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Text Classifier.ipynb + metadata: + title: Train a Text Classifier + description: Train a Text Classifier + ms.topic: overview + ms.custom: build-2023 + ms.reviewer: jessiwang + author: JessicaXYWang + ms.author: jessiwang + - path: Explore Algorithms/Deep Learning/Quickstart - ONNX Model Inference.ipynb + filename: onnx-overview + metadata: + title: ONNX - Inference on Spark + description: Use SynapseML to build a LightGBM model, convert it to ONNX format, then perform inference. + ms.topic: how-to + ms.custom: build-2023 + ms.reviewer: larryfr + author: JessicaXYWang + ms.author: jessiwang + - path: Explore Algorithms/Hyperparameter Tuning/Quickstart - Random Search.ipynb + metadata: + title: Hyperparameter tuning + description: Identify the best combination of hyperparameters for your chosen classifiers with SynapseML. + ms.topic: overview + ms.custom: build-2023 + ms.reviewer: jessiwang + author: JessicaXYWang + ms.author: jessiwang + - path: Explore Algorithms/LightGBM/Quickstart - Classification, Ranking, and Regression.ipynb + metadata: + title: LightGBM Overview + description: build LightGBM model with SynapseML + ms.topic: overview + ms.reviewer: mopeakande + author: JessicaXYWang + ms.author: jessiwang + - path: Explore Algorithms/OpenAI/OpenAI.ipynb + metadata: + title: Azure OpenAI for big data + description: Use Azure OpenAI service to solve a large number of natural language tasks through prompting the completion API. + ms.topic: how-to + ms.custom: build-2023 + ms.reviewer: jessiwang + author: JessicaXYWang + ms.author: jessiwang + - path: Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.ipynb + metadata: + title: Build a Search Engine + description: Build a custom search engine and question-answering system with SynapseML. + ms.topic: overview + ms.custom: build-2023 + ms.reviewer: jessiwang + author: JessicaXYWang + ms.author: JessicaXYWang + - path: Explore Algorithms/Other Algorithms/Quickstart - Exploring Art Across Cultures.ipynb + filename: conditional-k-nearest-neighbors-exploring-art + metadata: + title: Conditional KNN Exploring Art Across Cultures + description: A guideline for match-finding via k-nearest-neighbors. + ms.topic: how-to + ms.custom: build-2023 + ms.reviewer: larryfr + author: JessicaXYWang + ms.author: jessiwang + - path: Explore Algorithms/Responsible AI/Tabular Explainers.ipynb + metadata: + title: Interpretability - Tabular SHAP explainer + description: Use Kernel SHAP to explain a tabular classification model. + ms.topic: overview + ms.custom: build-2023 + ms.reviewer: jessiwang + author: JessicaXYWang + ms.author: jessiwang + - path: Get Started/Quickstart - Your First Models.ipynb + metadata: + title: SynapseMl first model + description: A quick introduction to building your first machine learning model with SynapseML. 
+ ms.topic: how-to + ms.custom: build-2023 + ms.reviewer: mopeakande + author: JessicaXYWang + ms.author: jessiwang \ No newline at end of file diff --git a/tools/docgen/setup.py b/tools/docgen/setup.py index 7ac6f7ff57f..865ecc6dca2 100644 --- a/tools/docgen/setup.py +++ b/tools/docgen/setup.py @@ -27,5 +27,13 @@ zip_safe=True, package_data={"docgen": ["../LICENSE.txt", "../README.txt"]}, python_requires=">=3.8.8", - install_requires=["nbformat", "nbconvert", "pathlib", "argparse"], + install_requires=[ + "nbformat", + "nbconvert", + "pathlib", + "argparse", + "pypandoc", + "markdownify", + "traitlets", + ], ) From 33807625edcf678461c2916086a63fe91cf045a6 Mon Sep 17 00:00:00 2001 From: Mark Hamilton Date: Mon, 7 Aug 2023 14:32:13 +0100 Subject: [PATCH 13/23] docs: fix small error in docgen docs --- tools/docgen/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/docgen/README.md b/tools/docgen/README.md index 266ce001422..866914fbc30 100644 --- a/tools/docgen/README.md +++ b/tools/docgen/README.md @@ -7,7 +7,7 @@ Please edit the rst file to met Fabric doc requirement write a manifest file with filename and metadata ``` channels: - - name: channels.FabricChannel + - name: docgen.channels.FabricChannel input_dir: path to input folder output_dir: path to output folder notebooks: @@ -69,4 +69,4 @@ https://learn.microsoft.com/en-us/fabric/onelake/onelake-overview Change it to ``` https://learn.microsoft.com/fabric/onelake/onelake-overview -``` \ No newline at end of file +``` From f6328b5dbe2a0721e12fdacd12a975a9c89c2494 Mon Sep 17 00:00:00 2001 From: Markus Cozowicz Date: Mon, 7 Aug 2023 20:34:47 +0200 Subject: [PATCH 14/23] fix: improve docgen (#2043) * improve readme add missing package dependency fix channel resolution * revert docgen module prefix * remove module_name prefix --------- --- tools/docgen/README.md | 9 +++++++++ tools/docgen/docgen/__main__.py | 1 + tools/docgen/docgen/channels.py | 2 +- tools/docgen/setup.py | 1 + 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/docgen/README.md b/tools/docgen/README.md index 866914fbc30..fad893969bb 100644 --- a/tools/docgen/README.md +++ b/tools/docgen/README.md @@ -31,6 +31,15 @@ channels: ms.author: authors' Microsoft alias ``` +## Run the tool + +```bash +cd tools/docgen +pip install -e . 
+ +python -m docgen --manifest docgen-manifest.yaml +``` + ## Modify input file ### Image alt text diff --git a/tools/docgen/docgen/__main__.py b/tools/docgen/docgen/__main__.py index 103594a2971..a7f11661423 100644 --- a/tools/docgen/docgen/__main__.py +++ b/tools/docgen/docgen/__main__.py @@ -7,6 +7,7 @@ def instantiate_channel(channel_yml): name = channel_yml["name"] module_name, class_name = name.rsplit(".", 1) + print(f"Instantiating {class_name} from module {module_name}") clazz = getattr(importlib.import_module(module_name), class_name) channel_yml.pop("name") diff --git a/tools/docgen/docgen/channels.py b/tools/docgen/docgen/channels.py index a1b4b9fb2f3..de1b79e02c1 100644 --- a/tools/docgen/docgen/channels.py +++ b/tools/docgen/docgen/channels.py @@ -258,7 +258,7 @@ def convert_soup_to_md(soup, **options): "markdown.extensions.fenced_code", ], ) - parsed_html = BeautifulSoup(parsed_html) + parsed_html = BeautifulSoup(parsed_html, features="html.parser") parsed_html = self._download_and_replace_images( parsed_html, None, diff --git a/tools/docgen/setup.py b/tools/docgen/setup.py index 865ecc6dca2..7d1fe844fcb 100644 --- a/tools/docgen/setup.py +++ b/tools/docgen/setup.py @@ -34,6 +34,7 @@ "argparse", "pypandoc", "markdownify", + "markdown", "traitlets", ], ) From 149c634005a935aef90226bc823c541934245568 Mon Sep 17 00:00:00 2001 From: Mark Hamilton Date: Tue, 8 Aug 2023 14:21:18 +0100 Subject: [PATCH 15/23] docs: add badges to readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 098b4ead27a..c5e78d91242 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,10 @@ SynapseML requires Scala 2.12, Spark 3.2+, and Python 3.8+. | :------ | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | Build | [![Build Status](https://msdata.visualstudio.com/A365/_apis/build/status/microsoft.SynapseML?branchName=master)](https://msdata.visualstudio.com/A365/_build/latest?definitionId=17563&branchName=master) [![codecov](https://codecov.io/gh/Microsoft/SynapseML/branch/master/graph/badge.svg)](https://codecov.io/gh/Microsoft/SynapseML) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) | | Version | [![Version](https://img.shields.io/badge/version-0.11.2-blue)](https://github.com/Microsoft/SynapseML/releases) [![Release Notes](https://img.shields.io/badge/release-notes-blue)](https://github.com/Microsoft/SynapseML/releases) [![Snapshot Version](https://mmlspark.blob.core.windows.net/icons/badges/master_version3.svg)](#sbt) | -| Docs | [![Scala Docs](https://img.shields.io/static/v1?label=api%20docs&message=scala&color=blue&logo=scala)](https://mmlspark.blob.core.windows.net/docs/0.11.2/scala/index.html#package) [![PySpark Docs](https://img.shields.io/static/v1?label=api%20docs&message=python&color=blue&logo=python)](https://mmlspark.blob.core.windows.net/docs/0.11.2/pyspark/index.html) [![Academic Paper](https://img.shields.io/badge/academic-paper-7fdcf7)](https://arxiv.org/abs/1810.08744) | +| Docs | 
[![Website](https://img.shields.io/badge/SynapseML-Website-blue)](https://aka.ms/spark) [![Scala Docs](https://img.shields.io/static/v1?label=api%20docs&message=scala&color=blue&logo=scala)](https://mmlspark.blob.core.windows.net/docs/0.11.2/scala/index.html#package) [![PySpark Docs](https://img.shields.io/static/v1?label=api%20docs&message=python&color=blue&logo=python)](https://mmlspark.blob.core.windows.net/docs/0.11.2/pyspark/index.html) [![Academic Paper](https://img.shields.io/badge/academic-paper-7fdcf7)](https://arxiv.org/abs/1810.08744) | | Support | [![Gitter](https://badges.gitter.im/Microsoft/MMLSpark.svg)](https://gitter.im/Microsoft/MMLSpark?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) [![Mail](https://img.shields.io/badge/mail-synapseml--support-brightgreen)](mailto:synapseml-support@microsoft.com) | | Binder | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/microsoft/SynapseML/v0.11.2?labpath=notebooks%2Ffeatures) | +| Usage | [![Downloads](https://static.pepy.tech/badge/synapseml)](https://pepy.tech/project/synapseml) |
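
For readers following the docgen work in patches 12 and 14 above, here is a minimal sketch of the channel-resolution pattern used by `tools/docgen/docgen/__main__.py`: the manifest entry's `name` field (for example `channels.FabricChannel`) is split into a module and a class name, the module is imported dynamically, and the channel is constructed from the remaining manifest keys. The keyword-argument construction and the manifest values below are illustrative assumptions rather than code taken from the patches, and the sketch assumes it runs from inside the `tools/docgen/docgen` directory so that the `channels` module is importable.

```python
# Minimal sketch (assumed, not from the patches) of docgen channel resolution.
import importlib


def instantiate_channel(channel_yml: dict):
    # "channels.FabricChannel" -> module "channels", class "FabricChannel"
    module_name, class_name = channel_yml.pop("name").rsplit(".", 1)
    clazz = getattr(importlib.import_module(module_name), class_name)
    # Remaining manifest keys become constructor arguments (assumed behavior).
    return clazz(**channel_yml)


channel = instantiate_channel(
    {
        "name": "channels.FabricChannel",
        "input_dir": "../../../docs/",  # values mirror manifest.yaml
        "output_dir": "../../../target/fabric-docs-pr/",
        "notebooks": [],  # hypothetical empty manifest
    }
)
channel.run()  # Channel.run() iterates list_input_files() with an index
```

With an empty `notebooks` list the run is a no-op; populating it as in `manifest.yaml` drives one `process(input_file, index)` call per notebook entry.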
From 8f794c896dad5b790356671ac56590fbb4f36eaf Mon Sep 17 00:00:00 2001 From: CRUISE LI Date: Fri, 11 Aug 2023 03:54:42 +0800 Subject: [PATCH 16/23] feat: Support langchain transformer on fabric (#2036) * support langchain transformer on fabric * avoid addtional param * format code --------- Co-authored-by: cruise --- .../cognitive/langchain/LangchainTransform.py | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/cognitive/src/main/python/synapse/ml/cognitive/langchain/LangchainTransform.py b/cognitive/src/main/python/synapse/ml/cognitive/langchain/LangchainTransform.py index cbf6b528b89..fffbf13cdca 100644 --- a/cognitive/src/main/python/synapse/ml/cognitive/langchain/LangchainTransform.py +++ b/cognitive/src/main/python/synapse/ml/cognitive/langchain/LangchainTransform.py @@ -44,6 +44,7 @@ ) from pyspark.sql.functions import udf from typing import cast, Optional, TypeVar, Type +from synapse.ml.core.platform import running_on_synapse_internal OPENAI_API_VERSION = "2022-12-01" RL = TypeVar("RL", bound="MLReadable") @@ -125,6 +126,14 @@ def __init__( self.subscriptionKey = Param(self, "subscriptionKey", "openai api key") self.url = Param(self, "url", "openai api base") self.apiVersion = Param(self, "apiVersion", "openai api version") + self.running_on_synapse_internal = running_on_synapse_internal() + if running_on_synapse_internal(): + from synapse.ml.fabric.service_discovery import get_fabric_env_config + + self._setDefault( + url=get_fabric_env_config().fabric_env_config.ml_workload_endpoint + + "cognitive/openai" + ) kwargs = self._input_kwargs if subscriptionKey: kwargs["subscriptionKey"] = subscriptionKey @@ -196,10 +205,15 @@ def _transform(self, dataset): def udfFunction(x): import openai - openai.api_type = "azure" - openai.api_key = self.getSubscriptionKey() - openai.api_base = self.getUrl() - openai.api_version = self.getApiVersion() + if self.running_on_synapse_internal and not self.isSet(self.url): + from synapse.ml.fabric.prerun.openai_prerun import OpenAIPrerun + + OpenAIPrerun(api_base=self.getUrl()).init_personalized_session(None) + else: + openai.api_type = "azure" + openai.api_key = self.getSubscriptionKey() + openai.api_base = self.getUrl() + openai.api_version = self.getApiVersion() return self.getChain().run(x) outCol = self.getOutputCol() From c6d58829435d8e2ab2cdc1cb1b6a9e0c5d4771b2 Mon Sep 17 00:00:00 2001 From: Mark Hamilton Date: Fri, 11 Aug 2023 14:10:00 +0100 Subject: [PATCH 17/23] chore: remove secret scanner (#2048) --- .../microsoft/azure/synapse/ml/Secrets.scala | 1 - .../ml/core/test/fuzzing/FuzzingTest.scala | 50 ------------------- 2 files changed, 51 deletions(-) diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/Secrets.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/Secrets.scala index cac166f64f4..9ede008c58b 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/Secrets.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/Secrets.scala @@ -71,5 +71,4 @@ object Secrets { lazy val ServiceConnectionSecret: String = getSecret("service-connection-secret") lazy val ServicePrincipalClientId: String = getSecret("service-principal-clientId") - lazy val SecretRegexpFile: String = getSecret("secret-regexp-file") } diff --git a/src/test/scala/com/microsoft/azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala b/src/test/scala/com/microsoft/azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala index e9a9ba0354d..23e9dba6127 100644 --- 
a/src/test/scala/com/microsoft/azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala +++ b/src/test/scala/com/microsoft/azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala @@ -382,56 +382,6 @@ class FuzzingTest extends TestBase { } } - test("Scan codebase for secrets") { - val excludedFiles = List( - ".png", - ".jpg", - ".jpeg") - val excludedDirs = List( - ".git", - ".idea", - "target", - ".docusaurus", - "node_modules", - s"website${File.separator}build" - ) - - val regexps: List[Regex] = using(Source.fromURL(Secrets.SecretRegexpFile)) { s => - s.getLines().toList.map(_.r) - }.get - - val allFiles = Files.walk(BuildInfo.baseDirectory.getParentFile.toPath) - .iterator().asScala.map(_.toFile) - .filterNot(f => excludedDirs.exists(dir => f.toString.contains(dir))) - .toList - - val nameIssues = allFiles.flatMap { - case f if regexps.flatMap(_.findFirstMatchIn(f.toString)).nonEmpty => - Some(s"Bad file name: ${f.toString}") - case _ => - None - } - val contentsIssue = allFiles.filter(_.isFile) - .filterNot(f => excludedFiles.exists(end => f.toString.endsWith(end))) - .flatMap { f => - println(f) - try { - val lines = using(Source.fromFile(f)) { s => s.getLines().toList }.get - lines.zipWithIndex.flatMap { case (l, i) => - if (regexps.flatMap(_.findFirstMatchIn(l)).nonEmpty) { - Some(s"Line $i of file ${f.toString} contains secrets") - } else { - None - } - } - } catch { - case _: MalformedInputException => List() - } - } - val allIssues = nameIssues ++ contentsIssue - assert(allIssues.isEmpty, allIssues.mkString("\n")) - } - private def assertOrLog(condition: Boolean, hint: String = "", disableFailure: Boolean = disableFailure): Unit = { if (disableFailure && !condition) println(hint) From 0836e40efd9c48424e91aa10c8aa3fbf0de39f31 Mon Sep 17 00:00:00 2001 From: Scott Votaw Date: Thu, 31 Aug 2023 11:50:17 -0700 Subject: [PATCH 18/23] Fix problem with empty partition assigned to validation data (#2059) --- .../azure/synapse/ml/lightgbm/BasePartitionTask.scala | 3 +++ .../azure/synapse/ml/lightgbm/StreamingPartitionTask.scala | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/BasePartitionTask.scala b/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/BasePartitionTask.scala index 030dcfc3170..6dccaa84f60 100644 --- a/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/BasePartitionTask.scala +++ b/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/BasePartitionTask.scala @@ -324,6 +324,9 @@ abstract class BasePartitionTask extends Serializable with Logging { s" shouldExecuteTraining: $shouldExecuteTraining, isEmptyPartition: $isEmptyPartition") val shouldCalcValidationDataset = trainingCtx.sharedState.validationDatasetWorker.getOrElse(-1) == taskId + if (trainingCtx.hasValidationData) + log.info(s"Validation data found. Task: $taskId, PartId: $partitionId. 
Main task: $mainExecutorWorkerId" +
+        s" shouldCalcValidationDataset: $shouldCalcValidationDataset, isEmptyPartition: $isEmptyPartition")
 
     PartitionTaskContext(trainingCtx,
       partitionId,
diff --git a/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/StreamingPartitionTask.scala b/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/StreamingPartitionTask.scala
index c0b7b7812c1..98cac95f519 100644
--- a/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/StreamingPartitionTask.scala
+++ b/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/StreamingPartitionTask.scala
@@ -106,7 +106,7 @@ class StreamingPartitionTask extends BasePartitionTask {
     if (!shouldExecuteTraining && !isEmptyPartition) ctx.sharedState().incrementDataPrepDoneSignal(log)
 
     // First dataset to reach here calculates the validation Dataset if needed
-    if (ctx.hasValidationData) {
+    if (ctx.hasValidationData && !isEmptyPartition) {
       ctx.sharedState().linkValidationDatasetWorker()
     }
   }

From ebecbe04ee9e1de7b7e9fb4f5955e19852e49373 Mon Sep 17 00:00:00 2001
From: Mark Hamilton
Date: Tue, 5 Sep 2023 14:18:23 +0100
Subject: [PATCH 19/23] chore: fix daily midnight build cron job

---
 pipeline.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pipeline.yaml b/pipeline.yaml
index 83c24f60ffe..509aabb578d 100644
--- a/pipeline.yaml
+++ b/pipeline.yaml
@@ -26,6 +26,7 @@ pr:
 schedules:
   - cron: "0 0 * * *"
     displayName: Daily midnight build
+    always: true
     branches:
       include:
         - master

From 7e9b0d1e3c6fa5e79ed141315f85d4da9b141b7d Mon Sep 17 00:00:00 2001
From: Brendan Walsh <37676373+BrendanWalsh@users.noreply.github.com>
Date: Tue, 5 Sep 2023 06:19:57 -0700
Subject: [PATCH 20/23] fix: fixed broken link to developer readme (#2049)

* Fixed broken link for developer setup guide

* Fixed broken link for developer setup guide

* Fixed broken link for developer setup guide

---------

Co-authored-by: Brendan Walsh
Co-authored-by: Mark Hamilton
---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3fa7eb52e17..12eee07c4a1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -45,7 +45,7 @@ this process:
 
 #### Implement tests
 
-- Set up build environment using the [developer guide](https://microsoft.github.io/SynapseML/docs/reference/developer-readme/)
+- Set up build environment using the [developer guide](https://microsoft.github.io/SynapseML/docs/Reference/Developer%20Setup/)
 - Test your code locally.
 - Add tests using ScalaTests — unit tests are required.
 - A sample notebook is required as an end-to-end test.

From 111823d168519c030bc42e4e83250211b7227b3e Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 5 Sep 2023 14:20:38 +0100
Subject: [PATCH 21/23] build: bump actions/checkout from 3 to 4 (#2065)

Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/v3...v4)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-major
...
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/check-dead-links.yml | 2 +- .github/workflows/clean-acr.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/scorecards.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/check-dead-links.yml b/.github/workflows/check-dead-links.yml index faaff7be8bd..10d1fe4320e 100644 --- a/.github/workflows/check-dead-links.yml +++ b/.github/workflows/check-dead-links.yml @@ -15,7 +15,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install dependencies run: | diff --git a/.github/workflows/clean-acr.yml b/.github/workflows/clean-acr.yml index e1e133e9f8a..eb82134c750 100644 --- a/.github/workflows/clean-acr.yml +++ b/.github/workflows/clean-acr.yml @@ -31,7 +31,7 @@ jobs: with: creds: ${{ secrets.clean_acr }} - name: checkout repo content - uses: actions/checkout@v3 # checkout the repo + uses: actions/checkout@v4 # checkout the repo - name: setup python uses: actions/setup-python@v4 with: diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 5c5a6a6e7c4..524b7024fdf 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -42,7 +42,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index d641547701e..83ddf41ea20 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -32,7 +32,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@v3 # v3.1.0 + uses: actions/checkout@v4 # v3.1.0 with: persist-credentials: false From d494f6e5814eb39233a3eb4f8056eba962bb28f8 Mon Sep 17 00:00:00 2001 From: aydan-at-microsoft <51974608+aydan-at-microsoft@users.noreply.github.com> Date: Thu, 7 Sep 2023 12:11:05 -0700 Subject: [PATCH 22/23] feat: add Azure Cognitive Search vector store (#2041) * add vector column option * add the vector option * vector fields are added and code compiles, untested * fix bug on checkparity when the index exists * add FloatType to edm-spark type conversions * fix synonymmap * core functionality works * add no nested field vector check * add vector validation check * modify vector columns behavior when column doesn't exist in df schema * add another test * clean up the unit test file * add more tests * add openai embedding pipeline test * address comments * address comments * address comments * update notebook * change index name in notebook --- .../ml/cognitive/search/AzureSearch.scala | 112 +++++- .../ml/cognitive/search/AzureSearchAPI.scala | 37 +- .../cognitive/search/AzureSearchSchemas.scala | 35 +- .../cognitive/search/SearchWriterSuite.scala | 321 ++++++++++++++++-- ...ent Question and Answering with PDFs.ipynb | 176 ++-------- 5 files changed, 484 insertions(+), 197 deletions(-) diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearch.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearch.scala index d4db72e3f34..54764d64046 100644 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearch.scala +++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearch.scala @@ -18,6 +18,8 @@ import org.apache.spark.internal.{Logging 
diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearch.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearch.scala
index d4db72e3f34..54764d64046 100644
--- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearch.scala
+++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearch.scala
@@ -18,6 +18,8 @@ import org.apache.spark.internal.{Logging => SLogging}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.util._
 import org.apache.spark.ml.{ComplexParamsReadable, NamespaceInjections, PipelineModel}
+import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
+import org.apache.spark.ml.functions.vector_to_array
 import org.apache.spark.sql.functions.{col, expr, struct, to_json}
 import org.apache.spark.sql.streaming.DataStreamWriter
 import org.apache.spark.sql.types._
@@ -142,7 +144,7 @@ class AddDocuments(override val uid: String) extends CognitiveServicesBase(uid)
   override def responseDataType: DataType = ASResponses.schema
 }
 
-object AzureSearchWriter extends IndexParser with SLogging {
+object AzureSearchWriter extends IndexParser with IndexJsonGetter with SLogging {
 
   val Logger: Logger = LogManager.getRootLogger
 
@@ -166,9 +168,11 @@ object AzureSearchWriter extends IndexParser with SLogging {
   private def convertFields(fields: Seq[StructField],
                             keyCol: String,
                             searchActionCol: String,
+                            vectorCols: Option[Seq[VectorColParams]],
                             prefix: Option[String]): Seq[IndexField] = {
     fields.filterNot(_.name == searchActionCol).map { sf =>
       val fullName = prefix.map(_ + sf.name).getOrElse(sf.name)
+      val isVector = vectorCols.exists(_.exists(_.name == fullName))
       val (innerType, _) = sparkTypeToEdmType(sf.dataType)
       IndexField(
         sf.name,
@@ -177,7 +181,9 @@
         if (keyCol == fullName) Some(true) else None,
         None, None, None, None, None,
         structFieldToSearchFields(sf.dataType,
-          keyCol, searchActionCol, prefix = Some(prefix.getOrElse("") + sf.name + "."))
+          keyCol, searchActionCol, None, prefix = Some(prefix.getOrElse("") + sf.name + ".")),
+        if (isVector) vectorCols.get.find(_.name == fullName).map(_.dimension) else None,
+        if (isVector) Some(AzureSearchAPIConstants.VectorConfigName) else None
       )
     }
   }
@@ -185,23 +191,34 @@
   private def structFieldToSearchFields(schema: DataType,
                                         keyCol: String,
                                         searchActionCol: String,
+                                        vectorCols: Option[Seq[VectorColParams]],
                                         prefix: Option[String] = None
                                        ): Option[Seq[IndexField]] = {
     schema match {
-      case StructType(fields) => Some(convertFields(fields, keyCol, searchActionCol, prefix))
-      case ArrayType(StructType(fields), _) => Some(convertFields(fields, keyCol, searchActionCol, prefix))
+      case StructType(fields) => Some(convertFields(fields, keyCol, searchActionCol, vectorCols, prefix))
+      // TODO: Support vector search in nested fields
+      case ArrayType(StructType(fields), _) => Some(convertFields(fields, keyCol, searchActionCol, None, prefix))
       case _ => None
     }
   }
 
+  private def parseVectorColsJson(str: String): Seq[VectorColParams] = {
+    str.parseJson.convertTo[Seq[VectorColParams]]
+  }
+
   private def dfToIndexJson(schema: StructType,
                             indexName: String,
                             keyCol: String,
-                            searchActionCol: String): String = {
+                            searchActionCol: String,
+                            vectorCols: Option[Seq[VectorColParams]]): String = {
+
+    val vectorConfig = Some(VectorSearch(Seq(AlgorithmConfigs(AzureSearchAPIConstants.VectorConfigName,
+      AzureSearchAPIConstants.VectorSearchAlgorithm))))
     val is = IndexInfo(
       Some(indexName),
-      structFieldToSearchFields(schema, keyCol, searchActionCol).get,
-      None, None, None, None, None, None, None, None
+      structFieldToSearchFields(schema, keyCol, searchActionCol, vectorCols).get,
+      None, None, None, None, None, None, None, None,
+      if (vectorCols.isEmpty) None else vectorConfig
     )
     is.toJson.compactPrint
   }
@@ -210,7 +227,7 @@
                         options: Map[String, String] = Map()): DataFrame = {
     val applicableOptions = Set(
       "subscriptionKey", "actionCol", "serviceName", "indexName", "indexJson",
-      "apiVersion", "batchSize", "fatalErrors", "filterNulls", "keyCol"
+      "apiVersion", "batchSize", "fatalErrors", "filterNulls", "keyCol", "vectorCols"
     )
 
     options.keys.foreach(k =>
@@ -224,11 +241,12 @@
     val batchSize = options.getOrElse("batchSize", "100").toInt
     val fatalErrors = options.getOrElse("fatalErrors", "true").toBoolean
     val filterNulls = options.getOrElse("filterNulls", "false").toBoolean
+    val vectorColsInfo = options.get("vectorCols")
 
     val keyCol = options.get("keyCol")
     val indexName = options.getOrElse("indexName", parseIndexJson(indexJsonOpt.get).name.get)
     if (indexJsonOpt.isDefined) {
-      List("keyCol", "indexName").foreach(opt =>
+      List("keyCol", "indexName", "vectorCols").foreach(opt =>
         assert(!options.contains(opt), s"Cannot set both indexJson options and $opt")
       )
     }
@@ -242,22 +260,41 @@
       }
     }
 
-    val indexJson = indexJsonOpt.getOrElse {
-      dfToIndexJson(df.schema, indexName, keyCol.get, actionCol)
+    val (indexJson, preppedDF) = if (getExisting(subscriptionKey, serviceName, apiVersion).contains(indexName)) {
+      if (indexJsonOpt.isDefined) {
+        println(f"indexJsonOpt is specified, however an index for $indexName already exists," +
+          f"we will use the index definition obtained from the existing index instead")
+      }
+      val existingIndexJson = getIndexJsonFromExistingIndex(subscriptionKey, serviceName, indexName)
+      val vectorColNameTypeTuple = getVectorColConf(existingIndexJson)
+      (existingIndexJson, makeColsCompatible(vectorColNameTypeTuple, df))
+    } else if (indexJsonOpt.isDefined) {
+      val vectorColNameTypeTuple = getVectorColConf(indexJsonOpt.get)
+      (indexJsonOpt.get, makeColsCompatible(vectorColNameTypeTuple, df))
+    } else {
+      val vectorCols = vectorColsInfo.map(parseVectorColsJson)
+      val vectorColNameTypeTuple = vectorCols.map(_.map(vc => (vc.name, "Collection(Edm.Single)"))).getOrElse(Seq.empty)
+      val newDF = makeColsCompatible(vectorColNameTypeTuple, df)
+      val inferredIndexJson = dfToIndexJson(newDF.schema, indexName, keyCol.getOrElse(""), actionCol, vectorCols)
+      (inferredIndexJson, newDF)
     }
 
+    // TODO: Support vector search in nested fields
+    // Throws an exception if any nested field is a vector in the schema
+    parseIndexJson(indexJson).fields.foreach(_.fields.foreach(assertNoNestedVectors))
+
     SearchIndex.createIfNoneExists(subscriptionKey, serviceName, indexJson, apiVersion)
 
     logInfo("checking schema parity")
-    checkSchemaParity(df.schema, indexJson, actionCol)
+    checkSchemaParity(preppedDF.schema, indexJson, actionCol)
 
     val df1 = if (filterNulls) {
       val collectionColumns = parseIndexJson(indexJson).fields
         .filter(_.`type`.startsWith("Collection"))
         .map(_.name)
-      collectionColumns.foldLeft(df) { (ndf, c) => filterOutNulls(ndf, c) }
+      collectionColumns.foldLeft(preppedDF) { (ndf, c) => filterOutNulls(ndf, c) }
     } else {
-      df
+      preppedDF
     }
 
     new AddDocuments()
@@ -273,6 +310,48 @@
       UDFUtils.oldUdf(checkForErrors(fatalErrors) _, ErrorUtils.ErrorSchema)(col("error"), col("input")))
   }
 
+  private def assertNoNestedVectors(fields: Seq[IndexField]): Unit = {
+    def checkVectorField(field: IndexField): Unit = {
+      if (field.dimensions.nonEmpty && field.vectorSearchConfiguration.nonEmpty) {
+        throw new IllegalArgumentException(s"Nested field ${field.name} is a vector field, vector fields in nested" +
+          s" fields are not supported.")
+      }
+      field.fields.foreach(_.foreach(checkVectorField))
+    }
+    fields.foreach(checkVectorField)
+  }
+
+  private def getVectorColConf(indexJson: String): Seq[(String, String)] = {
+    parseIndexJson(indexJson).fields
+      .filter(f => f.vectorSearchConfiguration.nonEmpty && f.dimensions.nonEmpty)
+      .map(f => (f.name, f.`type`))
+  }
+
+  private def makeColsCompatible(vectorColNameTypeTuple: Seq[(String, String)],
+                                 df: DataFrame): DataFrame = {
+    vectorColNameTypeTuple.foldLeft(df) { case (accDF, (colName, colType)) =>
+      if (!accDF.columns.contains(colName)) {
+        println(s"Column $colName is specified in either indexJson or vectorCols but not found in dataframe " +
+          s"columns ${accDF.columns.toList}")
+        accDF
+      }
+      else {
+        val colDataType = accDF.schema(colName).dataType
+        assert(colDataType match {
+          case ArrayType(elementType, _) => elementType == FloatType || elementType == DoubleType
+          case VectorType => true
+          case _ => false
+        }, s"Vector column $colName needs to be one of (ArrayType(FloatType), ArrayType(DoubleType), VectorType)")
+        if (colDataType.isInstanceOf[ArrayType]) {
+          accDF.withColumn(colName, accDF(colName).cast(edmTypeToSparkType(colType, None)))
+        } else {
+          // first cast vectorUDT to array, then cast it to correct array type
+          val modifiedDF = accDF.withColumn(colName, vector_to_array(accDF(colName)))
+          modifiedDF.withColumn(colName, modifiedDF(colName).cast(edmTypeToSparkType(colType, None)))
+        }
+      }
+    }
+  }
+
   private def isEdmCollection(t: String): Boolean = {
     t.startsWith("Collection(") && t.endsWith(")")
   }
@@ -290,6 +369,7 @@
       case "Edm.Int64" => LongType
       case "Edm.Int32" => IntegerType
       case "Edm.Double" => DoubleType
+      case "Edm.Single" => FloatType
       case "Edm.DateTimeOffset" => StringType //See if there's a way to use spark datetimes
       case "Edm.GeographyPoint" => StringType
       case "Edm.ComplexType" => StructType(fields.get.map(f =>
@@ -310,10 +390,12 @@
       case IntegerType => ("Edm.Int32", None)
       case LongType => ("Edm.Int64", None)
       case DoubleType => ("Edm.Double", None)
+      case FloatType => ("Edm.Single", None)
       case DateType => ("Edm.DateTimeOffset", None)
       case StructType(fields) => ("Edm.ComplexType", Some(fields.map { f =>
         val (innerType, innerFields) = sparkTypeToEdmType(f.dataType)
-        IndexField(f.name, innerType, None, None, None, None, None, None, None, None, None, None, innerFields)
+        IndexField(f.name, innerType, None, None, None, None, None, None, None, None, None, None, innerFields,
+          None, None) // TODO: Support vector search in nested fields
       }))
     }
   }
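The makeColsCompatible helper above is the piece that normalizes vector columns before writing. As a standalone illustration of the same conversion — Spark ML VectorUDT to array<float>, the representation that maps to Collection(Edm.Single) — a minimal sketch might look like this; the column and session names are invented for the example:

import org.apache.spark.ml.functions.vector_to_array
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{ArrayType, FloatType}

val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

val df = Seq((0, Vectors.dense(0.1, 0.2, 0.3))).toDF("id", "embedding")

// VectorUDT -> array<double> -> array<float>, matching the Edm.Single element type.
val prepped = df
  .withColumn("embedding", vector_to_array($"embedding"))
  .withColumn("embedding", $"embedding".cast(ArrayType(FloatType)))

prepped.printSchema() // embedding: array<float>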
diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearchAPI.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearchAPI.scala
index 9a9860857ee..f30ab9cd923 100644
--- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearchAPI.scala
+++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearchAPI.scala
@@ -14,7 +14,9 @@ import spray.json._
 import scala.util.{Failure, Success, Try}
 
 object AzureSearchAPIConstants {
-  val DefaultAPIVersion = "2019-05-06"
+  val DefaultAPIVersion = "2023-07-01-Preview"
+  val VectorConfigName = "vectorConfig"
+  val VectorSearchAlgorithm = "hnsw"
 }
 import com.microsoft.azure.synapse.ml.cognitive.search.AzureSearchAPIConstants._
 
@@ -39,6 +41,26 @@ trait IndexLister {
   }
 }
 
+trait IndexJsonGetter extends IndexLister {
+  def getIndexJsonFromExistingIndex(key: String,
+                                    serviceName: String,
+                                    indexName: String,
+                                    apiVersion: String = DefaultAPIVersion): String = {
+    val existingIndexNames = getExisting(key, serviceName, apiVersion)
+    assert(existingIndexNames.contains(indexName), s"Cannot find an existing index name with $indexName")
+
+    val indexJsonRequest = new HttpGet(
+      s"https://$serviceName.search.windows.net/indexes/$indexName?api-version=$apiVersion"
+    )
+    indexJsonRequest.setHeader("api-key", key)
+    indexJsonRequest.setHeader("Content-Type", "application/json")
+    val indexJsonResponse = safeSend(indexJsonRequest, close = false)
+    val indexJson = IOUtils.toString(indexJsonResponse.getEntity.getContent, "utf-8")
+    indexJsonResponse.close()
+    indexJson
+  }
+}
+
 object SearchIndex extends IndexParser with IndexLister {
 
   import AzureSearchProtocol._
@@ -94,7 +116,9 @@ object SearchIndex extends IndexParser with IndexLister {
       _ <- validAnalyzer(field.analyzer, field.searchAnalyzer, field.indexAnalyzer)
       _ <- validSearchAnalyzer(field.analyzer, field.searchAnalyzer, field.indexAnalyzer)
       _ <- validIndexAnalyzer(field.analyzer, field.searchAnalyzer, field.indexAnalyzer)
-      _ <- validSynonymMaps(field.synonymMap)
+      _ <- validVectorField(field.dimensions, field.vectorSearchConfiguration)
+      // TODO: Fix and add back validSynonymMaps check. SynonymMaps needs to be Option[Seq[String]] type
+      //_ <- validSynonymMaps(field.synonymMap)
     } yield field
   }
 
@@ -182,6 +206,15 @@ object SearchIndex extends IndexParser with IndexLister {
     }
   }
 
+  private def validVectorField(d: Option[Int], v: Option[String]): Try[Option[String]] = {
+    if ((d.isDefined && v.isEmpty) || (v.isDefined && d.isEmpty)) {
+      Failure(new IllegalArgumentException("Both dimensions and vectorSearchConfig fields need to be defined for " +
+        "vector search"))
+    } else {
+      Success(v)
+    }
+  }
+
   def getStatistics(indexName: String,
                     key: String,
                     serviceName: String,
diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearchSchemas.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearchSchemas.scala
index a8d9142e093..7b0612330c0 100644
--- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearchSchemas.scala
+++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/search/AzureSearchSchemas.scala
@@ -5,7 +5,7 @@ package com.microsoft.azure.synapse.ml.cognitive.search
 
 import com.microsoft.azure.synapse.ml.core.schema.SparkBindings
 import spray.json.DefaultJsonProtocol._
-import spray.json.{JsonFormat, RootJsonFormat}
+import spray.json.{DefaultJsonProtocol, JsonFormat, RootJsonFormat}
 
 object ASResponses extends SparkBindings[ASResponses]
 
@@ -23,9 +23,19 @@ case class IndexInfo(
   tokenizers: Option[Seq[String]],
   tokenFilters: Option[Seq[String]],
   defaultScoringProfile: Option[Seq[String]],
-  corsOptions: Option[Seq[String]]
+  corsOptions: Option[Seq[String]],
+  vectorSearch: Option[VectorSearch]
 )
 
+case class AlgorithmConfigs(
+  name: String,
+  kind: String
+)
+
+case class VectorSearch(
+  algorithmConfigurations: Seq[AlgorithmConfigs]
+)
+
 case class IndexField(
   name: String,
   `type`: String,
@@ -38,21 +48,32 @@ case class IndexField(
   analyzer: Option[String],
   searchAnalyzer: Option[String],
   indexAnalyzer: Option[String],
-  synonymMap: Option[String],
-  fields: Option[Seq[IndexField]]
+  synonymMap: Option[Seq[String]],
+  fields: Option[Seq[IndexField]],
+  dimensions: Option[Int],
+  vectorSearchConfiguration: Option[String]
 )
 
+case class VectorColParams(
+  name: String,
+  dimension: Int
+)
+
 case class IndexStats(documentCount: Int, storageSize: Int)
 
 case class IndexList(`@odata.context`: String, value: Seq[IndexName])
 
 case class IndexName(name: String)
 
-object AzureSearchProtocol {
+object AzureSearchProtocol extends DefaultJsonProtocol {
   implicit val IfEnc: JsonFormat[IndexField] = lazyFormat(jsonFormat(
     IndexField,"name","type","searchable","filterable","sortable",
-    "facetable","retrievable", "key","analyzer","searchAnalyzer", "indexAnalyzer", "synonymMaps", "fields"))
-  implicit val IiEnc: RootJsonFormat[IndexInfo] = jsonFormat10(IndexInfo.apply)
+    "facetable","retrievable", "key","analyzer","searchAnalyzer", "indexAnalyzer", "synonymMaps", "fields",
+    "dimensions", "vectorSearchConfiguration"))
+  implicit val AcEnc: RootJsonFormat[AlgorithmConfigs] = jsonFormat2(AlgorithmConfigs.apply)
+  implicit val VsEnc: RootJsonFormat[VectorSearch] = jsonFormat1(VectorSearch.apply)
+  implicit val IiEnc: RootJsonFormat[IndexInfo] = jsonFormat11(IndexInfo.apply)
   implicit val IsEnc: RootJsonFormat[IndexStats] = jsonFormat2(IndexStats.apply)
   implicit val InEnc: RootJsonFormat[IndexName] = jsonFormat1(IndexName.apply)
   implicit val IlEnc: RootJsonFormat[IndexList] = jsonFormat2(IndexList.apply)
+  implicit val VcpEnc: RootJsonFormat[VectorColParams] = jsonFormat2(VectorColParams.apply)
 }
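A quick way to see what these schema classes serialize to is a spray-json round trip. The sketch below is illustrative only — the index name, field layout, and dimension are made up — but it exercises the new dimensions, vectorSearchConfiguration, and vectorSearch members:

import com.microsoft.azure.synapse.ml.cognitive.search._
import com.microsoft.azure.synapse.ml.cognitive.search.AzureSearchProtocol._
import spray.json._

// A minimal index definition with one key field and one vector field.
val info = IndexInfo(
  Some("example-index"),
  Seq(
    IndexField("id", "Edm.String", None, None, None, None, None, Some(true),
      None, None, None, None, None, None, None),
    IndexField("contentVector", "Collection(Edm.Single)", None, None, None, None, None, None,
      None, None, None, None, None, Some(3), Some("vectorConfig"))
  ),
  None, None, None, None, None, None, None, None,
  Some(VectorSearch(Seq(AlgorithmConfigs("vectorConfig", "hnsw"))))
)

// Serialize to the JSON the Azure Search REST API expects, then parse it back.
val json = info.toJson.compactPrint
val roundTripped = json.parseJson.convertTo[IndexInfo]
assert(roundTripped.name.contains("example-index"))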
diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/cognitive/search/SearchWriterSuite.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/cognitive/search/SearchWriterSuite.scala
index 433a0f17edd..2a92b78d12e 100644
--- a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/cognitive/search/SearchWriterSuite.scala
+++ b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/cognitive/search/SearchWriterSuite.scala
@@ -5,6 +5,7 @@ package com.microsoft.azure.synapse.ml.cognitive.search
 
 import com.microsoft.azure.synapse.ml.Secrets
 import com.microsoft.azure.synapse.ml.cognitive._
+import com.microsoft.azure.synapse.ml.cognitive.openai.{OpenAIAPIKey, OpenAIEmbedding}
 import com.microsoft.azure.synapse.ml.cognitive.vision.AnalyzeImage
 import com.microsoft.azure.synapse.ml.core.test.base.TestBase
 import com.microsoft.azure.synapse.ml.core.test.fuzzing.{TestObject, TransformerFuzzing}
@@ -12,6 +13,7 @@ import com.microsoft.azure.synapse.ml.io.http.RESTHelpers._
 import org.apache.http.client.methods.HttpDelete
 import org.apache.spark.ml.util.MLReadable
 import org.apache.spark.sql.DataFrame
+import org.apache.spark.ml.linalg.Vectors
 
 import java.time.LocalDateTime
 import java.time.format.{DateTimeFormatterBuilder, DateTimeParseException, SignStyle}
@@ -25,8 +27,8 @@ trait AzureSearchKey {
 }
 
 //scalastyle:off null
-class SearchWriterSuite extends TestBase with AzureSearchKey with IndexLister
-  with TransformerFuzzing[AddDocuments] with CognitiveKey {
+class SearchWriterSuite extends TestBase with AzureSearchKey with IndexJsonGetter with IndexParser
+  with TransformerFuzzing[AddDocuments] with CognitiveKey with OpenAIAPIKey {
 
   import spark.implicits._
 
@@ -44,6 +46,12 @@ class SearchWriterSuite extends TestBase with AzureSearchKey with IndexLister
       .toDF("searchAction", "id", "fileName", "text")
   }
 
+  private def createTestDataWithVector(numDocs: Int): DataFrame = {
+    (0 until numDocs)
+      .map(i => ("upload", s"$i", s"file$i", Array(0.001, 0.002, 0.003).map(_ * i)))
+      .toDF("searchAction", "id", "fileName", "vectorCol")
+  }
+
   private def createSimpleIndexJson(indexName: String): String = {
     s"""
        |{
@@ -74,6 +82,43 @@ class SearchWriterSuite extends TestBase with AzureSearchKey with IndexLister
     """.stripMargin
   }
 
+  private def createSimpleIndexJsonWithVector(indexName: String): String = {
+    s"""
+       |{
+       |  "name": "$indexName",
+       |  "fields": [
+       |    {
+       |      "name": "id",
+       |      "type": "Edm.String",
+       |      "key": true,
+       |      "facetable": false
+       |    },
+       |    {
+       |      "name": "fileName",
+       |      "type": "Edm.String",
+       |      "searchable": false,
+       |      "sortable": false,
+       |      "facetable": false
+       |    },
+       |    {
+       |      "name": "vectorCol",
+       |      "type": "Collection(Edm.Single)",
+       |      "dimensions": 3,
+       |      "vectorSearchConfiguration": "vectorConfig"
+       |    }
+       |  ],
+       |  "vectorSearch": {
+       |    "algorithmConfigurations": [
+       |      {
+       |        "name": "vectorConfig",
+       |        "kind": "hnsw"
+       |      }
+       |    ]
+       |  }
+       |}
+    """.stripMargin
+  }
+
   private val createdIndexes: mutable.ListBuffer[String] = mutable.ListBuffer()
 
   private def generateIndexName(): String = {
@@ -105,7 +150,7 @@ class SearchWriterSuite extends TestBase with AzureSearchKey with IndexLister
     println("Cleaning up services")
     val successfulCleanup = getExisting(azureSearchKey, testServiceName)
       .intersect(createdIndexes).map { n =>
-      deleteIndex(n)
+        deleteIndex(n)
     }.forall(_ == 204)
     cleanOldIndexes()
     super.afterAll()
@@ -173,12 +218,15 @@ class SearchWriterSuite extends TestBase with AzureSearchKey with IndexLister
 
   def writeHelper(df: DataFrame,
                   indexName: String,
+                  isVectorField: Boolean,
                   extraParams: Map[String, String] = Map()): Unit = {
+    val indexJson = if (isVectorField) createSimpleIndexJsonWithVector(indexName) else createSimpleIndexJson(indexName)
     AzureSearchWriter.write(df,
       Map("subscriptionKey" -> azureSearchKey,
         "actionCol" -> "searchAction",
         "serviceName" -> testServiceName,
-        "indexJson" -> createSimpleIndexJson(indexName)) ++ extraParams)
+        "indexJson" -> indexJson)
+        ++ extraParams)
   }
 
   def assertSize(indexName: String, size: Int): Unit = {
@@ -186,15 +234,15 @@ class SearchWriterSuite extends TestBase with AzureSearchKey with IndexLister
     ()
   }
 
-  ignore("clean up all search indexes"){
+  ignore("clean up all search indexes") {
     getExisting(azureSearchKey, testServiceName)
       .foreach { n =>
-      val deleteRequest = new HttpDelete(
-        s"https://$testServiceName.search.windows.net/indexes/$n?api-version=2017-11-11")
-      deleteRequest.setHeader("api-key", azureSearchKey)
-      val response = safeSend(deleteRequest)
-      println(s"Deleted index $n, status code ${response.getStatusLine.getStatusCode}")
-    }
+        val deleteRequest = new HttpDelete(
+          s"https://$testServiceName.search.windows.net/indexes/$n?api-version=2017-11-11")
+        deleteRequest.setHeader("api-key", azureSearchKey)
+        val response = safeSend(deleteRequest)
+        println(s"Deleted index $n, status code ${response.getStatusLine.getStatusCode}")
+      }
   }
 
   test("Run azure-search tests with waits") {
@@ -209,17 +257,17 @@ class SearchWriterSuite extends TestBase with AzureSearchKey with IndexLister
 
     //create new index and add docs
     lazy val in1 = generateIndexName()
-    dependsOn(1, writeHelper(df4, in1))
+    dependsOn(1, writeHelper(df4, in1, isVectorField=false))
 
     //push docs to existing index
     lazy val in2 = generateIndexName()
     lazy val dfA = df10.limit(4)
     lazy val dfB = df10.except(dfA)
-    dependsOn(2, writeHelper(dfA, in2))
+    dependsOn(2, writeHelper(dfA, in2, isVectorField=false))
 
     dependsOn(2, retryWithBackoff({
       if (getExisting(azureSearchKey, testServiceName).contains(in2)) {
-        writeHelper(dfB, in2)
+        writeHelper(dfB, in2, isVectorField=false)
       } else {
         throw new RuntimeException("No existing service found")
       }
@@ -227,7 +275,7 @@ class SearchWriterSuite extends TestBase with AzureSearchKey with IndexLister
 
     //push docs with custom batch size
     lazy val in3 = generateIndexName()
-    dependsOn(3, writeHelper(bigDF, in3, Map("batchSize" -> "2000")))
+    dependsOn(3, writeHelper(bigDF, in3, isVectorField=false, Map("batchSize" -> "2000")))
 
     dependsOn(1, retryWithBackoff(assertSize(in1, 4)))
     dependsOn(2, retryWithBackoff(assertSize(in2, 10)))
@@ -276,17 +324,17 @@ class SearchWriterSuite extends TestBase with AzureSearchKey with IndexLister
       .map { i => ("upload", s"$i", s"file$i", s"text$i") }
       .toDF("searchAction", "badkeyname", "fileName", "text")
     assertThrows[IllegalArgumentException] {
-      writeHelper(mismatchDF, generateIndexName())
+      writeHelper(mismatchDF, generateIndexName(), isVectorField=false)
     }
   }
 
   /**
-    * All the Edm Types are nullable in Azure Search except for Collection(Edm.String).
-    * Because it is not possible to store a null value in a Collection(Edm.String) field,
-    * there is an option to set a boolean flag, filterNulls, that will remove null values
-    * from the dataset in the Collection(Edm.String) fields before writing the data to the search index.
-    * The default value for this boolean flag is False.
-    */
+   * All the Edm Types are nullable in Azure Search except for Collection(Edm.String).
+   * Because it is not possible to store a null value in a Collection(Edm.String) field,
+   * there is an option to set a boolean flag, filterNulls, that will remove null values
+   * from the dataset in the Collection(Edm.String) fields before writing the data to the search index.
+   * The default value for this boolean flag is False.
+   */
   test("Handle null values for Collection(Edm.String) fields") {
     val in = generateIndexName()
     val phraseIndex =
@@ -387,4 +435,233 @@ class SearchWriterSuite extends TestBase with AzureSearchKey with IndexLister
     retryWithBackoff(assertSize(in, 2))
   }
 
+  test("Run azure-search tests with vector fields") {
+    val in1 = generateIndexName()
+    val vectorDF4 = createTestDataWithVector(4)
+
+    writeHelper(vectorDF4, in1, isVectorField=true)
+
+    val in2 = generateIndexName()
+    val vectorDF10 = createTestDataWithVector(10)
+    val dfA = vectorDF10.limit(4)
+    val dfB = vectorDF10.except(dfA)
+
+    writeHelper(dfA, in2, isVectorField=true)
+
+    retryWithBackoff({
+      if (getExisting(azureSearchKey, testServiceName).contains(in2)) {
+        writeHelper(dfB, in2, isVectorField=true)
+      } else {
+        throw new RuntimeException("No existing service found")
+      }
+    })
+
+    retryWithBackoff(assertSize(in1, 4))
+    retryWithBackoff(assertSize(in2, 10))
+
+    val indexJson = retryWithBackoff(getIndexJsonFromExistingIndex(azureSearchKey, testServiceName, in1))
+    // assert if vectorCol is a vector field
+    assert(parseIndexJson(indexJson).fields.find(_.name == "vectorCol").get.vectorSearchConfiguration.nonEmpty)
+  }
+
+  test("Infer the structure of the index from the dataframe with vector columns") {
+    val in = generateIndexName()
+    val phraseDF = Seq(
+      ("upload", "0", "file0", Array(1.1, 2.1, 3.1), Vectors.dense(0.11, 0.21, 0.31),
+        Vectors.sparse(3, Array(0, 1, 2), Array(0.11, 0.21, 0.31))),
+      ("upload", "1", "file1", Array(1.2, 2.2, 3.2), Vectors.dense(0.12, 0.22, 0.32),
+        Vectors.sparse(3, Array(0, 1, 2), Array(0.11, 0.21, 0.31))))
+      .toDF("searchAction", "id", "fileName", "vectorCol1", "vectorCol2", "vectorCol3")
+
+    val vectorCols =
+      """
+        |[
+        |  {"name": "vectorCol1", "dimension": 3},
+        |  {"name": "vectorCol2", "dimension": 3},
+        |  {"name": "vectorCol3", "dimension": 3}
+        |]
+        |""".stripMargin
+
+    AzureSearchWriter.write(phraseDF,
+      Map(
+        "subscriptionKey" -> azureSearchKey,
+        "actionCol" -> "searchAction",
+        "serviceName" -> testServiceName,
+        "filterNulls" -> "true",
+        "indexName" -> in,
+        "keyCol" -> "id",
+        "vectorCols" -> vectorCols
+      ))
+
+    retryWithBackoff(assertSize(in, 2))
+
+    // assert if vectorCols are a vector field
+    val indexJson = retryWithBackoff(getIndexJsonFromExistingIndex(azureSearchKey, testServiceName, in))
+    assert(parseIndexJson(indexJson).fields.find(_.name == "vectorCol1").get.vectorSearchConfiguration.nonEmpty)
+    assert(parseIndexJson(indexJson).fields.find(_.name == "vectorCol2").get.vectorSearchConfiguration.nonEmpty)
+    assert(parseIndexJson(indexJson).fields.find(_.name == "vectorCol3").get.vectorSearchConfiguration.nonEmpty)
+  }
+
+  test("Throw useful error when given vector columns in nested fields") {
+    val in = generateIndexName()
+    val badJson =
+      s"""
+         |{
+         |  "name": "$in",
+         |  "fields": [
+         |    {
+         |      "name": "id",
+         |      "type": "Edm.String",
+         |      "key": true,
+         |      "facetable": false
+         |    },
+         |    {
+         |      "name": "someCollection",
+         |      "type": "Edm.String"
+         |    },
+         |    {
+         |      "name": "complexField",
+         |      "type": "Edm.ComplexType",
+         |      "fields": [
+         |        {
+         |          "name": "StreetAddress",
+         |          "type": "Edm.String"
+         |        },
+         |        {
+         |          "name": "contentVector",
+         |          "type": "Collection(Edm.Single)",
+         |          "dimensions": 3,
+         |          "vectorSearchConfiguration": "vectorConfig"
+         |        }
+         |      ]
+         |    }
+         |  ]
+         |}
+    """.stripMargin
+
+    assertThrows[IllegalArgumentException] {
+      AzureSearchWriter.write(df4,
+        Map(
+          "subscriptionKey" -> azureSearchKey,
+          "actionCol" -> "searchAction",
+          "serviceName" -> testServiceName,
+          "filterNulls" -> "true",
+          "indexJson" -> badJson
+        ))
+    }
+  }
+
+  test("Throw useful error when one of dimensions or vectorSearchConfig is not defined") {
+    val in = generateIndexName()
+    val badJson =
+      s"""
+         |{
+         |  "name": "$in",
+         |  "fields": [
+         |    {
+         |      "name": "id",
+         |      "type": "Edm.String",
+         |      "key": true,
+         |      "facetable": false
+         |    },
+         |    {
+         |      "name": "someCollection",
+         |      "type": "Edm.String"
+         |    },
+         |    {
+         |      "name": "contentVector",
+         |      "type": "Collection(Edm.Single)",
+         |      "dimensions": 3
+         |    }
+         |  ]
+         |}
+    """.stripMargin
+
+    assertThrows[IllegalArgumentException] {
+      SearchIndex.createIfNoneExists(azureSearchKey, testServiceName, badJson)
+    }
+  }
+
+  test("Handle non-existent vector column specified in vectorCols option") {
+    val in = generateIndexName()
+    val phraseDF = Seq(
+      ("upload", "0", "file0"),
+      ("upload", "1", "file1"))
+      .toDF("searchAction", "id", "fileName")
+
+    AzureSearchWriter.write(phraseDF,
+      Map(
+        "subscriptionKey" -> azureSearchKey,
+        "actionCol" -> "searchAction",
+        "serviceName" -> testServiceName,
+        "indexName" -> in,
+        "keyCol" -> "id",
+        "vectorCols" -> """[{"name": "vectorCol", "dimension": 3}]"""
+      ))
+
+    retryWithBackoff(assertSize(in, 2))
+  }
+
+  test("Handle non-existing vector column specified in index JSON option") {
+    val in = generateIndexName()
+    val phraseDF = Seq(
+      ("upload", "0", "file0"),
+      ("upload", "1", "file1"))
+      .toDF("searchAction", "id", "fileName")
+
+    AzureSearchWriter.write(phraseDF,
+      Map(
+        "subscriptionKey" -> azureSearchKey,
+        "actionCol" -> "searchAction",
+        "serviceName" -> testServiceName,
+        "indexJson" -> createSimpleIndexJsonWithVector(in)
+      ))
+
+    retryWithBackoff(assertSize(in, 2))
+  }
+
+  test("Throw useful error when the vector column is an unsupported type") {
+    val in = generateIndexName()
+    val badDF = Seq(
+      ("upload", "0", "file0", Array("p1", "p2", "p3")),
+      ("upload", "1", "file1", Array("p4", "p5", "p6")))
+      .toDF("searchAction", "id", "fileName", "vectorCol")
+
+    assertThrows[AssertionError] {
+      writeHelper(badDF, in, isVectorField=true)
+    }
+  }
+
+  test("pipeline with openai embedding") {
+    val in = generateIndexName()
+
+    val df = Seq(
+      ("upload", "0", "this is the first sentence"),
+      ("upload", "1", "this is the second sentence")
+    ).toDF("searchAction", "id", "content")
+
+    val tdf = new OpenAIEmbedding()
+      .setSubscriptionKey(openAIAPIKey)
+      .setDeploymentName("text-embedding-ada-002")
+      .setCustomServiceName(openAIServiceName)
+      .setTextCol("content")
+      .setErrorCol("error")
+      .setOutputCol("vectorContent")
+      .transform(df)
+      .drop("error")
+
+    AzureSearchWriter.write(tdf,
+      Map(
+        "subscriptionKey" -> azureSearchKey,
+        "actionCol" -> "searchAction",
+        "serviceName" -> testServiceName,
+        "indexName" -> in,
+        "keyCol" -> "id",
+        "vectorCols" -> """[{"name": "vectorContent", "dimension": 1536}]"""
+      ))
+
+    retryWithBackoff(assertSize(in, 2))
+    val indexJson = retryWithBackoff(getIndexJsonFromExistingIndex(azureSearchKey, testServiceName, in))
+    assert(parseIndexJson(indexJson).fields.find(_.name == "vectorContent").get.vectorSearchConfiguration.nonEmpty)
+  }
 }
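The notebook changes that follow query the new vector field over REST. For completeness, an equivalent raw request in Scala (using the same Apache HttpClient classes as the code above) might look like the sketch below; the service name, index name, key lookup, and query vector are placeholders:

import org.apache.commons.io.IOUtils
import org.apache.http.client.methods.HttpPost
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClients

val serviceName = "my-search-service" // placeholder
val indexName = "my-vector-index"     // placeholder
val apiKey = sys.env("AZURE_SEARCH_KEY")

val post = new HttpPost(
  s"https://$serviceName.search.windows.net/indexes/$indexName/docs/search?api-version=2023-07-01-Preview")
post.setHeader("api-key", apiKey)
post.setHeader("Content-Type", "application/json")

// k-nearest-neighbor query over the "embeddings" vector field (3-dim toy vector here).
post.setEntity(new StringEntity(
  """{"vector": {"value": [0.1, 0.2, 0.3], "fields": "embeddings", "k": 2}}"""))

val client = HttpClients.createDefault()
val response = client.execute(post)
println(IOUtils.toString(response.getEntity.getContent, "utf-8"))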
diff --git a/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb
index 35be3a8dd3e..6d248ed270f 100644
--- a/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb
+++ b/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb
@@ -148,7 +148,7 @@
     "\n",
     "# Azure Cognitive Search\n",
     "cogsearch_name = \"mmlspark-azure-search\"\n",
-    "cogsearch_index_name = \"exampleindex\"\n",
+    "cogsearch_index_name = \"examplevectorindex\"\n",
     "cogsearch_api_key = find_secret(\"azure-search-key\")"
    ]
   },
@@ -612,12 +612,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Import necessary packages\n",
-    "import requests\n",
-    "import json\n",
-    "\n",
-    "EMBEDDING_LENGTH = (\n",
-    "    1536  # length of the embedding vector (OpenAI generates embeddings of length 1536)\n",
+    "from pyspark.sql.functions import monotonically_increasing_id\n",
+    "from pyspark.sql.functions import lit\n",
+    "\n",
+    "df_embeddings = (\n",
+    "    df_embeddings.drop(\"error\")\n",
+    "    .withColumn(\n",
+    "        \"idx\", monotonically_increasing_id().cast(\"string\")\n",
+    "    )  # create index ID for ACS\n",
+    "    .withColumn(\"searchAction\", lit(\"upload\"))\n",
     ")"
    ]
   },
@@ -627,148 +630,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Create Index for Cog Search with fields as id, content, and contentVector\n",
-    "# Note the datatypes for each field below\n",
-    "\n",
-    "url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}?api-version=2023-07-01-Preview\"\n",
-    "payload = json.dumps(\n",
-    "    {\n",
-    "        \"name\": cogsearch_index_name,\n",
-    "        \"fields\": [\n",
-    "            {\"name\": \"id\", \"type\": \"Edm.String\", \"key\": True, \"filterable\": True},\n",
-    "            {\n",
-    "                \"name\": \"content\",\n",
-    "                \"type\": \"Edm.String\",\n",
-    "                \"searchable\": True,\n",
-    "                \"retrievable\": True,\n",
-    "            },\n",
-    "            {\n",
-    "                \"name\": \"contentVector\",\n",
-    "                \"type\": \"Collection(Edm.Single)\",\n",
-    "                \"searchable\": True,\n",
-    "                \"retrievable\": True,\n",
-    "                \"dimensions\": EMBEDDING_LENGTH,\n",
-    "                \"vectorSearchConfiguration\": \"vectorConfig\",\n",
-    "            },\n",
-    "        ],\n",
-    "        \"vectorSearch\": {\n",
-    "            \"algorithmConfigurations\": [\n",
-    "                {\n",
-    "                    \"name\": \"vectorConfig\",\n",
-    "                    \"kind\": \"hnsw\",\n",
-    "                }\n",
-    "            ]\n",
-    "        },\n",
-    "    }\n",
-    ")\n",
-    "headers = {\"Content-Type\": \"application/json\", \"api-key\": cogsearch_api_key}\n",
-    "\n",
-    "response = requests.request(\"PUT\", url, headers=headers, data=payload)\n",
-    "print(response.status_code)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
-     "inputWidgets": {},
-     "nuid": "07396763-74c3-4299-8976-e15e6d510d47",
-     "showTitle": false,
-     "title": ""
-    },
-    "nteract": {
-     "transient": {
-      "deleting": false
-     }
-    }
-   },
-   "source": [
-    "We need to use User Defined Function (UDF) through the udf() method in order to apply functions directly to the DataFrames and SQL databases in Python, without any need to individually register them."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Use Spark's UDF to insert entries to Cognitive Search\n",
-    "# This allows to run the code in a distributed fashion\n",
-    "\n",
-    "# Define a UDF using the @udf decorator\n",
-    "@udf(returnType=StringType())\n",
-    "def insert_to_cog_search(idx, content, contentVector):\n",
-    "    url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/index?api-version=2023-07-01-Preview\"\n",
-    "\n",
-    "    payload = json.dumps(\n",
-    "        {\n",
-    "            \"value\": [\n",
-    "                {\n",
-    "                    \"id\": str(idx),\n",
-    "                    \"content\": content,\n",
-    "                    \"contentVector\": contentVector.tolist(),\n",
-    "                    \"@search.action\": \"upload\",\n",
-    "                },\n",
-    "            ]\n",
-    "        }\n",
-    "    )\n",
-    "    headers = {\n",
-    "        \"Content-Type\": \"application/json\",\n",
-    "        \"api-key\": cogsearch_api_key,\n",
-    "    }\n",
-    "\n",
-    "    response = requests.request(\"POST\", url, headers=headers, data=payload)\n",
-    "    # response.text\n",
-    "\n",
-    "    if response.status_code == 200 or response.status_code == 201:\n",
-    "        return \"Success\"\n",
-    "    else:\n",
-    "        return \"Failure\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
-     "inputWidgets": {},
-     "nuid": "42688e00-98fb-406e-9f19-c89fed3248ef",
-     "showTitle": false,
-     "title": ""
-    }
-   },
-   "source": [
-    "In the following, we apply UDF to different columns. Note that UDF also helps to add new columns to the DataFrame."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Apply the UDF on the different columns\n",
-    "from pyspark.sql.functions import monotonically_increasing_id\n",
-    "\n",
-    "df_embeddings = df_embeddings.withColumn(\n",
-    "    \"idx\", monotonically_increasing_id()\n",
-    ") ## adding a column with id\n",
-    "df_embeddings = df_embeddings.withColumn(\n",
-    "    \"errorCogSearch\",\n",
-    "    insert_to_cog_search(\n",
-    "        df_embeddings[\"idx\"], df_embeddings[\"chunk\"], df_embeddings[\"embeddings\"]\n",
-    "    ),\n",
-    ")\n",
-    "\n",
-    "# Show the transformed DataFrame\n",
-    "df_embeddings.show()"
+    "from synapse.ml.cognitive import writeToAzureSearch\n",
+    "import json\n",
+    "\n",
+    "df_embeddings.writeToAzureSearch(\n",
+    "    subscriptionKey=cogsearch_api_key,\n",
+    "    actionCol=\"searchAction\",\n",
+    "    serviceName=cogsearch_name,\n",
+    "    indexName=cogsearch_index_name,\n",
+    "    keyCol=\"idx\",\n",
+    "    vectorCols=json.dumps([{\"name\": \"embeddings\", \"dimension\": 1536}]),\n",
+    ")"
    ]
   },
@@ -833,6 +705,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import requests\n",
+    "\n",
     "# Ask a question and convert to embeddings\n",
     "\n",
     "\n",
@@ -861,7 +735,7 @@
     "    url = f\"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/search?api-version=2023-07-01-Preview\"\n",
     "\n",
     "    payload = json.dumps(\n",
-    "        {\"vector\": {\"value\": question_embedding, \"fields\": \"contentVector\", \"k\": 2}}\n",
+    "        {\"vector\": {\"value\": question_embedding, \"fields\": \"embeddings\", \"k\": k}}\n",
     "    )\n",
     "    headers = {\n",
    "        \"Content-Type\": \"application/json\",\n",
@@ -996,7 +870,7 @@
     "\n",
     "\n",
     "# Concatenate the content of retrieved documents\n",
-    "context = [i[\"content\"] for i in output[\"value\"]]\n",
+    "context = [i[\"chunk\"] for i in output[\"value\"]]\n",
     "\n",
     "# Make a Quesion Answer chain function and pass\n",
     "qa_chain = qa_chain_func()\n",
@@ -1012,5 +886,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 5
 }

From 3d1a7fc4c4a3e7177f4863c930e4595859bf5413 Mon Sep 17 00:00:00 2001
From: Brendan Walsh <37676373+BrendanWalsh@users.noreply.github.com>
Date: Mon, 11 Sep 2023 15:38:21 -0700
Subject: [PATCH 23/23] fix: updated gpt-review to version 0.9.5 to fix break
 (#2069)

Verified that 0.9.4 has a dependency issue and reached out to the owner,
who kindly cut a new release
---
 .github/workflows/on-pull-request-target-review.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/on-pull-request-target-review.yml b/.github/workflows/on-pull-request-target-review.yml
index d901d9d3269..47e569c66eb 100644
--- a/.github/workflows/on-pull-request-target-review.yml
+++ b/.github/workflows/on-pull-request-target-review.yml
@@ -10,7 +10,7 @@ jobs:
     name: Azure OpenAI PR Comment
     steps:
       - id: review
-        uses: microsoft/gpt-review@v0.9.4
+        uses: microsoft/gpt-review@v0.9.5
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          AZURE_OPENAI_API: ${{ secrets.AZURE_OPENAI_API }}