From 0ad58b71d90c61e60a0afedbc742c0a9512bd123 Mon Sep 17 00:00:00 2001 From: Mark Hamilton Date: Mon, 14 Oct 2024 12:44:51 -0400 Subject: [PATCH] chore: add 1.0.7 docs (#2299) --- .../version-1.0.7/Deploy Models/Overview.md | 228 ++++ .../Quickstart - Deploying a Classifier.md | 114 ++ ... Usage - Async, Batching, and Multi-Key.md | 149 +++ .../AI Services/Geospatial Services.md | 276 +++++ .../Multivariate Anomaly Detection.md | 321 +++++ .../AI Services/Overview.md | 563 +++++++++ .../Quickstart - Analyze Celebrity Quotes.md | 162 +++ .../AI Services/Quickstart - Analyze Text.md | 273 +++++ .../Quickstart - Create Audiobooks.md | 121 ++ ...ckstart - Create a Visual Search Engine.md | 95 ++ ...cument Question and Answering with PDFs.md | 337 ++++++ .../AI Services/Quickstart - Flooding Risk.md | 193 +++ .../Quickstart - Predictive Maintenance.md | 188 +++ .../Quickstart - Isolation Forests.md | 505 ++++++++ .../Causal Inference/Overview.md | 61 + .../Quickstart - Measure Causal Effects.md | 91 ++ ...ckstart - Measure Heterogeneous Effects.md | 143 +++ ...t - Synthetic difference in differences.md | 231 ++++ .../Quickstart - SparkML vs SynapseML.md | 216 ++++ .../Quickstart - Train Classifier.md | 55 + ...ckstart - Vowpal Wabbit on Tabular Data.md | 82 ++ ...Quickstart - Vowpal Wabbit on Text Data.md | 213 ++++ .../Deep Learning/Distributed Training.md | 76 ++ .../Deep Learning/Getting Started.md | 42 + .../Explore Algorithms/Deep Learning/ONNX.md | 108 ++ ...uickstart - Fine-tune a Text Classifier.md | 105 ++ ...ckstart - Fine-tune a Vision Classifier.md | 139 +++ .../Quickstart - ONNX Model Inference.md | 172 +++ ...Transfer Learn for Image Classification.md | 139 +++ .../Hyperparameter Tuning/HyperOpt.md | 335 ++++++ .../Quickstart - Random Search.md | 99 ++ .../Explore Algorithms/LightGBM/Overview.md | 262 +++++ ...Classification, Ranking, and Regression.md | 302 +++++ .../Explore Algorithms/OpenAI/Langchain.md | 236 ++++ .../Explore Algorithms/OpenAI/OpenAI.md 
| 294 +++++ ...m Embeddings and Approximate KNN on GPU.md | 215 ++++ ...rt - OpenAI Embedding and GPU based KNN.md | 202 ++++ .../OpenAI/Quickstart - OpenAI Embedding.md | 207 ++++ ...uickstart - Understand and Search Forms.md | 376 ++++++ .../OpenCV/Image Transformations.md | 119 ++ .../Other Algorithms/Cyber ML.md | 82 ++ ...Quickstart - Anomalous Access Detection.md | 348 ++++++ ...ckstart - Exploring Art Across Cultures.md | 221 ++++ .../Smart Adaptive Recommendations.md | 183 +++ .../Regression/Quickstart - Data Cleaning.md | 151 +++ .../Quickstart - Train Regressor.md | 220 ++++ ...Quickstart - Vowpal Wabbit and LightGBM.md | 236 ++++ .../Responsible AI/Data Balance Analysis.md | 215 ++++ .../Responsible AI/Explanation Dashboard.md | 222 ++++ .../Responsible AI/Image Explainers.md | 227 ++++ .../Interpreting Model Predictions.md | 174 +++ .../Responsible AI/PDP and ICE Explainers.md | 519 +++++++++ .../Quickstart - Data Balance Analysis.md | 331 ++++++ .../Quickstart - Snow Leopard Detection.md | 260 +++++ .../Responsible AI/Tabular Explainers.md | 169 +++ .../Responsible AI/Text Explainers.md | 138 +++ .../Vowpal Wabbit/Contextual Bandits.md | 118 ++ .../Multi-class classification.md | 99 ++ .../Vowpal Wabbit/Overview.md | 112 ++ ... 
- Classification using SparkML Vectors.md | 99 ++ ...- Classification using VW-native Format.md | 107 ++ ...on, Quantile Regression, and Regression.md | 568 +++++++++ .../Get Started/Create a Spark Cluster.md | 32 + .../Get Started/Install SynapseML.md | 186 +++ .../Quickstart - Your First Models.md | 70 ++ .../Get Started/Set up Cognitive Services.md | 129 ++ .../versioned_docs/version-1.0.7/Overview.md | 29 + .../Quick Examples/estimators/_LightGBM.md | 164 +++ .../Quick Examples/estimators/_VW.md | 112 ++ .../estimators/causal/_causalInferenceDML.md | 100 ++ .../estimators/cognitive/_MAD.md | 97 ++ .../Quick Examples/estimators/core/_AutoML.md | 214 ++++ .../estimators/core/_Featurize.md | 332 ++++++ .../estimators/core/_IsolationForest.md | 65 ++ .../Quick Examples/estimators/core/_NN.md | 92 ++ .../estimators/core/_Recommendation.md | 379 ++++++ .../Quick Examples/estimators/core/_Stages.md | 219 ++++ .../Quick Examples/estimators/core/_Train.md | 171 +++ .../estimators/estimators_causal.md | 13 + .../estimators/estimators_cognitive.md | 12 + .../estimators/estimators_core.md | 42 + .../estimators/estimators_lightgbm.md | 13 + .../estimators/estimators_vw.md | 13 + .../Quick Examples/transformers/_OpenCV.md | 121 ++ .../Quick Examples/transformers/_VW.md | 297 +++++ .../cognitive/_AnomalyDetection.md | 319 +++++ .../transformers/cognitive/_AzureSearch.md | 153 +++ .../cognitive/_BingImageSearch.md | 95 ++ .../transformers/cognitive/_ComputerVision.md | 547 +++++++++ .../transformers/cognitive/_Face.md | 414 +++++++ .../transformers/cognitive/_FormRecognizer.md | 616 ++++++++++ .../transformers/cognitive/_SpeechToText.md | 144 +++ .../transformers/cognitive/_TextAnalytics.md | 416 +++++++ .../transformers/cognitive/_Translator.md | 492 ++++++++ .../transformers/core/_Explainers.md | 574 +++++++++ .../transformers/core/_Featurize.md | 282 +++++ .../Quick Examples/transformers/core/_IO.md | 341 ++++++ .../transformers/core/_Image.md | 150 +++ 
.../transformers/core/_Stages.md | 1038 +++++++++++++++++ .../transformers/core/_Train.md | 151 +++ .../transformers/deep_learning/_ONNXModel.md | 46 + .../transformers/transformers_cognitive.md | 55 + .../transformers/transformers_core.md | 38 + .../transformers_deep_learning.md | 10 + .../transformers/transformers_opencv.md | 13 + .../transformers/transformers_vw.md | 13 + .../Reference/Contributor Guide.md | 89 ++ .../Reference/Developer Setup.md | 141 +++ .../version-1.0.7/Reference/Docker Setup.md | 292 +++++ .../version-1.0.7/Reference/Dotnet Setup.md | 247 ++++ .../Quickstart - LightGBM in Dotnet.md | 126 ++ .../version-1.0.7/Reference/R Setup.md | 150 +++ .../Use with MLFlow/Autologging.md | 85 ++ .../version-1.0.7/Use with MLFlow/Install.md | 4 + .../version-1.0.7/Use with MLFlow/Overview.md | 199 ++++ .../version-1.0.7-sidebars.json | 190 +++ website/versions.json | 1 + 117 files changed, 23177 insertions(+) create mode 100644 website/versioned_docs/version-1.0.7/Deploy Models/Overview.md create mode 100644 website/versioned_docs/version-1.0.7/Deploy Models/Quickstart - Deploying a Classifier.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Geospatial Services.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Multivariate Anomaly Detection.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Overview.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Analyze Celebrity Quotes.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Analyze Text.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Create Audiobooks.md create mode 100644 
website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Create a Visual Search Engine.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Flooding Risk.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Predictive Maintenance.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Overview.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Quickstart - Measure Causal Effects.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Quickstart - Measure Heterogeneous Effects.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Quickstart - Synthetic difference in differences.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - SparkML vs SynapseML.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - Train Classifier.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Tabular Data.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Text Data.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Distributed Training.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Getting Started.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/ONNX.md create mode 
100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Text Classifier.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Vision Classifier.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - ONNX Model Inference.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - Transfer Learn for Image Classification.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Hyperparameter Tuning/HyperOpt.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Hyperparameter Tuning/Quickstart - Random Search.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/LightGBM/Overview.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/LightGBM/Quickstart - Classification, Ranking, and Regression.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Langchain.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/OpenAI.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding and GPU based KNN.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/OpenCV/Image Transformations.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Cyber ML.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Other 
Algorithms/Quickstart - Anomalous Access Detection.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Quickstart - Exploring Art Across Cultures.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Smart Adaptive Recommendations.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Regression/Quickstart - Data Cleaning.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Regression/Quickstart - Train Regressor.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Regression/Quickstart - Vowpal Wabbit and LightGBM.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Responsible AI/Data Balance Analysis.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Responsible AI/Explanation Dashboard.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Responsible AI/Image Explainers.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Responsible AI/Interpreting Model Predictions.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Responsible AI/PDP and ICE Explainers.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Responsible AI/Quickstart - Data Balance Analysis.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Responsible AI/Quickstart - Snow Leopard Detection.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Responsible AI/Tabular Explainers.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Responsible AI/Text Explainers.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Vowpal Wabbit/Contextual Bandits.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Vowpal Wabbit/Multi-class classification.md create mode 100644 
website/versioned_docs/version-1.0.7/Explore Algorithms/Vowpal Wabbit/Overview.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Vowpal Wabbit/Quickstart - Classification using SparkML Vectors.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Vowpal Wabbit/Quickstart - Classification using VW-native Format.md create mode 100644 website/versioned_docs/version-1.0.7/Explore Algorithms/Vowpal Wabbit/Quickstart - Classification, Quantile Regression, and Regression.md create mode 100644 website/versioned_docs/version-1.0.7/Get Started/Create a Spark Cluster.md create mode 100644 website/versioned_docs/version-1.0.7/Get Started/Install SynapseML.md create mode 100644 website/versioned_docs/version-1.0.7/Get Started/Quickstart - Your First Models.md create mode 100644 website/versioned_docs/version-1.0.7/Get Started/Set up Cognitive Services.md create mode 100644 website/versioned_docs/version-1.0.7/Overview.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/_LightGBM.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/_VW.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/causal/_causalInferenceDML.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/cognitive/_MAD.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/core/_AutoML.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/core/_Featurize.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/core/_IsolationForest.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/core/_NN.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/core/_Recommendation.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/core/_Stages.md create mode 100644 
website/versioned_docs/version-1.0.7/Quick Examples/estimators/core/_Train.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/estimators_causal.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/estimators_cognitive.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/estimators_core.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/estimators_lightgbm.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/estimators/estimators_vw.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/_OpenCV.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/_VW.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/cognitive/_AnomalyDetection.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/cognitive/_AzureSearch.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/cognitive/_BingImageSearch.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/cognitive/_ComputerVision.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/cognitive/_Face.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/cognitive/_FormRecognizer.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/cognitive/_SpeechToText.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/cognitive/_TextAnalytics.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/cognitive/_Translator.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/core/_Explainers.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/core/_Featurize.md create mode 
100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/core/_IO.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/core/_Image.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/core/_Stages.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/core/_Train.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/deep_learning/_ONNXModel.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/transformers_cognitive.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/transformers_core.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/transformers_deep_learning.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/transformers_opencv.md create mode 100644 website/versioned_docs/version-1.0.7/Quick Examples/transformers/transformers_vw.md create mode 100644 website/versioned_docs/version-1.0.7/Reference/Contributor Guide.md create mode 100644 website/versioned_docs/version-1.0.7/Reference/Developer Setup.md create mode 100644 website/versioned_docs/version-1.0.7/Reference/Docker Setup.md create mode 100644 website/versioned_docs/version-1.0.7/Reference/Dotnet Setup.md create mode 100644 website/versioned_docs/version-1.0.7/Reference/Quickstart - LightGBM in Dotnet.md create mode 100644 website/versioned_docs/version-1.0.7/Reference/R Setup.md create mode 100644 website/versioned_docs/version-1.0.7/Use with MLFlow/Autologging.md create mode 100644 website/versioned_docs/version-1.0.7/Use with MLFlow/Install.md create mode 100644 website/versioned_docs/version-1.0.7/Use with MLFlow/Overview.md create mode 100644 website/versioned_sidebars/version-1.0.7-sidebars.json diff --git a/website/versioned_docs/version-1.0.7/Deploy Models/Overview.md b/website/versioned_docs/version-1.0.7/Deploy 
Models/Overview.md new file mode 100644 index 0000000000..4d0f54ea18 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Deploy Models/Overview.md @@ -0,0 +1,228 @@ +--- +title: Spark Serving +hide_title: true +sidebar_label: About +--- + + + +# Spark Serving + +### An Engine for Deploying Spark Jobs as Distributed Web Services + +- **Distributed**: Takes full advantage of Node, JVM, and thread level + parallelism that Spark is famous for. +- **Fast**: No single node bottlenecks, no round trips to Python. + Requests can be routed directly to and from worker JVMs through + network switches. Spin up a web service in a matter of seconds. +- **Low Latency**: When using continuous serving, + you can achieve latencies as low as 1 millisecond. +- **Deployable Anywhere**: Works anywhere that runs Spark such as + Databricks, HDInsight, AZTK, DSVMs, local, or on your own + cluster. Usable from Spark, PySpark, and SparklyR. +- **Lightweight**: No dependence on costly Kafka or + Kubernetes clusters. +- **Idiomatic**: Uses the same API as batch and structured streaming. +- **Flexible**: Spin up and manage several services on a single Spark + cluster. Synchronous and Asynchronous service management and + extensibility. Deploy any spark job that is expressible as a + structured streaming query. Use serving sources/sinks with other + Spark data sources/sinks for more complex deployments. + +## Usage + +### Jupyter Notebook Examples + +- [Deploy a classifier trained on the Adult Census Dataset](../Quickstart%20-%20Deploying%20a%20Classifier) +- More coming soon! 
+ +### Spark Serving Hello World + +```python +import synapse.ml +import pyspark +from pyspark.sql.functions import udf, col, length +from pyspark.sql.types import * + +df = spark.readStream.server() \ + .address("localhost", 8888, "my_api") \ + .load() \ + .parseRequest(StructType().add("foo", StringType()).add("bar", IntegerType())) + +replies = df.withColumn("fooLength", length(col("foo")))\ + .makeReply("fooLength") + +server = replies\ + .writeStream \ + .server() \ + .replyTo("my_api") \ + .queryName("my_query") \ + .option("checkpointLocation", "file:///path/to/checkpoints") \ + .start() +``` + +### Deploying a Deep Network with the CNTKModel + +```python +import synapse.ml +from synapse.ml.cntk import CNTKModel +import pyspark +from pyspark.sql.functions import udf, col + +df = spark.readStream.server() \ + .address("localhost", 8888, "my_api") + .load() + .parseRequest() + +# See notebook examples for how to create and save several +# examples of CNTK models +network = CNTKModel.load("file:///path/to/my_cntkmodel.mml") + +transformed_df = network.transform(df).makeReply() + +server = transformed_df \ + .writeStream \ + .server() \ + .replyTo("my_api") \ + .queryName("my_query") \ + .option("checkpointLocation", "file:///path/to/checkpoints") \ + .start() +``` + +## Architecture + +Spark Serving adds special streaming sources and sinks to turn any +structured streaming job into a web service. Spark Serving comes +with two deployment options that vary based on what form of load balancing +is being used. + +In brief you can use: +`spark.readStream.server()`: For head node load balanced services +`spark.readStream.distributedServer()`: For custom load balanced services +`spark.readStream.continuousServer()`: For a custom load balanced, submillisecond-latency continuous server + +to create the various different serving dataframes and use the equivalent statements after `df.writeStream` +for replying to the web requests. 
+ +### Head Node Load Balanced + +You can deploy head node load balancing with the `HTTPSource` and +`HTTPSink` classes. This mode spins up a queue on the head node, +distributes work across partitions, then collects response data back to +the head node. All HTTP requests are kept and replied to on the head +node. In both Python and Scala these classes can be accessed by using +`spark.readStream.server()` after importing SynapseML. +This mode allows for more complex windowing, repartitioning, and +SQL operations. This option is also ideal for rapid setup and testing, +as it doesn't require any further load balancing or network +switches. A diagram of this configuration can be seen in this image: + +

+ +

+ +### Fully Distributed (Custom Load Balancer) + +You can configure Spark Serving for a custom load balancer using the +`DistributedHTTPSource` and `DistributedHTTPSink` classes. This mode +spins up servers on each executor JVM. +In both Python and Scala these classes can be accessed by using +`spark.readStream.distributedServer()` after importing SynapseML. +Each server will feed its +executor's partitions in parallel. This mode is key for high throughput +and low latency as data doesn't need to be transferred to and from the +head node. This deployment results in several web services that all +route into the same spark computation. You can deploy an external load +balancer to unify the executor's services under a single IP address. +Support for automatic load balancer management and deployment is +targeted for the next release of SynapseML. A diagram of this +configuration can be seen here: + +

+ +

+ +Queries that involve data movement across workers, such as a nontrivial +SQL join, need special consideration. The user must ensure that the +right machine replies to each request. One can route data back to the +originating partition with a broadcast join. In the future, request +routing will be automatically handled by the sink. + +### Sub-Millisecond Latency with Continuous Processing + +

+ +

+ +Continuous processing can be enabled by hooking into the `HTTPSourceV2` class using: + + spark.readStream.continuousServer() + ... + +In continuous serving, much like continuous streaming, you need to add a trigger to your write statement: + + df.writeStream + .continuousServer() + .trigger(continuous="1 second") + ... + +The architecture is similar to the custom load balancer setup described earlier. +More specifically, Spark will manage a web service on each partition. +These webservices can be unified together using an Azure Load Balancer, +Kubernetes Service Endpoint, Azure Application gateway or any other way to load balance a distributed service. +It's currently the user's responsibility to optionally unify these services as they see fit. +In the future, we'll include options to dynamically spin up and manage a load balancer. + +#### Databricks Setup + +Databricks is a managed architecture and they've restricted +all incoming traffic to the nodes of the cluster. +If you create a web service in your databricks cluster (head or worker nodes), +your cluster can communicate with the service, but the outside world can't. +However, in the future, Databricks will support Virtual Network Injection, so this problem will not arise. +In the meantime, you must use SSH tunneling to forward the services to one or more other machines +to act as a networking gateway. This machine can be any machine that accepts SSH traffic and requests. +We have included settings to automatically configure this SSH tunneling for convenience. + +##### Linux Gateway Setup - Azure + +1. [Create a Linux VM using SSH](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/quick-create-portal) +2. [Open ports 8000-9999 from the Azure portal](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/nsg-quickstart-portal) +3. 
Open the port on the firewall on the VM + ```$xslt + firewall-cmd --zone=public --add-port=8000-10000/tcp --permanent + firewall-cmd --reload + echo "GatewayPorts yes" >> /etc/ssh/sshd_config + service ssh --full-restart + ``` +4. Add your private key to a private container in [Azure Storage Blob](https://docs.microsoft.com/en-us/azure/storage/common/storage-quickstart-create-account?toc=%2Fazure%2Fstorage%2Fblobs%2Ftoc.json&tabs=portal). +5. Generate a SAS link for your key and save it. +6. Include the following parameters on your reader to configure the SSH tunneling: + serving_inputs = (spark.readStream.continuousServer() + .option("numPartitions", 1) + .option("forwarding.enabled", True) # enable ssh forwarding to a gateway machine + .option("forwarding.username", "username") + .option("forwarding.sshHost", "ip or dns") + .option("forwarding.keySas", "SAS url from the previous step") + .address("localhost", 8904, "my_api") + .load() + +This setup will make your service require an extra jump and affect latency. +It's important to pick a gateway that has good connectivity to your spark cluster. +For best performance and ease of configuration, we suggest using Spark Serving +on an open cluster environment such as Kubernetes, Mesos, or Azure Batch. + + +## Parameters + +| Parameter Name | Description | Necessary | Default Value | Applicable When | +| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------------- | ----------------------------------------------------------------------------------------------------- | +| host | The host to spin up a server on | Yes | | | +| port | The starting port when creating the web services. Web services will increment this port several times to find an open port. 
In the future, the flexibility of this param will be expanded | yes | | | +| name | The Path of the api a user would call. The format is `hostname:port/name` | yes | | | +| forwarding.enabled | Whether to forward the services to a gateway machine | no | false | When you need to forward services out of a protected network. Only Supported for Continuous Serving. | +| forwarding.username | the username to connect to on the remote host | no | | | +| forwarding.sshport | the port to ssh connect to | no | 22 | | +| forwarding.sshHost | the host of the gateway machine | no | | | +| forwarding.keySas | A Secure access link that can be used to automatically download the required ssh private key | no | | Sometimes more convenient than a directory | +| forwarding.keyDir | A directory on the machines holding the private key | no | "~/.ssh" | Useful if you can't send keys over the wire securely | diff --git a/website/versioned_docs/version-1.0.7/Deploy Models/Quickstart - Deploying a Classifier.md b/website/versioned_docs/version-1.0.7/Deploy Models/Quickstart - Deploying a Classifier.md new file mode 100644 index 0000000000..8b76a1fb1a --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Deploy Models/Quickstart - Deploying a Classifier.md @@ -0,0 +1,114 @@ +--- +title: Quickstart - Deploying a Classifier +hide_title: true +status: stable +--- +## Model Deployment with Spark Serving +In this example, we try to predict incomes from the *Adult Census* dataset. Then we will use Spark serving to deploy it as a realtime web service. 
+First, we import needed packages: + +Now let's read the data and split it to train and test sets: + + +```python +data = spark.read.parquet( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet" +) +data = data.select(["education", "marital-status", "hours-per-week", "income"]) +train, test = data.randomSplit([0.75, 0.25], seed=123) +train.limit(10).toPandas() +``` + +`TrainClassifier` can be used to initialize and fit a model, it wraps SparkML classifiers. +You can use `help(synapse.ml.TrainClassifier)` to view the different parameters. + +Note that it implicitly converts the data into the format expected by the algorithm. More specifically it: + tokenizes, hashes strings, one-hot encodes categorical variables, assembles the features into a vector +etc. The parameter `numFeatures` controls the number of hashed features. + + +```python +from synapse.ml.train import TrainClassifier +from pyspark.ml.classification import LogisticRegression + +model = TrainClassifier( + model=LogisticRegression(), labelCol="income", numFeatures=256 +).fit(train) +``` + +After the model is trained, we score it against the test dataset and view metrics. + + +```python +from synapse.ml.train import ComputeModelStatistics, TrainedClassifierModel + +prediction = model.transform(test) +prediction.printSchema() +``` + + +```python +metrics = ComputeModelStatistics().transform(prediction) +metrics.limit(10).toPandas() +``` + +First, we will define the webservice input/output. 
+For more information, you can visit the [documentation for Spark Serving](https://github.com/Microsoft/SynapseML/blob/master/docs/mmlspark-serving.md) + + +```python +from pyspark.sql.types import * +from synapse.ml.io import * +import uuid + +serving_inputs = ( + spark.readStream.server() + .address("localhost", 8898, "my_api") + .option("name", "my_api") + .load() + .parseRequest("my_api", test.schema) +) + +serving_outputs = model.transform(serving_inputs).makeReply("prediction") + +server = ( + serving_outputs.writeStream.server() + .replyTo("my_api") + .queryName("my_query") + .option("checkpointLocation", "file:///tmp/checkpoints-{}".format(uuid.uuid1())) + .start() +) +``` + +Test the webservice + + +```python +import requests + +data = '{"education":" 10th","marital-status":"Divorced","hours-per-week":40.0}' +r = requests.post(data=data, url="http://localhost:8898/my_api") +print("Response {}".format(r.text)) +``` + + +```python +import requests + +data = '{"education":" Masters","marital-status":"Married-civ-spouse","hours-per-week":40.0}' +r = requests.post(data=data, url="http://localhost:8898/my_api") +print("Response {}".format(r.text)) +``` + + +```python +import time + +time.sleep(20) # wait for server to finish setting up (just to be safe) +server.stop() +``` + + +```python + +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.md new file mode 100644 index 0000000000..18617439f4 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Advanced Usage - Async, Batching, and Multi-Key.md @@ -0,0 +1,149 @@ +--- +title: Advanced Usage - Async, Batching, and Multi-Key +hide_title: true +status: stable +--- +# AI Services Advanced Guide: Asynchrony, Batching, Multi-Key + +## Step 1: Imports and Keys + + +``` +from 
synapse.ml.core.platform import find_secret + +service_key = find_secret( + secret_name="ai-services-api-key", keyvault="mmlspark-build-keys" +) +service_loc = "eastus" +``` + +## Step 2: Basic Usage + +Image 1 | Image 2 | Image 3 +:-------------------------:|:-------------------------:|:----------------------:| +! | | + + +``` +from synapse.ml.services.vision import AnalyzeImage + +# Create a dataframe with the image URLs +base_url = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/" +image_df = spark.createDataFrame( + [(base_url + "objects.jpg",), (base_url + "dog.jpg",), (base_url + "house.jpg",)], + ["image"], +) + +# Run the Computer Vision service. Analyze Image extracts information from/about the images. +analyzer = ( + AnalyzeImage() + .setLocation(service_loc) + .setSubscriptionKey(service_key) + .setVisualFeatures( + ["Categories", "Color", "Description", "Faces", "Objects", "Tags"] + ) + .setOutputCol("analysis_results") + .setImageUrlCol("image") + .setErrorCol("error") +) + +image_results = analyzer.transform(image_df).cache() +``` + +#### First we'll look at the full response objects: + + +``` +display(image_results) +``` + +#### We can select out just what we need: + + +``` +display(image_results.select("analysis_results.description.captions.text")) +``` + +#### What's going on under the hood + + + +When we call the AI service transformer, we start cognitive service clients on each of your spark workers. +These clients send requests to the cloud, and turn the JSON responses into Spark Struct Types so that you can access any field that the service returns. + +## Step 3: Asynchronous Usage + + + +Apache Spark ordinarily parallelizes a computation to all of its worker threads. When working with services, however, this parallelism doesn't fully maximize throughput because workers sit idle as requests are processed on the server.
The `concurrency` parameter makes sure that each worker can stay busy as it waits for requests to complete. + + +``` +display(analyzer.setConcurrency(3).transform(image_df)) +``` + +#### Faster without extra hardware: + + +## Step 4: Batching + + +``` +from synapse.ml.services.language import AnalyzeText + +# Create a dataframe +text_df = spark.createDataFrame( + [ + ("I am so happy today, its sunny!",), + ("I am frustrated by this rush hour traffic",), + ("The AI services on spark is pretty lit",), + ], + ["text"], +) + +sentiment = ( + AnalyzeText() + .setKind("SentimentAnalysis") + .setTextCol("text") + .setLocation(service_loc) + .setSubscriptionKey(service_key) + .setOutputCol("sentiment") + .setErrorCol("error") + .setBatchSize(10) +) + +# Show the results of your text query +display(sentiment.transform(text_df).select("text", "sentiment.documents.sentiment")) +``` + +## Step 5: Multi-Key + + +``` +from pyspark.sql.functions import udf +import random + +service_key_2 = find_secret( + secret_name="ai-services-api-key-2", keyvault="mmlspark-build-keys" +) +keys = [service_key, service_key_2] + + +@udf +def random_key(): + return keys[random.randint(0, len(keys) - 1)] + + +image_df2 = image_df.withColumn("key", random_key()) + +results = analyzer.setSubscriptionKeyCol("key").transform(image_df2) +``` + + +``` +display(results.select("key", "analysis_results.description.captions.text")) +``` + +## Learn More +- [Explore other cognitive services](../Overview) +- [Read our paper "Large-Scale Intelligent Microservices"](https://arxiv.org/abs/2009.08044) diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Geospatial Services.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Geospatial Services.md new file mode 100644 index 0000000000..b7550c86d5 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Geospatial Services.md @@ -0,0 +1,276 @@ +--- +title: Geospatial Services
+hide_title: true +status: stable +--- + + +# Azure Maps Geospatial Services + +[Microsoft Azure Maps ](https://azure.microsoft.com/services/azure-maps/) provides developers from all industries with powerful geospatial capabilities. Those geospatial capabilities are packed with the freshest mapping data. Azure Maps is available for web, mobile (iOS and Android), Microsoft Power BI, Microsoft Power Apps and Microsoft Synapse. Azure Maps is an Open API compliant set of REST APIs. The following are only a high-level overview of the services which Azure Maps offers - Maps, Search, Routing, Traffic, Weather, Time Zones, Geolocation, Geofencing, Map Data, Creator, and Spatial Operations. + +## Usage + +### Geocode addresses +[**Address Geocoding**](https://docs.microsoft.com/rest/api/maps/search/post-search-address-batch) The Search Address Batch API sends batches of queries to Search Address API using just a single API call. This API geocodes text addresses or partial addresses and the geocoding search index will be queried for everything above the street level data. **Note** that the geocoder is very tolerant of typos and incomplete addresses. It will also handle everything from exact street addresses or street or intersections as well as higher level geographies such as city centers, counties, states etc. + +### Reverse Geocode Coordinates +[**Reverse Geocoding**](https://docs.microsoft.com/rest/api/maps/search/post-search-address-reverse-batch) The Search Address Reverse Batch API sends batches of queries to Search Address Reverse API using just a single API call. This API takes in location coordinates and translates them into human readable street addresses. Most often this is needed in tracking applications where you receive a GPS feed from the device or asset and wish to know what address where the coordinate is located. 
+ +### Get Point In Polygon +[**Get Point in Polygon**](https://docs.microsoft.com/rest/api/maps/spatial/get-point-in-polygon) This API returns a boolean value indicating whether a point is inside a set of polygons. The set of polygons can be pre-created by using the [**Data Upload API**](https://docs.microsoft.com/rest/api/maps/data/upload-preview) and referenced by a unique udid. + +## Prerequisites + +1. Sign in to the [Azure Portal](https://portal.azure.com) and create an Azure Maps account by following these [instructions](https://docs.microsoft.com/azure/azure-maps/how-to-manage-account-keys#create-a-new-account). +1. Once the Maps account is created, provision a Maps Creator Resource by following these [instructions](https://docs.microsoft.com/azure/azure-maps/how-to-manage-creator#create-creator-resource). Creator is a [geographically scoped service](https://docs.microsoft.com/azure/azure-maps/creator-geographic-scope). Pick an appropriate location while provisioning the creator resource. +1. Follow these [instructions](https://learn.microsoft.com/azure/synapse-analytics/machine-learning/overview-cognitive-services#create-an-apache-spark-cluster) to set up your Azure Databricks environment and install SynapseML. +1. After you create a new notebook in Azure Databricks, copy the **Shared code** below and paste it into a new cell in your notebook. +1. Choose a service sample, below, and copy and paste it into a second new cell in your notebook. +1. Replace the `AZUREMAPS_API_KEY` placeholders with your own [Maps account key](https://docs.microsoft.com/azure/azure-maps/how-to-manage-authentication#view-authentication-details). +1. Choose the run button (triangle icon) in the upper right corner of the cell, then select **Run Cell**. +1. View results in a table below the cell.
+ +## Shared code + +To get started, we'll need to add this code to the project: + + +```python +from pyspark.sql.types import StructType, StructField, DoubleType +from pyspark.sql.functions import col +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + +# Configure more resiliant requests to stop flakiness +retry_strategy = Retry( + total=3, + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods=["HEAD", "GET", "PUT", "DELETE", "OPTIONS", "TRACE"], +) +adapter = HTTPAdapter(max_retries=retry_strategy) +http = requests.Session() +http.mount("https://", adapter) +http.mount("http://", adapter) +``` + + +```python +from synapse.ml.services import * +from synapse.ml.services.geospatial import * +from synapse.ml.core.platform import * + +# An Azure Maps account key +maps_key = find_secret(secret_name="azuremaps-api-key", keyvault="mmlspark-build-keys") +``` + +## Geocoding sample + +The azure maps geocoder sends batches of queries to the [Search Address API](https://docs.microsoft.com/rest/api/maps/search/getsearchaddress). The API limits the batch size to 10000 queries per request. 
+ + +```python +from synapse.ml.stages import FixedMiniBatchTransformer, FlattenBatch + +df = spark.createDataFrame( + [ + ("One, Microsoft Way, Redmond",), + ("400 Broad St, Seattle",), + ("350 5th Ave, New York",), + ("Pike Pl, Seattle",), + ("Champ de Mars, 5 Avenue Anatole France, 75007 Paris",), + ], + [ + "address", + ], +) + + +def extract_location_fields(df): + # Use this function to select only lat/lon columns into the dataframe + return df.select( + col("*"), + col("output.response.results") + .getItem(0) + .getField("position") + .getField("lat") + .alias("Latitude"), + col("output.response.results") + .getItem(0) + .getField("position") + .getField("lon") + .alias("Longitude"), + ).drop("output") + + +# Run the Azure Maps geocoder to enhance the data with location data +geocoder = ( + AddressGeocoder() + .setSubscriptionKey(maps_key) + .setAddressCol("address") + .setOutputCol("output") +) + +# Show the results of your text query in a table format +display( + extract_location_fields( + geocoder.transform(FixedMiniBatchTransformer().setBatchSize(10).transform(df)) + ) +) +``` + +## Reverse Geocoding sample + +The azure maps reverse geocoder sends batches of queries to the [Search Address Reverse API](https://docs.microsoft.com/rest/api/maps/search/get-search-address-reverse) using just a single API call. 
The API allows caller to batch up to 10,000 queries per request + + +```python +# Create a dataframe that's tied to it's column names +df = spark.createDataFrame( + ( + ( + (48.858561, 2.294911), + (47.639765, -122.127896), + (47.621028, -122.348170), + (47.734012, -122.102737), + ) + ), + StructType([StructField("lat", DoubleType()), StructField("lon", DoubleType())]), +) + +# Run the Azure Maps geocoder to enhance the data with location data +rev_geocoder = ( + ReverseAddressGeocoder() + .setSubscriptionKey(maps_key) + .setLatitudeCol("lat") + .setLongitudeCol("lon") + .setOutputCol("output") +) + +# Show the results of your text query in a table format + +display( + rev_geocoder.transform(FixedMiniBatchTransformer().setBatchSize(10).transform(df)) + .select( + col("*"), + col("output.response.addresses") + .getItem(0) + .getField("address") + .getField("freeformAddress") + .alias("In Polygon"), + col("output.response.addresses") + .getItem(0) + .getField("address") + .getField("country") + .alias("Intersecting Polygons"), + ) + .drop("output") +) +``` + +## Check Point In Polygon sample + +This API returns a boolean value indicating whether a point is inside a set of polygons. The polygon can be added to your creator account using the [**Data Upload API**](https://docs.microsoft.com/rest/api/maps/data/upload-preview). The API then returns a unique udid to reference the polygon. + +### Set up geojson Polygons in your azure maps creator account + +Based on where the creator resource was provisioned, we need to prefix the appropriate geography code to the azure maps URL. In this example, the assumption is that the creator resource was provisioned in `East US 2` Location and hence we pick `us` as our geo prefix. + + +```python +import time +import json + +# Choose a geography, you want your data to reside in. 
+# Allowed values +# us => North American datacenters +# eu -> European datacenters +url_geo_prefix = "us" + +# Upload a geojson with polygons in them +r = http.post( + f"https://{url_geo_prefix}.atlas.microsoft.com/mapData/upload?api-version=1.0&dataFormat=geojson&subscription-key={maps_key}", + json={ + "type": "FeatureCollection", + "features": [ + { + "type": "Feature", + "properties": {"geometryId": "test_geometry"}, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [-122.14290618896484, 47.67856488312544], + [-122.03956604003906, 47.67856488312544], + [-122.03956604003906, 47.7483271435476], + [-122.14290618896484, 47.7483271435476], + [-122.14290618896484, 47.67856488312544], + ] + ], + }, + } + ], + }, +) + +long_running_operation = r.headers.get("location") +time.sleep(30) # Sometimes this may take upto 30 seconds +print(f"Status Code: {r.status_code}, Long Running Operation: {long_running_operation}") +# This Operation completes in approximately 5 ~ 15 seconds +user_data_id_resource_url = json.loads( + http.get(f"{long_running_operation}&subscription-key={maps_key}").content +)["resourceLocation"] +user_data_id = json.loads( + http.get(f"{user_data_id_resource_url}&subscription-key={maps_key}").content +)["udid"] +``` + +### Use the function to check if point is in polygon + + +```python +# Create a dataframe that's tied to it's column names +df = spark.createDataFrame( + ( + ( + (48.858561, 2.294911), + (47.639765, -122.127896), + (47.621028, -122.348170), + (47.734012, -122.102737), + ) + ), + StructType([StructField("lat", DoubleType()), StructField("lon", DoubleType())]), +) + +# Run the Azure Maps geocoder to enhance the data with location data +check_point_in_polygon = ( + CheckPointInPolygon() + .setSubscriptionKey(maps_key) + .setGeography(url_geo_prefix) + .setUserDataIdentifier(user_data_id) + .setLatitudeCol("lat") + .setLongitudeCol("lon") + .setOutputCol("output") +) + +# Show the results of your text query in a table format 
+display( + check_point_in_polygon.transform(df) + .select( + col("*"), + col("output.result.pointInPolygons").alias("In Polygon"), + col("output.result.intersectingGeometries").alias("Intersecting Polygons"), + ) + .drop("output") +) +``` + +### Cleanup + + +```python +res = http.delete( + f"https://{url_geo_prefix}.atlas.microsoft.com/mapData/{user_data_id}?api-version=1.0&subscription-key={maps_key}" +) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Multivariate Anomaly Detection.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Multivariate Anomaly Detection.md new file mode 100644 index 0000000000..f47f840aed --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Multivariate Anomaly Detection.md @@ -0,0 +1,321 @@ +--- +title: Multivariate Anomaly Detection +hide_title: true +status: stable +--- +# Recipe: Azure AI Services - Multivariate Anomaly Detection +This recipe shows how you can use SynapseML and Azure AI services on Apache Spark for multivariate anomaly detection. Multivariate anomaly detection allows for the detection of anomalies among many variables or time series, taking into account all the inter-correlations and dependencies between the different variables. In this scenario, we use SynapseML to train a model for multivariate anomaly detection using the Azure AI services, and we then use the model to infer multivariate anomalies within a dataset containing synthetic measurements from three IoT sensors. + +To learn more about the Azure AI Anomaly Detector, refer to [this documentation page](https://docs.microsoft.com/azure/ai-services/anomaly-detector/). + +## Important +Starting on the 20th of September, 2023 you won’t be able to create new Anomaly Detector resources. The Anomaly Detector service is being retired on the 1st of October, 2026.
+ + +## Setup +### Create an Anomaly Detector resource +Follow the instructions to create an `Anomaly Detector` resource using the Azure portal; alternatively, you can use the Azure CLI to create this resource. + +- In the Azure portal, select **Create** in your resource group, and then type **Anomaly Detector**. Select the Anomaly Detector resource. +- Give the resource a name, and ideally use the same region as the rest of your resource group. Use the default options for the rest, and then select **Review + Create** and then **Create**. +- Once the Anomaly Detector resource is created, open it and select the `Keys and Endpoints` panel in the left nav. Copy the key for the Anomaly Detector resource into the `ANOMALY_API_KEY` environment variable, or store it in the `anomalyKey` variable. + +### Create a Storage Account resource +In order to save intermediate data, you need to create an Azure Blob Storage Account. Within that storage account, create a container for storing the intermediate data. Make note of the container name, and copy the connection string to that container. You need it later to populate the `containerName` variable and the `BLOB_CONNECTION_STRING` environment variable. + +### Enter your service keys +Let's start by setting up the environment variables for our service keys. The next cell sets the `ANOMALY_API_KEY` and the `BLOB_CONNECTION_STRING` environment variables based on the values stored in our Azure Key Vault. If you're running this tutorial in your own environment, make sure you set these environment variables before you proceed. + +Now, let's read the `ANOMALY_API_KEY` and `BLOB_CONNECTION_STRING` environment variables and set the `containerName` and `location` variables.
+ + +```python +from synapse.ml.core.platform import find_secret + +# An Anomaly Dectector subscription key +anomalyKey = find_secret( + secret_name="anomaly-api-key", keyvault="mmlspark-build-keys" +) # use your own anomaly api key +# Your storage account name +storageName = "anomalydetectiontest" # use your own storage account name +# A connection string to your blob storage account +storageKey = find_secret( + secret_name="madtest-storage-key", keyvault="mmlspark-build-keys" +) # use your own storage key +# A place to save intermediate MVAD results +intermediateSaveDir = ( + "wasbs://madtest@anomalydetectiontest.blob.core.windows.net/intermediateData" +) +# The location of the anomaly detector resource that you created +location = "westus2" +``` + +First we connect to our storage account so that anomaly detector can save intermediate results there: + + +```python +spark.sparkContext._jsc.hadoopConfiguration().set( + f"fs.azure.account.key.{storageName}.blob.core.windows.net", storageKey +) +``` + +Let's import all the necessary modules. + + +```python +import numpy as np +import pandas as pd + +import pyspark +from pyspark.sql.functions import col +from pyspark.sql.functions import lit +from pyspark.sql.types import DoubleType +import matplotlib.pyplot as plt + +import synapse.ml +from synapse.ml.services.anomaly import * +``` + +Now, let's read our sample data into a Spark DataFrame. + + +```python +df = ( + spark.read.format("csv") + .option("header", "true") + .load("wasbs://publicwasb@mmlspark.blob.core.windows.net/MVAD/sample.csv") +) + +df = ( + df.withColumn("sensor_1", col("sensor_1").cast(DoubleType())) + .withColumn("sensor_2", col("sensor_2").cast(DoubleType())) + .withColumn("sensor_3", col("sensor_3").cast(DoubleType())) +) + +# Let's inspect the dataframe: +df.show(5) +``` + +We can now create an `estimator` object, which is used to train our model. We specify the start and end times for the training data. 
We also specify the input columns to use, and the name of the column that contains the timestamps. Finally, we specify the number of data points to use in the anomaly detection sliding window, and we set the connection string to the Azure Blob Storage Account. + + +```python +trainingStartTime = "2020-06-01T12:00:00Z" +trainingEndTime = "2020-07-02T17:55:00Z" +timestampColumn = "timestamp" +inputColumns = ["sensor_1", "sensor_2", "sensor_3"] + +estimator = ( + SimpleFitMultivariateAnomaly() + .setSubscriptionKey(anomalyKey) + .setLocation(location) + .setStartTime(trainingStartTime) + .setEndTime(trainingEndTime) + .setIntermediateSaveDir(intermediateSaveDir) + .setTimestampCol(timestampColumn) + .setInputCols(inputColumns) + .setSlidingWindow(200) +) +``` + +Now that we have created the `estimator`, let's fit it to the data: + + +```python +model = estimator.fit(df) +``` + +Once the training is done, we can now use the model for inference. The code in the next cell specifies the start and end times for the data we would like to detect the anomalies in. + + +```python +inferenceStartTime = "2020-07-02T18:00:00Z" +inferenceEndTime = "2020-07-06T05:15:00Z" + +result = ( + model.setStartTime(inferenceStartTime) + .setEndTime(inferenceEndTime) + .setOutputCol("results") + .setErrorCol("errors") + .setInputCols(inputColumns) + .setTimestampCol(timestampColumn) + .transform(df) +) + +result.show(5) +``` + +When we called `.show(5)` in the previous cell, it showed us the first five rows in the dataframe. The results were all `null` because they weren't inside the inference window. + +To show the results only for the inferred data, let's select the columns we need. We can then sort the rows in the dataframe in ascending order, and filter the result to only show the rows that are in the range of the inference window. In our case `inferenceEndTime` is the same as the last row in the dataframe, so we can ignore that.
+ +Finally, to be able to better plot the results, let's convert the Spark dataframe to a Pandas dataframe. + + + +```python +rdf = ( + result.select( + "timestamp", + *inputColumns, + "results.interpretation", + "isAnomaly", + "results.severity" + ) + .orderBy("timestamp", ascending=True) + .filter(col("timestamp") >= lit(inferenceStartTime)) + .toPandas() +) + +rdf +``` + +Format the `contributors` column that stores the contribution score from each sensor to the detected anomalies. The next cell formats this data, and splits the contribution score of each sensor into its own column. + +For Spark 3.3 and earlier versions, the output of select statements will be in the format of `List`, so to format the data into a dictionary and generate the values when interpretation is empty, please use the parse method below: + +``` +def parse(x): + if len(x) > 0: + return dict([item[:2] for item in x]) + else: + return {"sensor_1": 0, "sensor_2": 0, "sensor_3": 0} +``` + +Starting with Spark 3.4, the output of the select statement is already formatted as a `numpy.ndarray` and there is no need to format the data again, so please use the parse method below to generate the values when interpretation is empty: + + + +```python +def parse(x): + if len(x) == 0: + return {"sensor_1": 0, "sensor_2": 0, "sensor_3": 0} + + +rdf["contributors"] = rdf["interpretation"].apply(parse) +rdf = pd.concat( + [ + rdf.drop(["contributors"], axis=1), + pd.json_normalize(rdf["contributors"]).rename( + columns={ + "sensor_1": "series_1", + "sensor_2": "series_2", + "sensor_3": "series_3", + } + ), + ], + axis=1, +) +rdf +``` + +Great! We now have the contribution scores of sensors 1, 2, and 3 in the `series_1`, `series_2`, and `series_3` columns respectively. + +Run the next cell to plot the results. The `minSeverity` parameter specifies the minimum severity of the anomalies to be plotted.
+ + +```python +minSeverity = 0.1 + + +####### Main Figure ####### +plt.figure(figsize=(23, 8)) +plt.plot( + rdf["timestamp"], + rdf["sensor_1"], + color="tab:orange", + line, + linewidth=2, + label="sensor_1", +) +plt.plot( + rdf["timestamp"], + rdf["sensor_2"], + color="tab:green", + line, + linewidth=2, + label="sensor_2", +) +plt.plot( + rdf["timestamp"], + rdf["sensor_3"], + color="tab:blue", + line, + linewidth=2, + label="sensor_3", +) +plt.grid(axis="y") +plt.tick_params(axis="x", which="both", bottom=False, labelbottom=False) +plt.legend() + +anoms = list(rdf["severity"] >= minSeverity) +_, _, ymin, ymax = plt.axis() +plt.vlines(np.where(anoms), ymin=ymin, ymax=ymax, color="r", alpha=0.8) + +plt.legend() +plt.title( + "A plot of the values from the three sensors with the detected anomalies highlighted in red." +) +plt.show() + +####### Severity Figure ####### +plt.figure(figsize=(23, 1)) +plt.tick_params(axis="x", which="both", bottom=False, labelbottom=False) +plt.plot( + rdf["timestamp"], + rdf["severity"], + color="black", + line, + linewidth=2, + label="Severity score", +) +plt.plot( + rdf["timestamp"], + [minSeverity] * len(rdf["severity"]), + color="red", + line, + linewidth=1, + label="minSeverity", +) +plt.grid(axis="y") +plt.legend() +plt.ylim([0, 1]) +plt.title("Severity of the detected anomalies") +plt.show() + +####### Contributors Figure ####### +plt.figure(figsize=(23, 1)) +plt.tick_params(axis="x", which="both", bottom=False, labelbottom=False) +plt.bar( + rdf["timestamp"], rdf["series_1"], width=2, color="tab:orange", label="sensor_1" +) +plt.bar( + rdf["timestamp"], + rdf["series_2"], + width=2, + color="tab:green", + label="sensor_2", + bottom=rdf["series_1"], +) +plt.bar( + rdf["timestamp"], + rdf["series_3"], + width=2, + color="tab:blue", + label="sensor_3", + bottom=rdf["series_1"] + rdf["series_2"], +) +plt.grid(axis="y") +plt.legend() +plt.ylim([0, 1]) +plt.title("The contribution of each sensor to the detected anomaly") +plt.show() 
+``` + + + +The plots show the raw data from the sensors (inside the inference window) in orange, green, and blue. The red vertical lines in the first figure show the detected anomalies that have a severity greater than or equal to `minSeverity`. + +The second plot shows the severity score of all the detected anomalies, with the `minSeverity` threshold shown as the dotted red line. + +Finally, the last plot shows the contribution of the data from each sensor to the detected anomalies. It helps us diagnose and understand the most likely cause of each anomaly. diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Overview.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Overview.md new file mode 100644 index 0000000000..c973358ea9 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Overview.md @@ -0,0 +1,563 @@ +--- +title: Overview +hide_title: true +status: stable +--- +# Azure AI services + + + +Azure AI services help developers and organizations rapidly create intelligent, cutting-edge, market-ready, and responsible applications with out-of-the-box, prebuilt, and customizable APIs and models. + +SynapseML allows you to build powerful and highly scalable predictive and analytical models from various Spark data sources. Synapse Spark provides built-in SynapseML libraries including `synapse.ml.services`. + +## Important +Starting on the 20th of September, 2023 you won’t be able to create new Anomaly Detector resources. The Anomaly Detector service is being retired on the 1st of October, 2026. + +## Prerequisites on Azure Databricks + +1. Follow the steps in [Getting started](https://docs.microsoft.com/azure/services-services/big-data/getting-started) to set up your Azure Databricks and Azure AI services environment. This tutorial shows you how to install SynapseML and how to create your Spark cluster in Databricks. +1.
After you create a new notebook in Azure Databricks, copy the **Shared code** below and paste into a new cell in your notebook. +1. Choose a service sample, below, and copy paste it into a second new cell in your notebook. +1. Replace any of the service subscription key placeholders with your own key. +1. Choose the run button (triangle icon) in the upper right corner of the cell, then select **Run Cell**. +1. View results in a table below the cell. + +## Prerequisites on Azure Synapse Analytics + +The tutorial, [Pre-requisites for using Azure AI services in Azure Synapse](https://learn.microsoft.com/azure/synapse-analytics/machine-learning/tutorial-configure-cognitive-services-synapse), walks you through a couple steps you need to perform before using Azure AI services in Synapse Analytics. + + +[Azure AI services](https://azure.microsoft.com/products/ai-services/) is a suite of APIs, SDKs, and services that developers can use to add intelligent features to their applications. AI services empower developers even when they don't have direct AI or data science skills or knowledge. Azure AI services help developers create applications that can see, hear, speak, understand, and even begin to reason. The catalog of services within Azure AI services can be categorized into five main pillars: Vision, Speech, Language, Web search, and Decision. 
+ +## Usage + +### Vision +[**Computer Vision**](https://azure.microsoft.com/services/cognitive-services/computer-vision/) +- Describe: provides description of an image in human readable language ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/vision/DescribeImage.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.vision.html#module-synapse.ml.services.vision.DescribeImage)) +- Analyze (color, image type, face, adult/racy content): analyzes visual features of an image ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/vision/AnalyzeImage.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.vision.html#module-synapse.ml.services.vision.AnalyzeImage)) +- OCR: reads text from an image ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/vision/OCR.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.vision.html#module-synapse.ml.services.vision.OCR)) +- Recognize Text: reads text from an image ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/vision/RecognizeText.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.vision.html#module-synapse.ml.services.vision.RecognizeText)) +- Thumbnail: generates a thumbnail of user-specified size from the image ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/vision/GenerateThumbnails.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.vision.html#module-synapse.ml.services.vision.GenerateThumbnails)) +- Recognize domain-specific content: recognizes domain-specific content (celebrity, landmark) 
([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/vision/RecognizeDomainSpecificContent.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.vision.html#module-synapse.ml.services.vision.RecognizeDomainSpecificContent)) +- Tag: identifies list of words that are relevant to the input image ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/vision/TagImage.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.vision.html#module-synapse.ml.services.vision.TagImage)) + +[**Face**](https://azure.microsoft.com/services/cognitive-services/face/) +- Detect: detects human faces in an image ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/face/DetectFace.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.face.html#module-synapse.ml.services.face.DetectFace)) +- Verify: verifies whether two faces belong to a same person, or a face belongs to a person ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/face/VerifyFaces.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.face.html#module-synapse.ml.services.face.VerifyFaces)) +- Identify: finds the closest matches of the specific query person face from a person group ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/face/IdentifyFaces.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.face.html#module-synapse.ml.services.face.IdentifyFaces)) +- Find similar: finds similar faces to the query face in a face list ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/face/FindSimilarFace.html), 
[Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.face.html#module-synapse.ml.services.face.FindSimilarFace)) +- Group: divides a group of faces into disjoint groups based on similarity ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/face/GroupFaces.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.face.html#module-synapse.ml.services.face.GroupFaces)) + +### Speech +[**Speech Services**](https://azure.microsoft.com/products/ai-services/ai-speech) +- Speech-to-text: transcribes audio streams ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/speech/SpeechToText.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.speech.html#module-synapse.ml.services.speech.SpeechToText)) +- Conversation Transcription: transcribes audio streams into live transcripts with identified speakers. 
([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/speech/ConversationTranscription.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.speech.html#module-synapse.ml.services.speech.ConversationTranscription)) +- Text to Speech: Converts text to realistic audio ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/speech/TextToSpeech.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.speech.html#module-synapse.ml.services.speech.TextToSpeech)) + + +### Language +[**AI Language**](https://azure.microsoft.com/products/ai-services/ai-language) +- Language detection: detects language of the input text ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/text/LanguageDetector.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.text.html#module-synapse.ml.services.text.LanguageDetector)) +- Key phrase extraction: identifies the key talking points in the input text ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/text/KeyPhraseExtractor.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.text.html#module-synapse.ml.services.text.KeyPhraseExtractor)) +- Named entity recognition: identifies known entities and general named entities in the input text ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/text/NER.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.text.html#module-synapse.ml.services.text.NER)) +- Sentiment analysis: returns a score between 0 and 1 indicating the sentiment in the input text 
([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/text/TextSentiment.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.text.html#module-synapse.ml.services.text.TextSentiment)) +- Healthcare Entity Extraction: Extracts medical entities and relationships from text. ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/text/AnalyzeHealthText.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.text.html#module-synapse.ml.services.text.AnalyzeHealthText)) + + +### Translation +[**Translator**](https://azure.microsoft.com/products/ai-services/translator) +- Translate: Translates text. ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/translate/Translate.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.translate.html#module-synapse.ml.services.translate.Translate)) +- Transliterate: Converts text in one language from one script to another script. ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/translate/Transliterate.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.translate.html#module-synapse.ml.services.translate.Transliterate)) +- Detect: Identifies the language of a piece of text. ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/translate/Detect.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.translate.html#module-synapse.ml.services.translate.Detect)) +- BreakSentence: Identifies the positioning of sentence boundaries in a piece of text. 
([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/translate/BreakSentence.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.translate.html#module-synapse.ml.services.translate.BreakSentence)) +- Dictionary Lookup: Provides alternative translations for a word and a small number of idiomatic phrases. ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/translate/DictionaryLookup.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.translate.html#module-synapse.ml.services.translate.DictionaryLookup)) +- Dictionary Examples: Provides examples that show how terms in the dictionary are used in context. ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/translate/DictionaryExamples.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.translate.html#module-synapse.ml.services.translate.DictionaryExamples)) +- Document Translation: Translates documents across all supported languages and dialects while preserving document structure and data format. ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/translate/DocumentTranslator.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.translate.html#module-synapse.ml.services.translate.DocumentTranslator)) + +### Document Intelligence +[**Document Intelligence**](https://azure.microsoft.com/products/ai-services/ai-document-intelligence/) +- Analyze Layout: Extract text and layout information from a given document. 
([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/form/AnalyzeLayout.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.form.html#module-synapse.ml.services.form.AnalyzeLayout)) +- Analyze Receipts: Detects and extracts data from receipts using optical character recognition (OCR) and our receipt model, enabling you to easily extract structured data from receipts such as merchant name, merchant phone number, transaction date, transaction total, and more. ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/form/AnalyzeReceipts.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.form.html#module-synapse.ml.services.form.AnalyzeReceipts)) +- Analyze Business Cards: Detects and extracts data from business cards using optical character recognition (OCR) and our business card model, enabling you to easily extract structured data from business cards such as contact names, company names, phone numbers, emails, and more. ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/form/AnalyzeBusinessCards.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.form.html#module-synapse.ml.services.form.AnalyzeBusinessCards)) +- Analyze Invoices: Detects and extracts data from invoices using optical character recognition (OCR) and our invoice understanding deep learning models, enabling you to easily extract structured data from invoices such as customer, vendor, invoice ID, invoice due date, total, invoice amount due, tax amount, ship to, bill to, line items and more. 
([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/form/AnalyzeInvoices.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.form.html#module-synapse.ml.services.form.AnalyzeInvoices)) +- Analyze ID Documents: Detects and extracts data from identification documents using optical character recognition (OCR) and our ID document model, enabling you to easily extract structured data from ID documents such as first name, last name, date of birth, document number, and more. ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/form/AnalyzeIDDocuments.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.form.html#module-synapse.ml.services.form.AnalyzeIDDocuments)) +- Analyze Custom Form: Extracts information from forms (PDFs and images) into structured data based on a model created from a set of representative training forms. ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/form/AnalyzeCustomModel.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.form.html#module-synapse.ml.services.form.AnalyzeCustomModel)) +- Get Custom Model: Get detailed information about a custom model. ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/form/GetCustomModel.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.form.html#module-synapse.ml.services.form.GetCustomModel)) +- List Custom Models: Get information about all custom models. 
([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/form/ListCustomModels.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.form.html#module-synapse.ml.services.form.ListCustomModels)) + +### Decision +[**Anomaly Detector**](https://azure.microsoft.com/products/ai-services/ai-anomaly-detector) +- Anomaly status of latest point: generates a model using preceding points and determines whether the latest point is anomalous ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/anomaly/DetectLastAnomaly.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.anomaly.html#module-synapse.ml.services.anomaly.DetectLastAnomaly)) +- Find anomalies: generates a model using an entire series and finds anomalies in the series ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/anomaly/DetectAnomalies.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.anomaly.html#module-synapse.ml.services.anomaly.DetectAnomalies)) + +### Search +- [**Bing Image search**](https://www.microsoft.com/bing/apis/bing-image-search-api) ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/bing/BingImageSearch.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.bing.html#module-synapse.ml.services.bing.BingImageSearch)) +- [**Azure Cognitive search**](https://docs.microsoft.com/azure/search/search-what-is-azure-search) ([Scala](https://mmlspark.blob.core.windows.net/docs/1.0.7/scala/com/microsoft/azure/synapse/ml/services/search/AzureSearchWriter$.html), [Python](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.search.html#module-synapse.ml.services.search.AzureSearchWriter)) + +## 
Prepare your system + +To begin, import required libraries and initialize your Spark session. + + +```python +from pyspark.sql.functions import udf, col +from synapse.ml.io.http import HTTPTransformer, http_udf +from requests import Request +from pyspark.sql.functions import lit +from pyspark.ml import PipelineModel +from pyspark.sql.functions import col +``` + +Import Azure AI services libraries and replace the keys and locations in the following code snippet with your Azure AI services key and location. + + +```python +from synapse.ml.services import * +from synapse.ml.core.platform import * + +# A general AI services key for AI Language, Computer Vision and Document Intelligence (or use separate keys that belong to each service) +service_key = find_secret( + secret_name="ai-services-api-key", keyvault="mmlspark-build-keys" +) # Replace the call to find_secret with your key as a python string. e.g. service_key="27snaiw..." +service_loc = "eastus" + +# A Bing Search v7 subscription key +bing_search_key = find_secret( + secret_name="bing-search-key", keyvault="mmlspark-build-keys" +) # Replace the call to find_secret with your key as a python string. + +# An Anomaly Detector subscription key +anomaly_key = find_secret( + secret_name="anomaly-api-key", keyvault="mmlspark-build-keys" +) # Replace the call to find_secret with your key as a python string. If you don't have an anomaly detection resource created before Sep 20th 2023, you won't be able to create one. +anomaly_loc = "westus2" + +# A Translator subscription key +translator_key = find_secret( + secret_name="translator-key", keyvault="mmlspark-build-keys" +) # Replace the call to find_secret with your key as a python string. +translator_loc = "eastus" + +# An Azure search key +search_key = find_secret( + secret_name="azure-search-key", keyvault="mmlspark-build-keys" +) # Replace the call to find_secret with your key as a python string. 
+``` + +## Perform sentiment analysis on text + +The [AI Language](https://azure.microsoft.com/products/ai-services/ai-language/) service provides several algorithms for extracting intelligent insights from text. For example, we can find the sentiment of given input text. The service will return a score between 0.0 and 1.0 where low scores indicate negative sentiment and high scores indicate positive sentiment. This sample uses three simple sentences and returns the sentiment for each. + + +```python +# Create a dataframe that's tied to its column names +df = spark.createDataFrame( + [ + ("I am so happy today, its sunny!", "en-US"), + ("I am frustrated by this rush hour traffic", "en-US"), + ("The AI services on spark aint bad", "en-US"), + ], + ["text", "language"], +) + +# Run the Text Analytics service with options +sentiment = ( + AnalyzeText() + .setKind("SentimentAnalysis") + .setTextCol("text") + .setLocation(service_loc) + .setSubscriptionKey(service_key) + .setOutputCol("sentiment") + .setErrorCol("error") + .setLanguageCol("language") +) + +# Show the results of your text query in a table format +display( + sentiment.transform(df).select( + "text", col("sentiment.documents.sentiment").alias("sentiment") + ) +) +``` + +## Perform text analytics for health data + +The [Text Analytics for Health Service](https://docs.microsoft.com/azure/ai-services/language-service/text-analytics-for-health/overview?tabs=ner) extracts and labels relevant medical information from unstructured text such as doctor's notes, discharge summaries, clinical documents, and electronic health records. + +The following code sample analyzes and transforms text from doctor's notes into structured data. 
+ + +```python +df = spark.createDataFrame( + [ + ("20mg of ibuprofen twice a day",), + ("1tsp of Tylenol every 4 hours",), + ("6-drops of Vitamin B-12 every evening",), + ], + ["text"], +) + +healthcare = ( + AnalyzeHealthText() + .setSubscriptionKey(service_key) + .setLocation(service_loc) + .setLanguage("en") + .setOutputCol("response") +) + +display(healthcare.transform(df)) +``` + +## Translate text into a different language +[Translator](https://azure.microsoft.com/services/ai-services/translator/) is a cloud-based machine translation service and is part of the Azure AI services family of AI APIs used to build intelligent apps. Translator is easy to integrate in your applications, websites, tools, and solutions. It allows you to add multi-language user experiences in 90 languages and dialects and can be used to translate text without hosting your own algorithm. + +The following code sample does a simple text translation by providing the sentences you want to translate and target languages you want to translate them to. + + +```python +from pyspark.sql.functions import col, flatten + +# Create a dataframe including sentences you want to translate +df = spark.createDataFrame( + [(["Hello, what is your name?", "Bye"],)], + [ + "text", + ], +) + +# Run the Translator service with options +translate = ( + Translate() + .setSubscriptionKey(translator_key) + .setLocation(translator_loc) + .setTextCol("text") + .setToLanguage(["zh-Hans"]) + .setOutputCol("translation") +) + +# Show the results of the translation. 
+display( + translate.transform(df) + .withColumn("translation", flatten(col("translation.translations"))) + .withColumn("translation", col("translation.text")) + .select("translation") +) +``` + +## Extract information from a document into structured data +[Azure AI Document Intelligence](https://azure.microsoft.com/products/ai-services/ai-document-intelligence/) is a part of Azure Applied AI Services that lets you build automated data processing software using machine learning technology. With Azure AI Document Intelligence, you can identify and extract text, key/value pairs, selection marks, tables, and structure from your documents. The service outputs structured data that includes the relationships in the original file, bounding boxes, confidence and more. + +The following code sample analyzes a business card image and extracts its information into structured data. + + +```python +from pyspark.sql.functions import col, explode + +# Create a dataframe containing the source files +imageDf = spark.createDataFrame( + [ + ( + "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/business_card.jpg", + ) + ], + [ + "source", + ], +) + +# Run the Form Recognizer service +analyzeBusinessCards = ( + AnalyzeBusinessCards() + .setSubscriptionKey(service_key) + .setLocation(service_loc) + .setImageUrlCol("source") + .setOutputCol("businessCards") +) + +# Show the results of recognition. +display( + analyzeBusinessCards.transform(imageDf) + .withColumn( + "documents", explode(col("businessCards.analyzeResult.documentResults.fields")) + ) + .select("source", "documents") +) +``` + +## Computer Vision sample + +[Azure AI Vision](https://azure.microsoft.com/products/ai-services/ai-vision/) analyzes images to identify structure such as faces, objects, and natural-language descriptions. + +The following code sample analyzes images and labels them with *tags*. 
Tags are one-word descriptions of things in the image, such as recognizable objects, people, scenery, and actions. + + +```python +# Create a dataframe with the image URLs +base_url = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/" +df = spark.createDataFrame( + [ + (base_url + "objects.jpg",), + (base_url + "dog.jpg",), + (base_url + "house.jpg",), + ], + [ + "image", + ], +) + +# Run the Computer Vision service. Analyze Image extracts information from/about the images. +analysis = ( + AnalyzeImage() + .setLocation(service_loc) + .setSubscriptionKey(service_key) + .setVisualFeatures( + ["Categories", "Color", "Description", "Faces", "Objects", "Tags"] + ) + .setOutputCol("analysis_results") + .setImageUrlCol("image") + .setErrorCol("error") +) + +# Show the results of what you wanted to pull out of the images. +display(analysis.transform(df).select("image", "analysis_results.description.tags")) +``` + +## Search for images that are related to a natural language query + +[Bing Image Search](https://www.microsoft.com/bing/apis/bing-image-search-api) searches the web to retrieve images related to a user's natural language query. + +The following code sample uses a text query that looks for images with quotes. The output of the code is a list of image URLs that contain photos related to the query. + + +```python +# Number of images Bing will return per query +imgsPerBatch = 10 +# A list of offsets, used to page into the search results +offsets = [(i * imgsPerBatch,) for i in range(100)] +# Since web content is our data, we create a dataframe with options on that data: offsets +bingParameters = spark.createDataFrame(offsets, ["offset"]) + +# Run the Bing Image Search service with our text query +bingSearch = ( + BingImageSearch() + .setSubscriptionKey(bing_search_key) + .setOffsetCol("offset") + .setQuery("Martin Luther King Jr. 
quotes") + .setCount(imgsPerBatch) + .setOutputCol("images") +) + +# Transformer that extracts and flattens the richly structured output of Bing Image Search into a simple URL column +getUrls = BingImageSearch.getUrlTransformer("images", "url") + +# This displays the full results returned, uncomment to use +# display(bingSearch.transform(bingParameters)) + +# Since we have two services, they are put into a pipeline +pipeline = PipelineModel(stages=[bingSearch, getUrls]) + +# Show the results of your search: image URLs +display(pipeline.transform(bingParameters)) +``` + +## Transform speech to text +The [Speech-to-text](https://azure.microsoft.com/products/ai-services/ai-speech/) service converts streams or files of spoken audio to text. The following code sample transcribes one audio file to text. + + +```python +# Create a dataframe with our audio URLs, tied to the column called "url" +df = spark.createDataFrame( + [("https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav",)], ["url"] +) + +# Run the Speech-to-text service to translate the audio into text +speech_to_text = ( + SpeechToTextSDK() + .setSubscriptionKey(service_key) + .setLocation(service_loc) + .setOutputCol("text") + .setAudioDataCol("url") + .setLanguage("en-US") + .setProfanity("Masked") +) + +# Show the results of the translation +display(speech_to_text.transform(df).select("url", "text.DisplayText")) +``` + +## Transform text to speech +[Text to speech](https://azure.microsoft.com/products/ai-services/text-to-speech/) is a service that allows you to build apps and services that speak naturally, choosing from more than 270 neural voices across 119 languages and variants. + +The following code sample transforms text into an audio file that contains the content of the text. 
+ + +```python +from synapse.ml.services.speech import TextToSpeech + +fs = "" +if running_on_databricks(): + fs = "dbfs:" +elif running_on_synapse_internal(): + fs = "Files" + +# Create a dataframe with text and an output file location +df = spark.createDataFrame( + [ + ( + "Reading out loud is fun! Check out aka.ms/spark for more information", + fs + "/output.mp3", + ) + ], + ["text", "output_file"], +) + +tts = ( + TextToSpeech() + .setSubscriptionKey(service_key) + .setTextCol("text") + .setLocation(service_loc) + .setVoiceName("en-US-JennyNeural") + .setOutputFileCol("output_file") +) + +# Check to make sure there were no errors during audio creation +display(tts.transform(df)) +``` + +## Detect anomalies in time series data + +If you don't have an anomaly detection resource created before Sep 20th 2023, you won't be able to create one. You may want to skip this part. + +[Anomaly Detector](https://azure.microsoft.com/services/cognitive-services/anomaly-detector/) is great for detecting irregularities in your time series data. The following code sample uses the Anomaly Detector service to find anomalies in a time series. 
+ + +```python +# Create a dataframe with the point data that Anomaly Detector requires +df = spark.createDataFrame( + [ + ("1972-01-01T00:00:00Z", 826.0), + ("1972-02-01T00:00:00Z", 799.0), + ("1972-03-01T00:00:00Z", 890.0), + ("1972-04-01T00:00:00Z", 900.0), + ("1972-05-01T00:00:00Z", 766.0), + ("1972-06-01T00:00:00Z", 805.0), + ("1972-07-01T00:00:00Z", 821.0), + ("1972-08-01T00:00:00Z", 20000.0), + ("1972-09-01T00:00:00Z", 883.0), + ("1972-10-01T00:00:00Z", 898.0), + ("1972-11-01T00:00:00Z", 957.0), + ("1972-12-01T00:00:00Z", 924.0), + ("1973-01-01T00:00:00Z", 881.0), + ("1973-02-01T00:00:00Z", 837.0), + ("1973-03-01T00:00:00Z", 9000.0), + ], + ["timestamp", "value"], +).withColumn("group", lit("series1")) + +# Run the Anomaly Detector service to look for irregular data +anamoly_detector = ( + SimpleDetectAnomalies() + .setSubscriptionKey(anomaly_key) + .setLocation(anomaly_loc) + .setTimestampCol("timestamp") + .setValueCol("value") + .setOutputCol("anomalies") + .setGroupbyCol("group") + .setGranularity("monthly") +) + +# Show the full results of the analysis with the anomalies marked as "True" +display( + anamoly_detector.transform(df).select("timestamp", "value", "anomalies.isAnomaly") +) +``` + +## Get information from arbitrary web APIs + +With HTTP on Spark, any web service can be used in your big data pipeline. In this example, we use the [World Bank API](http://api.worldbank.org/v2/country/) to get information about various countries around the world. 
+ + +```python +# Use any requests from the python requests library + + +def world_bank_request(country): + return Request( + "GET", "http://api.worldbank.org/v2/country/{}?format=json".format(country) + ) + + +# Create a dataframe that specifies which countries we want data on +df = spark.createDataFrame([("br",), ("usa",)], ["country"]).withColumn( + "request", http_udf(world_bank_request)(col("country")) +) + +# Much faster for big data because of the concurrency :) +client = ( + HTTPTransformer().setConcurrency(3).setInputCol("request").setOutputCol("response") +) + +# Get the body of the response + + +def get_response_body(resp): + return resp.entity.content.decode() + + +# Show the details of the country data returned +display( + client.transform(df).select( + "country", udf(get_response_body)(col("response")).alias("response") + ) +) +``` + +## Azure AI search sample + +In this example, we show how you can enrich data using Cognitive Skills and write to an Azure Search Index using SynapseML. 
+ + +```python +search_service = "mmlspark-azure-search" +search_index = "test-33467690" + +df = spark.createDataFrame( + [ + ( + "upload", + "0", + "https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg", + ), + ( + "upload", + "1", + "https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg", + ), + ], + ["searchAction", "id", "url"], +) + +tdf = ( + AnalyzeImage() + .setSubscriptionKey(service_key) + .setLocation(service_loc) + .setImageUrlCol("url") + .setOutputCol("analyzed") + .setErrorCol("errors") + .setVisualFeatures( + ["Categories", "Tags", "Description", "Faces", "ImageType", "Color", "Adult"] + ) + .transform(df) + .select("*", "analyzed.*") + .drop("errors", "analyzed") +) + +tdf.writeToAzureSearch( + subscriptionKey=search_key, + actionCol="searchAction", + serviceName=search_service, + indexName=search_index, + keyCol="id", +) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Analyze Celebrity Quotes.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Analyze Celebrity Quotes.md new file mode 100644 index 0000000000..649df4ef24 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Analyze Celebrity Quotes.md @@ -0,0 +1,162 @@ +--- +title: Quickstart - Analyze Celebrity Quotes +hide_title: true +status: stable +--- +# Celebrity Quote Analysis with The Azure AI Services + + + + +```python +from synapse.ml.services import * +from pyspark.ml import PipelineModel +from pyspark.sql.functions import col, udf +from pyspark.ml.feature import SQLTransformer +from synapse.ml.core.platform import find_secret + +# put your service keys here +ai_service_key = find_secret( + secret_name="ai-services-api-key", keyvault="mmlspark-build-keys" +) +ai_service_location = "eastus" +bing_search_key = find_secret( + secret_name="bing-search-key", keyvault="mmlspark-build-keys" +) +``` + +### Extracting celebrity quote images using 
Bing Image Search on Spark + +Here we define two Transformers to extract celebrity quote images. + + + + +```python +imgsPerBatch = 10 # the number of images Bing will return for each query +offsets = [ + (i * imgsPerBatch,) for i in range(100) +] # A list of offsets, used to page into the search results +bingParameters = spark.createDataFrame(offsets, ["offset"]) + +bingSearch = ( + BingImageSearch() + .setSubscriptionKey(bing_search_key) + .setOffsetCol("offset") + .setQuery("celebrity quotes") + .setCount(imgsPerBatch) + .setOutputCol("images") +) + +# Transformer that extracts and flattens the richly structured output of Bing Image Search into a simple URL column +getUrls = BingImageSearch.getUrlTransformer("images", "url") +``` + +### Recognizing Images of Celebrities +This block identifies the names of the celebrities for each of the images returned by the Bing Image Search. + + + + +```python +celebs = ( + RecognizeDomainSpecificContent() + .setSubscriptionKey(ai_service_key) + .setLocation(ai_service_location) + .setModel("celebrities") + .setImageUrlCol("url") + .setOutputCol("celebs") +) + +# Extract the first celebrity we see from the structured response +firstCeleb = SQLTransformer( + statement="SELECT *, celebs.result.celebrities[0].name as firstCeleb FROM __THIS__" +) +``` + +### Reading the quote from the image. +This stage performs OCR on the images to recognize the quotes. 
+ + + + +```python +from synapse.ml.stages import UDFTransformer + +recognizeText = ( + RecognizeText() + .setSubscriptionKey(ai_service_key) + .setLocation(ai_service_location) + .setImageUrlCol("url") + .setMode("Printed") + .setOutputCol("ocr") + .setConcurrency(5) +) + + +def getTextFunction(ocrRow): + if ocrRow is None: + return None + return "\n".join([line.text for line in ocrRow.recognitionResult.lines]) + + +# this transformer will extract a simpler string from the structured output of recognize text +getText = ( + UDFTransformer() + .setUDF(udf(getTextFunction)) + .setInputCol("ocr") + .setOutputCol("text") +) +``` + +### Understanding the Sentiment of the Quote + + + + +```python +sentimentTransformer = ( + TextSentiment() + .setLocation(ai_service_location) + .setSubscriptionKey(ai_service_key) + .setTextCol("text") + .setOutputCol("sentiment") +) + +# Extract the sentiment score from the API response body +getSentiment = SQLTransformer( + statement="SELECT *, sentiment.document.sentiment as sentimentLabel FROM __THIS__" +) +``` + +### Tying it all together + +Now that we have built the stages of our pipeline, it's time to chain them together into a single model that can be used to process batches of incoming data. + + + + +```python +from synapse.ml.stages import SelectColumns + +# Select the final columns +cleanupColumns = SelectColumns().setCols( + ["url", "firstCeleb", "text", "sentimentLabel"] +) + +celebrityQuoteAnalysis = PipelineModel( + stages=[ + bingSearch, + getUrls, + celebs, + firstCeleb, + recognizeText, + getText, + sentimentTransformer, + getSentiment, + cleanupColumns, + ] +) + +celebrityQuoteAnalysis.transform(bingParameters).show(5) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Analyze Text.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Analyze Text.md new file mode 100644 index 0000000000..5224114a11 --- /dev/null +++ 
b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Analyze Text.md @@ -0,0 +1,273 @@ +--- +title: Quickstart - Analyze Text +hide_title: true +status: stable +--- +# Analyze Text with SynapseML and Azure AI Language +[Azure AI Language](https://learn.microsoft.com/azure/ai-services/language-service/overview) is a cloud-based service that provides Natural Language Processing (NLP) features for understanding and analyzing text. Use this service to help build intelligent applications using the web-based Language Studio, REST APIs, and client libraries. +You can use SynapseML with Azure AI Language for **named entity recognition**, **language detection**, **entity linking**, **key phrase extraction**, **PII entity recognition** and **sentiment analysis**. + + +```python +from synapse.ml.services.language import AnalyzeText +from synapse.ml.core.platform import find_secret + +ai_service_key = find_secret( + secret_name="ai-services-api-key", keyvault="mmlspark-build-keys" +) +ai_service_location = "eastus" +``` + +## Named Entity Recognition +[Named Entity Recognition](https://learn.microsoft.com/azure/ai-services/language-service/named-entity-recognition/overview) is one of the features offered by Azure AI Language, a collection of machine learning and AI algorithms in the cloud for developing intelligent applications that involve written language. The NER feature can identify and categorize entities in unstructured text. For example: people, places, organizations, and quantities. Refer to [this article](https://learn.microsoft.com/azure/ai-services/language-service/named-entity-recognition/language-support?tabs=ga-api) for the full list of supported languages. + + +```python +df = spark.createDataFrame( + data=[ + ["en", "Dr. 
Smith has a very modern medical office, and she has great staff."], + ["en", "I had a wonderful trip to Seattle last week."], + ], + schema=["language", "text"], +) + +entity_recognition = ( + AnalyzeText() + .setKind("EntityRecognition") + .setLocation(ai_service_location) + .setSubscriptionKey(ai_service_key) + .setTextCol("text") + .setOutputCol("entities") + .setErrorCol("error") + .setLanguageCol("language") +) + +df_results = entity_recognition.transform(df) +display(df_results.select("language", "text", "entities.documents.entities")) +``` + +This cell should yield a result that looks like: + + +| language | text | entities | +|:--------------|:-----------|:------------| +| en | Dr. Smith has a very modern medical office, and she has great staff. | [{"category": "Person", "confidenceScore": 0.98, "length": 5, "offset": 4, "subcategory": null, "text": "Smith"}, {"category": "Location", "confidenceScore": 0.79, "length": 14, "offset": 28, "subcategory": "Structural", "text": "medical office"}, {"category": "PersonType", "confidenceScore": 0.85, "length": 5, "offset": 62, "subcategory": null, "text": "staff"}] | +| en | I had a wonderful trip to Seattle last week. | [{"category": "Event", "confidenceScore": 0.74, "length": 4, "offset": 18, "subcategory": null, "text": "trip"}, {"category": "Location", "confidenceScore": 1, "length": 7, "offset": 26, "subcategory": "GPE", "text": "Seattle"}, {"category": "DateTime", "confidenceScore": 0.8, "length": 9, "offset": 34, "subcategory": "DateRange", "text": "last week"}] | + +## LanguageDetection +[Language detection](https://learn.microsoft.com/azure/ai-services/language-service/language-detection/overview) can detect the language a document is written in. It returns a language code for a wide range of languages, variants, dialects, and some regional/cultural languages. 
Refer to [this article](https://learn.microsoft.com/azure/ai-services/language-service/language-detection/language-support) for the full list of supported languages. + + +```python +df = spark.createDataFrame( + data=[ + ["This is a document written in English."], + ["这是一份用中文写的文件"], + ], + schema=["text"], +) + +language_detection = ( + AnalyzeText() + .setKind("LanguageDetection") + .setLocation(ai_service_location) + .setSubscriptionKey(ai_service_key) + .setTextCol("text") + .setOutputCol("detected_language") + .setErrorCol("error") +) + +df_results = language_detection.transform(df) +display(df_results.select("text", "detected_language.documents.detectedLanguage")) +``` + +This cell should yield a result that looks like: + + +| text | detectedLanguage | +|:-----------|:------------| +| This is a document written in English. | {"name": "English", "iso6391Name": "en", "confidenceScore": 0.99} | +| 这是一份用中文写的文件 | {"name": "Chinese_Simplified", "iso6391Name": "zh_chs", "confidenceScore": 1} | + +## EntityLinking +[Entity linking](https://learn.microsoft.com/azure/ai-services/language-service/entity-linking/overview) identifies and disambiguates the identity of entities found in text. For example, in the sentence "We went to Seattle last week.", the word "Seattle" would be identified, with a link to more information on Wikipedia. [English and Spanish are supported](https://learn.microsoft.com/azure/ai-services/language-service/entity-linking/language-support). 
+ + +```python +df = spark.createDataFrame( + data=[ + ["Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975."], + ["We went to Seattle last week."], + ], + schema=["text"], +) + +entity_linking = ( + AnalyzeText() + .setKind("EntityLinking") + .setLocation(ai_service_location) + .setSubscriptionKey(ai_service_key) + .setTextCol("text") + .setOutputCol("entity_linking") + .setErrorCol("error") +) + +df_results = entity_linking.transform(df) +display(df_results.select("text", "entity_linking.documents.entities")) +``` + +This cell should yield a result that looks like: + + +| text | entities | +|:-----------|:------------| +| Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975. | [{"bingId": "a093e9b9-90f5-a3d5-c4b8-5855e1b01f85", "dataSource": "Wikipedia", "id": "Microsoft", "language": "en", "matches": [{"confidenceScore": 0.48, "length": 9, "offset": 0, "text": "Microsoft"}], "name": "Microsoft", "url": "https://en.wikipedia.org/wiki/Microsoft"}, {"bingId": "0d47c987-0042-5576-15e8-97af601614fa", "dataSource": "Wikipedia", "id": "Bill Gates", "language": "en", "matches": [{"confidenceScore": 0.52, "length": 10, "offset": 25, "text": "Bill Gates"}], "name": "Bill Gates", "url": "https://en.wikipedia.org/wiki/Bill_Gates"}, {"bingId": "df2c4376-9923-6a54-893f-2ee5a5badbc7", "dataSource": "Wikipedia", "id": "Paul Allen", "language": "en", "matches": [{"confidenceScore": 0.54, "length": 10, "offset": 40, "text": "Paul Allen"}], "name": "Paul Allen", "url": "https://en.wikipedia.org/wiki/Paul_Allen"}, {"bingId": "52535f87-235e-b513-54fe-c03e4233ac6e", "dataSource": "Wikipedia", "id": "April 4", "language": "en", "matches": [{"confidenceScore": 0.38, "length": 7, "offset": 54, "text": "April 4"}], "name": "April 4", "url": "https://en.wikipedia.org/wiki/April_4"}] | +| We went to Seattle last week. 
| [{"bingId": "5fbba6b8-85e1-4d41-9444-d9055436e473", "dataSource": "Wikipedia", "id": "Seattle", "language": "en", "matches": [{"confidenceScore": 0.17, "length": 7, "offset": 11, "text": "Seattle"}], "name": "Seattle", "url": "https://en.wikipedia.org/wiki/Seattle"}] | + +## KeyPhraseExtraction +[Key phrase extraction](https://learn.microsoft.com/en-us/azure/ai-services/language-service/key-phrase-extraction/overview) is one of the features offered by Azure AI Language, a collection of machine learning and AI algorithms in the cloud for developing intelligent applications that involve written language. Use key phrase extraction to quickly identify the main concepts in text. For example, in the text "The food was delicious and the staff were wonderful.", key phrase extraction will return the main topics: "food" and "wonderful staff". Refer to [this article](https://learn.microsoft.com/azure/ai-services/language-service/key-phrase-extraction/language-support) for the full list of supported languages. + + +```python +df = spark.createDataFrame( + data=[ + ["Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975."], + ["Dr. Smith has a very modern medical office, and she has great staff."], + ], + schema=["text"], +) + +key_phrase_extraction = ( + AnalyzeText() + .setKind("KeyPhraseExtraction") + .setLocation(ai_service_location) + .setSubscriptionKey(ai_service_key) + .setTextCol("text") + .setOutputCol("key_phrase_extraction") + .setErrorCol("error") +) + +df_results = key_phrase_extraction.transform(df) +display(df_results.select("text", "key_phrase_extraction.documents.keyPhrases")) +``` + +This cell should yield a result that looks like: + + +| text | keyPhrases | +|:-----------|:------------| +| Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975. | ["Bill Gates", "Paul Allen", "Microsoft", "April"] | +| Dr. Smith has a very modern medical office, and she has great staff. | ["modern medical office", "Dr. 
Smith", "great staff"] | + +## PiiEntityRecognition +The PII detection feature can identify, categorize, and redact sensitive information in unstructured text. For example: phone numbers, email addresses, and forms of identification. The method for utilizing PII in conversations is different than other use cases, and articles for this use have been separated. Refer to [this article](https://learn.microsoft.com/azure/ai-services/language-service/personally-identifiable-information/language-support?tabs=documents) for the full list of supported languages. + + +```python +df = spark.createDataFrame( + data=[ + ["Call our office at 312-555-1234, or send an email to support@contoso.com"], + ["Dr. Smith has a very modern medical office, and she has great staff."], + ], + schema=["text"], +) + +pii_entity_recognition = ( + AnalyzeText() + .setKind("PiiEntityRecognition") + .setLocation(ai_service_location) + .setSubscriptionKey(ai_service_key) + .setTextCol("text") + .setOutputCol("pii_entity_recognition") + .setErrorCol("error") +) + +df_results = pii_entity_recognition.transform(df) +display(df_results.select("text", "pii_entity_recognition.documents.entities")) +``` + +This cell should yield a result that looks like: + + +| text | entities | +|:-----------|:------------| +| Call our office at 312-555-1234, or send an email to support@contoso.com | [{"category": "PhoneNumber", "confidenceScore": 0.8, "length": 12, "offset": 19, "subcategory": null, "text": "312-555-1234"}, {"category": "Email", "confidenceScore": 0.8, "length": 19, "offset": 53, "subcategory": null, "text": "support@contoso.com"}] | +| Dr. Smith has a very modern medical office, and she has great staff. 
| [{"category": "Person", "confidenceScore": 0.93, "length": 5, "offset": 4, "subcategory": null, "text": "Smith"}] | + +## SentimentAnalysis +[Sentiment analysis](https://learn.microsoft.com/en-us/azure/ai-services/language-service/sentiment-opinion-mining/overview) and opinion mining are features offered by the Language service, a collection of machine learning and AI algorithms in the cloud for developing intelligent applications that involve written language. These features help you find out what people think of your brand or topic by mining text for clues about positive or negative sentiment, and can associate them with specific aspects of the text. Refer to [this article](https://learn.microsoft.com/azure/ai-services/language-service/sentiment-opinion-mining/language-support) for the full list of supported languages. + + +```python +df = spark.createDataFrame( + data=[ + ["The food and service were unacceptable. The concierge was nice, however."], + ["It taste great."], + ], + schema=["text"], +) + +sentiment_analysis = ( + AnalyzeText() + .setKind("SentimentAnalysis") + .setLocation(ai_service_location) + .setSubscriptionKey(ai_service_key) + .setTextCol("text") + .setOutputCol("sentiment_analysis") + .setErrorCol("error") +) + +df_results = sentiment_analysis.transform(df) +display(df_results.select("text", "sentiment_analysis.documents.sentiment")) +``` + +This cell should yield a result that looks like: + + +| text | sentiment | +|:-----------|:------------| +| The food and service were unacceptable. The concierge was nice, however. | mixed | +| It tastes great. 
| positive | + +## Analyze Text with TextAnalyze + +Text Analyze is Deprecated, please use AnalyzeText instead + + +```python +df = spark.createDataFrame( + data=[ + ["en", "Hello Seattle"], + ["en", "There once was a dog who lived in London and thought she was a human"], + ], + schema=["language", "text"], +) +``` + + +```python +from synapse.ml.services import * + +text_analyze = ( + TextAnalyze() + .setLocation(ai_service_location) + .setSubscriptionKey(ai_service_key) + .setTextCol("text") + .setOutputCol("textAnalysis") + .setErrorCol("error") + .setLanguageCol("language") + .setEntityRecognitionParams( + {"model-version": "latest"} + ) # Can pass parameters to each model individually + .setIncludePii(False) # Users can manually exclude tasks to speed up analysis + .setIncludeEntityLinking(False) + .setIncludeSentimentAnalysis(False) +) + +df_results = text_analyze.transform(df) +``` + + +```python +display(df_results) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Create Audiobooks.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Create Audiobooks.md new file mode 100644 index 0000000000..6587811695 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Create Audiobooks.md @@ -0,0 +1,121 @@ +--- +title: Quickstart - Create Audiobooks +hide_title: true +status: stable +--- +# Create audiobooks using neural Text to speech + +## Step 1: Load libraries and add service information + + +```python +from synapse.ml.core.platform import * + +if running_on_synapse(): + from notebookutils import mssparkutils + +# Fill this in with your Azure AI service information +service_key = find_secret( + secret_name="ai-services-api-key", keyvault="mmlspark-build-keys" +) # Replace this line with a string like service_key = "dddjnbdkw9329" +service_loc = "eastus" + +storage_container = "audiobooks" +storage_key = find_secret( + 
secret_name="madtest-storage-key", keyvault="mmlspark-build-keys" +) +storage_account = "anomalydetectiontest" +``` + +## Step 2: Attach the storage account to hold the audio files + + +```python +spark_key_setting = f"fs.azure.account.key.{storage_account}.blob.core.windows.net" +spark.sparkContext._jsc.hadoopConfiguration().set(spark_key_setting, storage_key) +``` + + +```python +import os +from os.path import exists, join + +mount_path = f"wasbs://{storage_container}@{storage_account}.blob.core.windows.net/" +if running_on_synapse(): + mount_dir = join("/synfs", mssparkutils.env.getJobId(), storage_container) + if not exists(mount_dir): + mssparkutils.fs.mount( + mount_path, f"/{storage_container}", {"accountKey": storage_key} + ) +elif running_on_databricks(): + if not exists(f"/dbfs/mnt/{storage_container}"): + dbutils.fs.mount( + source=mount_path, + mount_point=f"/mnt/{storage_container}", + extra_configs={spark_key_setting: storage_key}, + ) +``` + +## Step 3: Read in text data + + +```python +from pyspark.sql.functions import udf + + +@udf +def make_audio_filename(part): + return f"wasbs://{storage_container}@{storage_account}.blob.core.windows.net/alice_in_wonderland/part_{part}.wav" + + +df = ( + spark.read.parquet( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/alice_in_wonderland.parquet" + ) + .repartition(10) + .withColumn("filename", make_audio_filename("part")) +) + +display(df) +``` + +## Step 4: Synthesize audio from text + +
+ +
+ + +```python +from synapse.ml.services.speech import TextToSpeech + +tts = ( + TextToSpeech() + .setSubscriptionKey(service_key) + .setTextCol("text") + .setLocation(service_loc) + .setErrorCol("error") + .setVoiceName("en-US-SteffanNeural") + .setOutputFileCol("filename") +) + +audio = tts.transform(df).cache() +display(audio) +``` + +## Step 5: Listen to an audio file + + +```python +from IPython.display import Audio + + +def get_audio_file(num): + if running_on_databricks(): + return f"/dbfs/mnt/{storage_container}/alice_in_wonderland/part_{num}.wav" + else: + return join(mount_dir, f"alice_in_wonderland/part_{num}.wav") + + +Audio(filename=get_audio_file(1)) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Create a Visual Search Engine.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Create a Visual Search Engine.md new file mode 100644 index 0000000000..b3b1b66f44 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Create a Visual Search Engine.md @@ -0,0 +1,95 @@ +--- +title: Quickstart - Create a Visual Search Engine +hide_title: true +status: stable +--- +

Creating a searchable Art Database with The MET's open-access collection

+ +In this example, we show how you can enrich data using Cognitive Skills and write to an Azure Search Index using SynapseML. We use a subset of The MET's open-access collection and enrich it by passing it through 'Describe Image' and a custom 'Image Similarity' skill. The results are then written to a searchable index. + + +```python +import os, sys, time, json, requests +from pyspark.sql.functions import lit, udf, col, split +from synapse.ml.core.platform import * + +ai_service_key = find_secret( + secret_name="ai-services-api-key", keyvault="mmlspark-build-keys" +) +ai_service_loc = "eastus" +azure_search_key = find_secret( + secret_name="azure-search-key", keyvault="mmlspark-build-keys" +) +search_service = "mmlspark-azure-search" +search_index = "test" +``` + + +```python +data = ( + spark.read.format("csv") + .option("header", True) + .load("wasbs://publicwasb@mmlspark.blob.core.windows.net/metartworks_sample.csv") + .withColumn("searchAction", lit("upload")) + .withColumn("Neighbors", split(col("Neighbors"), ",").cast("array")) + .withColumn("Tags", split(col("Tags"), ",").cast("array")) + .limit(25) +) +``` + + + + +```python +from synapse.ml.services import AnalyzeImage +from synapse.ml.stages import SelectColumns + +# define pipeline +describeImage = ( + AnalyzeImage() + .setSubscriptionKey(ai_service_key) + .setLocation(ai_service_loc) + .setImageUrlCol("PrimaryImageUrl") + .setOutputCol("RawImageDescription") + .setErrorCol("Errors") + .setVisualFeatures( + ["Categories", "Description", "Faces", "ImageType", "Color", "Adult"] + ) + .setConcurrency(5) +) + +df2 = ( + describeImage.transform(data) + .select("*", "RawImageDescription.*") + .drop("Errors", "RawImageDescription") +) +``` + + + +Before writing the results to a Search Index, you must define a schema which must specify the name, type, and attributes of each field in your index. 
Refer to [Create a basic index in Azure Search](https://docs.microsoft.com/azure/search/search-what-is-an-index) for more information. + + +```python +from synapse.ml.services import * + +df2.writeToAzureSearch( + subscriptionKey=azure_search_key, + actionCol="searchAction", + serviceName=search_service, + indexName=search_index, + keyCol="ObjectID", +) +``` + +The Search Index can be queried using the [Azure Search REST API](https://docs.microsoft.com/rest/api/searchservice/) by sending GET or POST requests and specifying query parameters that give the criteria for selecting matching documents. For more information on querying, refer to [Query your Azure Search index using the REST API](https://docs.microsoft.com/rest/api/searchservice/Search-Documents). + + +```python +url = "https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06".format( + search_service, search_index +) +requests.post( + url, json={"search": "Glass"}, headers={"api-key": azure_search_key} +).json() +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.md new file mode 100644 index 0000000000..4408fe7e36 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.md @@ -0,0 +1,337 @@ +--- +title: Quickstart - Document Question and Answering with PDFs +hide_title: true +status: stable +--- +# A Guide to Q&A on PDF Documents + +## Introduction +In this notebook, we'll demonstrate how to develop a context-aware question answering framework for any form of a document using [OpenAI models](https://azure.microsoft.com/products/ai-services/openai-service), [SynapseML](https://microsoft.github.io/SynapseML/) and [Azure AI Services](https://azure.microsoft.com/products/ai-services/). 
In this notebook, we assume that PDF documents are the source of data; however, the same framework can be easily extended to other document formats too. + +We’ll cover the following key steps: + +1. Preprocessing PDF Documents: Learn how to load the PDF documents into a Spark DataFrame, read the documents using the [Azure AI Document Intelligence](https://azure.microsoft.com/products/ai-services/ai-document-intelligence) in Azure AI Services, and use SynapseML to split the documents into chunks. +2. Embedding Generation and Storage: Learn how to generate embeddings for the chunks using SynapseML and [Azure OpenAI Services](https://azure.microsoft.com/products/ai-services/openai-service), store the embeddings in a vector store using [Azure Cognitive Search](https://azure.microsoft.com/products/search), and search the vector store to answer the user’s question. +3. Question Answering Pipeline: Learn how to retrieve relevant documents based on the user’s question and provide the answer using [Langchain](https://python.langchain.com/en/latest/index.html#). + +We start by installing the necessary Python libraries. + + +```python +%pip install openai==0.28.1 langchain==0.0.331 +``` + +### Step 1: Provide the keys for Azure AI Services and Azure OpenAI to authenticate the applications. + +To authenticate Azure AI Services and Azure OpenAI applications, you need to provide the respective API keys. Here is an example of how you can provide the keys in Python code. The `find_secret()` function uses Azure Key Vault to get the API keys; however, you can directly paste your own keys there. 
+ + +```python +from pyspark.sql import SparkSession +from synapse.ml.core.platform import find_secret + +ai_services_key = find_secret( + secret_name="ai-services-api-key", keyvault="mmlspark-build-keys" +) +ai_services_location = "eastus" + +# Fill in the following lines with your Azure service information +aoai_service_name = "synapseml-openai-2" +aoai_endpoint = f"https://{aoai_service_name}.openai.azure.com/" +aoai_key = find_secret(secret_name="openai-api-key-2", keyvault="mmlspark-build-keys") +aoai_deployment_name_embeddings = "text-embedding-ada-002" +aoai_deployment_name_query = "gpt-35-turbo" +aoai_model_name_query = "gpt-35-turbo" + +# Azure Cognitive Search +cogsearch_name = "mmlspark-azure-search" +cogsearch_index_name = "examplevectorindex" +cogsearch_api_key = find_secret( + secret_name="azure-search-key", keyvault="mmlspark-build-keys" +) +``` + +### Step 2: Load the PDF documents into a Spark DataFrame. + +For this tutorial, we will be using NASA's [Earth](https://www.nasa.gov/sites/default/files/atoms/files/earth_book_2019_tagged.pdf) and [Earth at Night](https://www.nasa.gov/sites/default/files/atoms/files/earth_at_night_508.pdf) e-books. To load PDF documents into a Spark DataFrame, you can use the ```spark.read.format("binaryFile")``` method provided by Apache Spark. + + +```python +from pyspark.sql.functions import udf +from pyspark.sql.types import StringType + +document_path = "wasbs://publicwasb@mmlspark.blob.core.windows.net/NASAEarth" # path to your document +df = spark.read.format("binaryFile").load(document_path).limit(10).cache() +``` + +This code will read the PDF documents and create a Spark DataFrame named df with the contents of the PDFs. The DataFrame will have a schema that represents the structure of the PDF documents, including their textual content. + +Let's take a glimpse at the contents of the e-books we are working with. 
Below are some screenshots that showcase the essence of the books; as you can see they contain information about the Earth. + + + + +##### Display the raw data from the PDF documents + + +```python +# Show the dataframe without the content +display(df.drop("content")) +``` + +### Step 3: Read the documents using Azure AI Document Intelligence. + +We utilize [SynapseML](https://microsoft.github.io/SynapseML/), an ecosystem of tools designed to enhance the distributed computing framework [Apache Spark](https://github.com/apache/spark). SynapseML introduces advanced networking capabilities to the Spark ecosystem and offers user-friendly SparkML transformers for various [Azure AI Services](https://azure.microsoft.com/products/ai-services). + +Additionally, we employ AnalyzeDocument from Azure AI Services to extract the complete document content and present it in the designated columns called "output_content" and "paragraphs." + + +```python +from synapse.ml.services.form import AnalyzeDocument +from pyspark.sql.functions import col + +analyze_document = ( + AnalyzeDocument() + .setPrebuiltModelId("prebuilt-layout") + .setSubscriptionKey(ai_services_key) + .setLocation(ai_services_location) + .setImageBytesCol("content") + .setOutputCol("result") + .setPages( + "1-15" + ) # Here we are reading the first 15 pages of the documents for demo purposes +) + +analyzed_df = ( + analyze_document.transform(df) + .withColumn("output_content", col("result.analyzeResult.content")) + .withColumn("paragraphs", col("result.analyzeResult.paragraphs")) +).cache() +``` + +We can observe the analyzed Spark DataFrame named ```analyzed_df``` using the following code. Note that we drop the "content" column as it is not needed anymore. + + +```python +analyzed_df = analyzed_df.drop("content") +display(analyzed_df) +``` + +### Step 4: Split the documents into chunks. 
+ +After analyzing the document, we leverage SynapseML’s PageSplitter to divide the documents into smaller sections, which are subsequently stored in the “chunks” column. This allows for more granular representation and processing of the document content. + + +```python +from synapse.ml.featurize.text import PageSplitter + +ps = ( + PageSplitter() + .setInputCol("output_content") + .setMaximumPageLength(4000) + .setMinimumPageLength(3000) + .setOutputCol("chunks") +) + +splitted_df = ps.transform(analyzed_df) +display(splitted_df) +``` + +Note that the chunks for each document are presented in a single row inside an array. In order to embed all the chunks in the following cells, we need to have each chunk in a separate row. To accomplish that, we first explode these arrays so there is only one chunk in each row, then filter the Spark DataFrame in order to only keep the path to the document and the chunk in a single row. + + +```python +# Each column contains many chunks for the same document as a vector. +# Explode will distribute and replicate the content of a vector across multiple rows +from pyspark.sql.functions import explode, col + +exploded_df = splitted_df.select("path", explode(col("chunks")).alias("chunk")).select( + "path", "chunk" +) +display(exploded_df) +``` + +### Step 5: Generate Embeddings. + +To produce embeddings for each chunk, we utilize both SynapseML and Azure OpenAI Service. By integrating the Azure OpenAI service with SynapseML, we can leverage the power of the Apache Spark distributed computing framework to process numerous prompts using the OpenAI service. This integration enables the SynapseML embedding client to generate embeddings in a distributed manner, enabling efficient processing of large volumes of data. 
If you're interested in applying large language models at a distributed scale using Azure OpenAI and Azure Synapse Analytics, you can refer to [this approach](https://microsoft.github.io/SynapseML/docs/Explore%20Algorithms/OpenAI/). For more detailed information on generating embeddings with Azure OpenAI, you can look [here]( https://learn.microsoft.com/azure/cognitive-services/openai/how-to/embeddings?tabs=console). + + +```python +from synapse.ml.services.openai import OpenAIEmbedding + +embedding = ( + OpenAIEmbedding() + .setSubscriptionKey(aoai_key) + .setDeploymentName(aoai_deployment_name_embeddings) + .setCustomServiceName(aoai_service_name) + .setTextCol("chunk") + .setErrorCol("error") + .setOutputCol("embeddings") +) + +df_embeddings = embedding.transform(exploded_df) + +display(df_embeddings) +``` + +### Step 6: Store the embeddings in Azure Cognitive Search Vector Store. + +[Azure Cognitive Search](https://learn.microsoft.com/azure/search/search-what-is-azure-search) offers a user-friendly interface for creating a vector database, as well as storing and retrieving data using vector search. If you're interested in learning more about vector search, you can look [here](https://github.com/Azure/cognitive-search-vector-pr/tree/main). + + +Storing data in the AzureCogSearch vector database involves two main steps: + +Creating the Index: The first step is to establish the index or schema of the vector database. This entails defining the structure and properties of the data that will be stored and indexed in the vector database. + +Adding Chunked Documents and Embeddings: The second step involves adding the chunked documents, along with their corresponding embeddings, to the vector datastore. This allows for efficient storage and retrieval of the data using vector search capabilities. 
+ +By following these steps, you can effectively store your chunked documents and their associated embeddings in the AzureCogSearch vector database, enabling seamless retrieval of relevant information through vector search functionality. + + +```python +from pyspark.sql.functions import monotonically_increasing_id +from pyspark.sql.functions import lit + +df_embeddings = ( + df_embeddings.drop("error") + .withColumn( + "idx", monotonically_increasing_id().cast("string") + ) # create index ID for ACS + .withColumn("searchAction", lit("upload")) +) +``` + + +```python +from synapse.ml.services import writeToAzureSearch +import json + +df_embeddings.writeToAzureSearch( + subscriptionKey=cogsearch_api_key, + actionCol="searchAction", + serviceName=cogsearch_name, + indexName=cogsearch_index_name, + keyCol="idx", + vectorCols=json.dumps([{"name": "embeddings", "dimension": 1536}]), +) +``` + +### Step 7: Ask a Question. + +After processing the document, we can proceed to pose a question. We will use [SynapseML](https://microsoft.github.io/SynapseML/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding/) to convert the user's question into an embedding and then utilize cosine similarity to retrieve the top K document chunks that closely match the user's question. It's worth mentioning that alternative similarity metrics can also be employed. + + +```python +user_question = "What did the astronaut Edgar Mitchell call Earth?" 
+retrieve_k = 2 # Retrieve the top 2 documents from vector database +``` + + +```python +import requests + +# Ask a question and convert to embeddings + + +def gen_question_embedding(user_question): + # Convert question to embedding using synapseML + from synapse.ml.services.openai import OpenAIEmbedding + + df_ques = spark.createDataFrame([(user_question, 1)], ["questions", "dummy"]) + embedding = ( + OpenAIEmbedding() + .setSubscriptionKey(aoai_key) + .setDeploymentName(aoai_deployment_name_embeddings) + .setCustomServiceName(aoai_service_name) + .setTextCol("questions") + .setErrorCol("errorQ") + .setOutputCol("embeddings") + ) + df_ques_embeddings = embedding.transform(df_ques) + row = df_ques_embeddings.collect()[0] + question_embedding = row.embeddings.tolist() + return question_embedding + + +def retrieve_k_chunk(k, question_embedding): + # Retrieve the top K entries + url = f"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/search?api-version=2023-07-01-Preview" + + payload = json.dumps( + {"vector": {"value": question_embedding, "fields": "embeddings", "k": k}} + ) + headers = { + "Content-Type": "application/json", + "api-key": cogsearch_api_key, + } + + response = requests.request("POST", url, headers=headers, data=payload) + output = json.loads(response.text) + print(response.status_code) + return output + + +# Generate embeddings for the question and retrieve the top k document chunks +question_embedding = gen_question_embedding(user_question) +output = retrieve_k_chunk(retrieve_k, question_embedding) +``` + +### Step 8: Respond to a User’s Question. + +To provide a response to the user's question, we will utilize the [LangChain](https://python.langchain.com/en/latest/index.html) framework. With the LangChain framework we will augment the retrieved documents with respect to the user's question. Following this, we can request a response to the user's question from our framework. 
+ + +```python +# Import necessary libraries and set up OpenAI +from langchain.llms import AzureOpenAI +from langchain import PromptTemplate +from langchain.chains import LLMChain +import openai + +openai.api_type = "azure" +openai.api_base = aoai_endpoint +openai.api_version = "2022-12-01" +openai.api_key = aoai_key +``` + +We can now wrap up the Q&A journey by asking a question and checking the answer. You will see that Edgar Mitchell called Earth "a sparkling blue and white jewel"! + + +```python +# Define a Question Answering chain function using LangChain +def qa_chain_func(): + + # Define llm model + llm = AzureOpenAI( + deployment_name=aoai_deployment_name_query, + model_name=aoai_model_name_query, + openai_api_key=aoai_key, + openai_api_version="2022-12-01", + ) + + # Write a preprompt with context and query as variables + template = """ + context :{context} + Answer the question based on the context above. If the + information to answer the question is not present in the given context then reply "I don't know". 
+ Question: {query} + Answer: """ + + # Define a prompt template + prompt_template = PromptTemplate( + input_variables=["context", "query"], template=template + ) + # Define a chain + qa_chain = LLMChain(llm=llm, prompt=prompt_template) + return qa_chain + + +# Concatenate the content of retrieved documents +context = [i["chunk"] for i in output["value"]] + +# Make a Question Answering chain and pass it the context and the user's question +qa_chain = qa_chain_func() +answer = qa_chain.run({"context": context, "query": user_question}) + +print(answer) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Flooding Risk.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Flooding Risk.md new file mode 100644 index 0000000000..aa80a0aff4 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Flooding Risk.md @@ -0,0 +1,193 @@ +--- +title: Quickstart - Flooding Risk +hide_title: true +status: stable +--- +# Visualizing Customer addresses on a flood plain + +King County (WA) publishes flood plain data as well as tax parcel data. We can use the addresses in the tax parcel data and use the geocoder to calculate coordinates. Using these coordinates and the flood plain data, we can enrich our dataset with a flag indicating whether the house is in a flood zone or not. + +The following data has been sourced from King County's Open data portal. [_Link_](https://data.kingcounty.gov/) +1. [Address Data](https://mmlspark.blob.core.windows.net/publicwasb/maps/KingCountyAddress.csv) +1. [Flood plains](https://mmlspark.blob.core.windows.net/publicwasb/maps/KingCountyFloodPlains.geojson) + +For this demonstration, please follow the instructions on setting up your Azure Maps account from the overview notebook. + +## Prerequisites +1. 
Upload the flood plains data as map data to your creator resource + + +```python +import json +import time +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + +# Configure more resiliant requests to stop flakiness +retry_strategy = Retry( + total=3, + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods=["HEAD", "GET", "PUT", "DELETE", "OPTIONS", "TRACE"], +) +adapter = HTTPAdapter(max_retries=retry_strategy) +http = requests.Session() +http.mount("https://", adapter) +http.mount("http://", adapter) +``` + + +```python +from synapse.ml.core.platform import * + +# Azure Maps account key +maps_key = find_secret( + secret_name="azuremaps-api-key", keyvault="mmlspark-build-keys" +) # Replace this with your azure maps key + +# Creator Geo prefix +# for this example, assuming that the creator resource is created in `EAST US 2`. +atlas_geo_prefix = "us" + +# Load flood plains data +flood_plain_geojson = http.get( + "https://mmlspark.blob.core.windows.net/publicwasb/maps/KingCountyFloodPlains.geojson" +).content + +# Upload this flood plains data to your maps/creator account. This is a Long-Running async operation and takes approximately 15~30 seconds to complete +r = http.post( + f"https://{atlas_geo_prefix}.atlas.microsoft.com/mapData/upload?api-version=1.0&dataFormat=geojson&subscription-key={maps_key}", + json=json.loads(flood_plain_geojson), +) + +# Poll for resource upload completion +resource_location = r.headers.get("location") +for _ in range(20): + resource = json.loads( + http.get(f"{resource_location}&subscription-key={maps_key}").content + ) + status = resource["status"].lower() + if status == "running": + time.sleep(5) # wait in a polling loop + elif status == "succeeded": + break + else: + raise ValueError("Unknown status {}".format(status)) + +# Once the above operation returns a HTTP 201, get the user_data_id of the flood plains data, you uploaded to your map account. 
+user_data_id_resource_url = resource["resourceLocation"] +user_data_id = json.loads( + http.get(f"{user_data_id_resource_url}&subscription-key={maps_key}").content +)["udid"] +``` + +Now that we have the flood plains data setup in our maps account, we can use the `CheckPointInPolygon` function to check if a location `(lat,lon)` coordinate is in a flood zone. + +### Load address data: + + +```python +data = spark.read.option("header", "true").csv( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/maps/KingCountyAddress.csv" +) + +# Visualize incoming schema +print("Schema:") +data.printSchema() + +# Choose a subset of the data for this example +subset_data = data.limit(50) +display(subset_data) +``` + +### Wire-up the Address Geocoder + +We will use the address geocoder to enrich the dataset with location coordinates of the addresses. + + +```python +from pyspark.sql.functions import col +from synapse.ml.stages import FixedMiniBatchTransformer, FlattenBatch +from synapse.ml.services.geospatial import * + + +def extract_location_fields(df): + # Use this function to select only lat/lon columns into the dataframe + return df.select( + col("*"), + col("output.response.results") + .getItem(0) + .getField("position") + .getField("lat") + .alias("Latitude"), + col("output.response.results") + .getItem(0) + .getField("position") + .getField("lon") + .alias("Longitude"), + ).drop("output") + + +# Azure Maps geocoder to enhance the dataframe with location data +geocoder = ( + AddressGeocoder() + .setSubscriptionKey(maps_key) + .setAddressCol("FullAddress") + .setOutputCol("output") +) + +# Set up a fixed mini batch transformer to geocode addresses +batched_dataframe = geocoder.transform( + FixedMiniBatchTransformer().setBatchSize(10).transform(subset_data.coalesce(1)) +) +geocoded_addresses = extract_location_fields( + FlattenBatch().transform(batched_dataframe) +) + +# Display the results +display(geocoded_addresses) +``` + +Now that we have geocoded the addresses, we 
can now use the `CheckPointInPolygon` function to check if a property is in a flood zone or not. + +### Setup Check Point In Polygon + + +```python +def extract_point_in_polygon_result_fields(df): + # Use this function to select only lat/lon columns into the dataframe + return df.select( + col("*"), + col("output.result.pointInPolygons").alias("In Polygon"), + col("output.result.intersectingGeometries").alias("Intersecting Polygons"), + ).drop("output") + + +check_point_in_polygon = ( + CheckPointInPolygon() + .setSubscriptionKey(maps_key) + .setGeography(atlas_geo_prefix) + .setUserDataIdentifier(user_data_id) + .setLatitudeCol("Latitude") + .setLongitudeCol("Longitude") + .setOutputCol("output") +) + + +flood_plain_addresses = extract_point_in_polygon_result_fields( + check_point_in_polygon.transform(geocoded_addresses) +) + +# Display the results +display(flood_plain_addresses) +``` + +### Cleanup Uploaded User Data (Optional) +You can (optionally) delete the uploaded geojson polygon. + + +```python +res = http.delete( + f"https://{atlas_geo_prefix}.atlas.microsoft.com/mapData/{user_data_id}?api-version=1.0&subscription-key={maps_key}" +) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Predictive Maintenance.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Predictive Maintenance.md new file mode 100644 index 0000000000..3e7f9eda79 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/AI Services/Quickstart - Predictive Maintenance.md @@ -0,0 +1,188 @@ +--- +title: Quickstart - Predictive Maintenance +hide_title: true +status: stable +--- +# Recipe: Predictive maintenance with the Azure AI Services for Big Data + +This recipe shows how you can use Azure Synapse Analytics and Azure AI services on Apache Spark for predictive maintenance of IoT devices. 
We'll follow along with the [CosmosDB and Synapse Link](https://github.com/Azure-Samples/cosmosdb-synapse-link-samples) sample. To keep things simple, in this recipe we'll read the data straight from a CSV file rather than getting streamed data through CosmosDB and Synapse Link. We strongly encourage you to look over the Synapse Link sample. + +## Important + +Starting on the 20th of September, 2023 you won’t be able to create new Anomaly Detector resources. The Anomaly Detector service is being retired on the 1st of October, 2026. + +## Hypothetical scenario + +The hypothetical scenario is a Power Plant, where IoT devices are monitoring [steam turbines](https://en.wikipedia.org/wiki/Steam_turbine). The IoTSignals collection has Revolutions per minute (RPM) and Megawatts (MW) data for each turbine. Signals from steam turbines are being analyzed and anomalous signals are detected. + +There could be outliers in the data at random intervals. In those situations, RPM values will go up and MW output will go down, for circuit protection. The idea is to see the data varying at the same time, but with different signals. + +## Prerequisites + +* An Azure subscription - [Create one for free](https://azure.microsoft.com/free/) +* [Azure Synapse workspace](https://docs.microsoft.com/azure/synapse-analytics/get-started-create-workspace) configured with a [serverless Apache Spark pool](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-analyze-spark) + +## Setup + +### Create an Anomaly Detector resource + +Azure AI Services are represented by Azure resources that you subscribe to. Create a resource for Anomaly Detector using the [Azure portal](https://docs.microsoft.com/azure/cognitive-services/cognitive-services-apis-create-account?tabs=multiservice%2Clinux) or [Azure CLI](https://learn.microsoft.com/azure/ai-services/multi-service-resource). You can also: + +- View an existing resource in the [Azure portal](https://portal.azure.com/). 
+ +Make note of the endpoint and the key for this resource; you'll need them in this guide. + +## Enter your service keys + +Let's start by adding your key and location. + + +```python +import os +from synapse.ml.core.platform import find_secret + +service_key = find_secret( + secret_name="anomaly-api-key", keyvault="mmlspark-build-keys" +) # Paste your anomaly detector key here +location = "westus2" # Paste your anomaly detector location here +``` + +## Read data into a DataFrame + +Next, let's read the IoTSignals file into a DataFrame. Open a new notebook in your Synapse workspace and create a DataFrame from the file. + + +```python +df_signals = spark.read.csv( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/iot/IoTSignals.csv", + header=True, + inferSchema=True, +) +``` + +### Run anomaly detection using AI services on Spark + +The goal is to find instances where the signals from the IoT devices were outputting anomalous values so that we can see when something is going wrong and do predictive maintenance. 
To do that, let's use Anomaly Detector on Spark: + + +```python +from pyspark.sql.functions import col, struct +from synapse.ml.services.anomaly import SimpleDetectAnomalies +from synapse.ml.core.spark import FluentAPI + +detector = ( + SimpleDetectAnomalies() + .setSubscriptionKey(service_key) + .setLocation(location) + .setOutputCol("anomalies") + .setGroupbyCol("grouping") + .setSensitivity(95) + .setGranularity("secondly") +) + +df_anomaly = ( + df_signals.where(col("unitSymbol") == "RPM") + .withColumn("timestamp", col("dateTime").cast("string")) + .withColumn("value", col("measureValue").cast("double")) + .withColumn("grouping", struct("deviceId")) + .mlTransform(detector) +).cache() + +df_anomaly.createOrReplaceTempView("df_anomaly") +``` + +Let's take a look at the data: + + +```python +df_anomaly.select("timestamp", "value", "deviceId", "anomalies.isAnomaly").show(3) +``` + +This cell should yield a result that looks like: + +| timestamp | value | deviceId | isAnomaly | +|:--------------------|--------:|:-----------|:------------| +| 2020-05-01 18:33:51 | 3174 | dev-7 | False | +| 2020-05-01 18:33:52 | 2976 | dev-7 | False | +| 2020-05-01 18:33:53 | 2714 | dev-7 | False | + +## Visualize anomalies for one of the devices + +IoTSignals.csv has signals from multiple IoT devices. We'll focus on a specific device and visualize anomalous outputs from the device. 
+ + +```python +df_anomaly_single_device = spark.sql( + """ +select + timestamp, + measureValue, + anomalies.expectedValue, + anomalies.expectedValue + anomalies.upperMargin as expectedUpperValue, + anomalies.expectedValue - anomalies.lowerMargin as expectedLowerValue, + case when anomalies.isAnomaly=true then 1 else 0 end as isAnomaly +from + df_anomaly +where deviceid = 'dev-1' and timestamp < '2020-04-29' +order by timestamp +limit 200""" +) +``` + +Now that we have created a dataframe that represents the anomalies for a particular device, we can visualize these anomalies: + + +```python +import matplotlib.pyplot as plt +from pyspark.sql.functions import col + +adf = df_anomaly_single_device.toPandas() +adf_subset = df_anomaly_single_device.where(col("isAnomaly") == 1).toPandas() + +plt.figure(figsize=(23, 8)) +plt.plot( + adf["timestamp"], + adf["expectedUpperValue"], + color="darkred", + linestyle="solid", + linewidth=0.25, + label="UpperMargin", +) +plt.plot( + adf["timestamp"], + adf["expectedValue"], + color="darkgreen", + linestyle="solid", + linewidth=2, + label="Expected Value", +) +plt.plot( + adf["timestamp"], + adf["measureValue"], + "b", + color="royalblue", + linestyle="solid", + linewidth=2, + label="Actual", +) +plt.plot( + adf["timestamp"], + adf["expectedLowerValue"], + color="black", + linestyle="solid", + linewidth=0.25, + label="Lower Margin", +) +plt.plot(adf_subset["timestamp"], adf_subset["measureValue"], "ro", label="Anomaly") +plt.legend() +plt.title("RPM Anomalies with Confidence Intervals") +plt.show() +``` + +If successful, your output will look like this: + +![Anomaly Detector Plot](https://github.com/MicrosoftDocs/azure-docs/raw/master/articles/cognitive-services/big-data/media/anomaly-output.png) + +## Next steps + +Learn how to do predictive maintenance at scale with Azure AI services, Azure Synapse Analytics, and Azure CosmosDB. For more information, see the full sample on [GitHub](https://github.com/Azure-Samples/cosmosdb-synapse-link-samples). 
diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.md new file mode 100644 index 0000000000..b4c241b316 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.md @@ -0,0 +1,505 @@ +--- +title: Quickstart - Isolation Forests +hide_title: true +status: stable +--- +# Multivariate Anomaly Detection with Isolation Forest +This article shows how you can use SynapseML on Apache Spark for multivariate anomaly detection. Multivariate anomaly detection allows for the detection of anomalies among many variables or time series, taking into account all the inter-correlations and dependencies between the different variables. In this scenario, we use SynapseML to train an Isolation Forest model for multivariate anomaly detection, and we then use the trained model to infer multivariate anomalies within a dataset containing synthetic measurements from three IoT sensors. + +To learn more about the Isolation Forest model, please refer to the original paper by [Liu _et al._](https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf?q=isolation-forest). + +## Prerequisites + - If running on Synapse, you'll need to [create an AML workspace and set up linked Service](../../Use%20with%20MLFlow/Overview.md) and add the following installation cell. + - If running on Fabric, you need to add the following installation cell and attach the notebook to a lakehouse. On the left side of your notebook, select Add to add an existing lakehouse or create a new one. 
+ + +```python +# %%configure -f +# { +# "name": "synapseml", +# "conf": { +# "spark.jars.packages": "com.microsoft.azure:synapseml_2.12:", +# "spark.jars.repositories": "https://mmlspark.azureedge.net/maven", +# "spark.jars.excludes": "org.scala-lang:scala-reflect,org.apache.spark:spark-tags_2.12,org.scalactic:scalactic_2.12,org.scalatest:scalatest_2.12,com.fasterxml.jackson.core:jackson-databind", +# "spark.yarn.user.classpath.first": "true", +# "spark.sql.parquet.enableVectorizedReader": "false" +# } +# } +``` + + +```python +%pip install sqlparse raiwidgets interpret-community mlflow==2.6.0 numpy==1.22.4 +``` + +## Library imports + + +```python +import uuid +import mlflow + +from pyspark.sql import functions as F +from pyspark.ml.feature import VectorAssembler +from pyspark.sql.types import * +from pyspark.ml import Pipeline + +from synapse.ml.isolationforest import * +from synapse.ml.explainers import * +from synapse.ml.core.platform import * +from synapse.ml.isolationforest import * +``` + + +```python +# %matplotlib inline +``` + +## Input data + + +```python +# Table inputs +timestampColumn = "timestamp" # str: the name of the timestamp column in the table +inputCols = [ + "sensor_1", + "sensor_2", + "sensor_3", +] # list(str): the names of the input variables + +# Training Start time, and number of days to use for training: +trainingStartTime = ( + "2022-02-24T06:00:00Z" # datetime: datetime for when to start the training +) +trainingEndTime = ( + "2022-03-08T23:55:00Z" # datetime: datetime for when to end the training +) +inferenceStartTime = ( + "2022-03-09T09:30:00Z" # datetime: datetime for when to start the training +) +inferenceEndTime = ( + "2022-03-20T23:55:00Z" # datetime: datetime for when to end the training +) + +# Isolation Forest parameters +contamination = 0.021 +num_estimators = 100 +max_samples = 256 +max_features = 1.0 + +# MLFlow experiment +artifact_path = "isolationforest" +model_name = f"isolation-forest-model" + +platform = 
current_platform() +experiment_name = { + "databricks": f"/Shared/isolation_forest_experiment-{str(uuid.uuid1())}/", + "synapse": f"isolation_forest_experiment-{str(uuid.uuid1())}", + "synapse_internal": f"isolation_forest_experiment-{str(uuid.uuid1())}", # Fabric +}.get(platform, f"isolation_forest_experiment") +``` + +## Read data + + +```python +df = ( + spark.read.format("csv") + .option("header", "true") + .load( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/generated_sample_mvad_data.csv" + ) +) +``` + +cast columns to appropriate data types + + +```python +df = ( + df.orderBy(timestampColumn) + .withColumn("timestamp", F.date_format(timestampColumn, "yyyy-MM-dd'T'HH:mm:ss'Z'")) + .withColumn("sensor_1", F.col("sensor_1").cast(DoubleType())) + .withColumn("sensor_2", F.col("sensor_2").cast(DoubleType())) + .withColumn("sensor_3", F.col("sensor_3").cast(DoubleType())) + .drop("_c5") +) + +display(df) +``` + +## Training data preparation + + +```python +# filter to data with timestamps within the training window +df_train = df.filter( + (F.col(timestampColumn) >= trainingStartTime) + & (F.col(timestampColumn) <= trainingEndTime) +) +display(df_train.limit(5)) +``` + +## Test data preparation + + +```python +# filter to data with timestamps within the inference window +df_test = df.filter( + (F.col(timestampColumn) >= inferenceStartTime) + & (F.col(timestampColumn) <= inferenceEndTime) +) +display(df_test.limit(5)) +``` + +## Train Isolation Forest model + + +```python +isolationForest = ( + IsolationForest() + .setNumEstimators(num_estimators) + .setBootstrap(False) + .setMaxSamples(max_samples) + .setMaxFeatures(max_features) + .setFeaturesCol("features") + .setPredictionCol("predictedLabel") + .setScoreCol("outlierScore") + .setContamination(contamination) + .setContaminationError(0.01 * contamination) + .setRandomSeed(1) +) +``` + +Next, we create an ML pipeline to train the Isolation Forest model. 
We also demonstrate how to create an MLFlow experiment and register the trained model. + +Note that MLFlow model registration is strictly only required if accessing the trained model at a later time. For training the model, and performing inferencing in the same notebook, the model object model is sufficient. + + +```python +if running_on_synapse(): + from synapse.ml.core.platform import find_secret + + tracking_url = find_secret( + secret_name="aml-mlflow-tracking-url", keyvault="mmlspark-build-keys" + ) # check link in prerequisites for more information on mlflow tracking url + mlflow.set_tracking_uri(tracking_url) +``` + + +```python +mlflow.set_experiment(experiment_name) +with mlflow.start_run() as run: + va = VectorAssembler(inputCols=inputCols, outputCol="features") + pipeline = Pipeline(stages=[va, isolationForest]) + model = pipeline.fit(df_train) + mlflow.spark.log_model( + model, artifact_path=artifact_path, registered_model_name=model_name + ) +``` + +## Perform inferencing + +Load the trained Isolation Forest Model + + +```python +# if running_on_databricks(): +# model_version = +# model_uri = f"models:/{model_name}/{model_version}" +# elif running_on_synapse_internal(): +# model_uri = "runs:/{run_id}/{artifact_path}".format( +# run_id=run.info.run_id, artifact_path=artifact_path +# ) +# model = mlflow.spark.load_model(model_uri) +``` + +Perform inferencing + + +```python +df_test_pred = model.transform(df_test) +display(df_test_pred.limit(5)) +``` + +## ML interpretability +In this section, we use ML interpretability tools to help unpack the contribution of each sensor to the detected anomalies at any point in time. + + +```python +# Here, we create a TabularSHAP explainer, set the input columns to all the features the model takes, specify the model and the target output column +# we are trying to explain. In this case, we are trying to explain the "outlierScore" output. 
+shap = TabularSHAP( + inputCols=inputCols, + outputCol="shapValues", + model=model, + targetCol="outlierScore", + backgroundData=F.broadcast(df_test.sample(0.02)), +) +``` + +Display the dataframe with `shapValues` column + + +```python +shap_df = shap.transform(df_test_pred) +``` + + +```python +# Define UDF +vec2array = F.udf(lambda vec: vec.toArray().tolist(), ArrayType(FloatType())) +``` + + +```python +# Here, we extract the SHAP values, the original features and the outlier score column. Then we convert it to a Pandas DataFrame for visualization. +# For each observation, the first element in the SHAP values vector is the base value (the mean output of the background dataset), +# and each of the following elements represents the SHAP values for each feature +shaps = ( + shap_df.withColumn("shapValues", vec2array(F.col("shapValues").getItem(0))) + .select( + ["shapValues", "outlierScore"] + inputCols + [timestampColumn, "predictedLabel"] + ) + .withColumn("sensor_1_localimp", F.col("shapValues")[1]) + .withColumn("sensor_2_localimp", F.col("shapValues")[2]) + .withColumn("sensor_3_localimp", F.col("shapValues")[3]) +) +``` + + +```python +shaps_local = shaps.toPandas() +shaps_local +``` + +Retrieve local feature importances + + +```python +local_importance_values = shaps_local[["shapValues"]] +eval_data = shaps_local[inputCols] +``` + + +```python +# Removing the first element in the list of local importance values (this is the base value or mean output of the background dataset) +list_local_importance_values = local_importance_values.values.tolist() +converted_importance_values = [] +bias = [] +for classarray in list_local_importance_values: + for rowarray in classarray: + converted_list = rowarray.tolist() + bias.append(converted_list[0]) + # remove the bias from local importance values + del converted_list[0] + converted_importance_values.append(converted_list) +``` + + +```python +from interpret_community.adapter import ExplanationAdapter + +adapter = 
ExplanationAdapter(inputCols, classification=False) +global_explanation = adapter.create_global( + converted_importance_values, eval_data, expected_values=bias +) +``` + + +```python +# view the global importance values +global_explanation.global_importance_values +``` + + +```python +# view the local importance values +global_explanation.local_importance_values +``` + + +```python +# Defining a wrapper class with predict method for creating the Explanation Dashboard + + +class wrapper(object): + def __init__(self, model): + self.model = model + + def predict(self, data): + sparkdata = spark.createDataFrame(data) + return ( + model.transform(sparkdata) + .select("outlierScore") + .toPandas() + .values.flatten() + .tolist() + ) +``` + +## Visualize results + +Visualize anomaly results and feature contribution scores (derived from local feature importance) + + +```python +import matplotlib.pyplot as plt + + +def visualize(rdf): + anoms = list(rdf["predictedLabel"] == 1) + + fig = plt.figure(figsize=(26, 12)) + + ax = fig.add_subplot(611) + ax.title.set_text(f"Multivariate Anomaly Detection Results") + ax.plot( + rdf[timestampColumn], + rdf["sensor_1"], + color="tab:orange", + linestyle="solid", + linewidth=2, + label="sensor_1", + ) + ax.grid(axis="y") + _, _, ymin, ymax = plt.axis() + ax.vlines( + rdf[timestampColumn][anoms], + ymin=ymin, + ymax=ymax, + color="tab:red", + alpha=0.2, + linewidth=6, + ) + ax.tick_params(axis="x", which="both", bottom=False, labelbottom=False) + ax.set_ylabel("sensor1_value") + ax.legend() + + ax = fig.add_subplot(612, sharex=ax) + ax.plot( + rdf[timestampColumn], + rdf["sensor_2"], + color="tab:green", + linestyle="solid", + linewidth=2, + label="sensor_2", + ) + ax.grid(axis="y") + _, _, ymin, ymax = plt.axis() + ax.vlines( + rdf[timestampColumn][anoms], + ymin=ymin, + ymax=ymax, + color="tab:red", + alpha=0.2, + linewidth=6, + ) + ax.tick_params(axis="x", which="both", bottom=False, labelbottom=False) + ax.set_ylabel("sensor2_value") + ax.legend() + + ax = 
fig.add_subplot(613, sharex=ax) + ax.plot( + rdf[timestampColumn], + rdf["sensor_3"], + color="tab:purple", + linestyle="solid", + linewidth=2, + label="sensor_3", + ) + ax.grid(axis="y") + _, _, ymin, ymax = plt.axis() + ax.vlines( + rdf[timestampColumn][anoms], + ymin=ymin, + ymax=ymax, + color="tab:red", + alpha=0.2, + linewidth=6, + ) + ax.tick_params(axis="x", which="both", bottom=False, labelbottom=False) + ax.set_ylabel("sensor3_value") + ax.legend() + + ax = fig.add_subplot(614, sharex=ax) + ax.tick_params(axis="x", which="both", bottom=False, labelbottom=False) + ax.plot( + rdf[timestampColumn], + rdf["outlierScore"], + color="black", + linestyle="solid", + linewidth=2, + label="Outlier score", + ) + ax.set_ylabel("outlier score") + ax.grid(axis="y") + ax.legend() + + ax = fig.add_subplot(615, sharex=ax) + ax.tick_params(axis="x", which="both", bottom=False, labelbottom=False) + ax.bar( + rdf[timestampColumn], + rdf["sensor_1_localimp"].abs(), + width=2, + color="tab:orange", + label="sensor_1", + ) + ax.bar( + rdf[timestampColumn], + rdf["sensor_2_localimp"].abs(), + width=2, + color="tab:green", + label="sensor_2", + bottom=rdf["sensor_1_localimp"].abs(), + ) + ax.bar( + rdf[timestampColumn], + rdf["sensor_3_localimp"].abs(), + width=2, + color="tab:purple", + label="sensor_3", + bottom=rdf["sensor_1_localimp"].abs() + rdf["sensor_2_localimp"].abs(), + ) + ax.set_ylabel("Contribution scores") + ax.grid(axis="y") + ax.legend() + + plt.show() +``` + + +```python +visualize(shaps_local) +``` + +When you run the cell above, you will see the following plots: + +![](https://mmlspark.blob.core.windows.net/graphics/notebooks/mvad_results_local_importances.jpg) + +- The first 3 plots above show the sensor time series data in the inference window, in orange, green, and purple. The red vertical lines show the detected anomalies (`predictedLabel` = 1). +- The fourth plot shows the outlierScore of all the points, with the `minOutlierScore` threshold shown by the dotted red horizontal line. 
+- The last plot shows the contribution scores of each sensor to the `outlierScore` for that point. + +Plot aggregate feature importance + + +```python +plt.figure(figsize=(10, 7)) +plt.bar(inputCols, global_explanation.global_importance_values) +plt.ylabel("global importance values") +``` + +When you run the cell above, you will see the following global feature importance plot: + +![](https://mmlspark.blob.core.windows.net/graphics/notebooks/global-feature-importance.jpg) + +Visualize the explanation in the ExplanationDashboard from https://github.com/microsoft/responsible-ai-widgets. + + +```python +# View the model explanation in the ExplanationDashboard +from raiwidgets import ExplanationDashboard + +ExplanationDashboard(global_explanation, wrapper(model), dataset=eval_data) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Overview.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Overview.md new file mode 100644 index 0000000000..2d5384d4de --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Overview.md @@ -0,0 +1,61 @@ +--- +title: Overview +hide_title: true +sidebar_label: Overview +--- + +## Causal Inference on Apache Spark + +### What is Causal Inference? +One challenge that has taken the spotlight in recent years is using machine learning to drive decision makings in policy and business. +Often, businesses and policymakers would like to study whether an incentive or intervention will lead to a desired outcome and by how much. +For example, if we give customers a discount (treatment), how much more will they purchase in the future (outcome). +Traditionally, people use correlation analysis or prediction model to understand correlated factors, but going from prediction to an +impactful decision isn't always straightforward as correlation doesn't imply causation. 
In many cases, confounding variables influence +both the probability of treatment and the outcome, introducing more non-causal correlation. + +Causal inference helps to bridge the gap between prediction and decision-making. + +### Causal Inference language +| Term | Example | +|-----------------|--------------------------------------------------------------------| +| Treatment (T) | Seeing an advertisement | +| Outcome (Y) | Probability of buying a specific new game | +| Confounders (W) | Current gaming habits, past purchases, customer location, platform | + +### Causal Inference and Double machine learning +The gold standard approach to isolating causal questions is to run an experiment that randomly assigns the treatment to some customers. +Randomization eliminates any relationship between the confounders and the probability of treatment, +so any differences between treated and untreated customers can only reflect the direct causal effect of the treatment on the outcome (treatment effect). +However, in many cases, treatment experiments are either impossible or cost-prohibitive. +As a result, we look toward causal inference methods that allow us to estimate the treatment effect using observational data. + +The SynapseML causal package implements a technique called "Double machine learning", which can be used to estimate the average treatment effect via machine learning models. +Unlike regression-based approaches that make strict parametric assumptions, this machine learning-based approach allows us to model non-linear relationships between the confounders, treatment, and outcome. 
+ +### Usage +In PySpark, you can run the `DoubleMLEstimator` via: + +```python +from pyspark.ml.classification import LogisticRegression +from synapse.ml.causal import DoubleMLEstimator +dml = (DoubleMLEstimator() + .setTreatmentCol("Treatment") + .setTreatmentModel(LogisticRegression()) + .setOutcomeCol("Outcome") + .setOutcomeModel(LogisticRegression()) + .setMaxIter(20)) +dmlModel = dml.fit(dataset) +``` +> Note: all columns except "Treatment" and "Outcome" in your dataset will be used as confounders. + +> Note: For discrete treatment, the treatment column must be `int` or `bool`. `0` and `False` will be treated as the control group. + +After fitting the model, you can get average treatment effect and confidence interval: +```python +dmlModel.getAvgTreatmentEffect() +dmlModel.getConfidenceInterval() +``` + +For an end to end application, check out the DoubleMLEstimator [notebook +example](../Quickstart%20-%20Measure%20Causal%20Effects). diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Quickstart - Measure Causal Effects.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Quickstart - Measure Causal Effects.md new file mode 100644 index 0000000000..306b76d1c9 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Quickstart - Measure Causal Effects.md @@ -0,0 +1,91 @@ +--- +title: Quickstart - Measure Causal Effects +hide_title: true +status: stable +--- +# Startup Investment Attribution - Understand Outreach Effort's Effect + +**This sample notebook aims to show the application of using SynapseML's DoubleMLEstimator for inferring causality using observational data.** + +A startup that sells software would like to know whether its outreach efforts were successful in attracting new customers or boosting consumption among existing customers. In other words, they would like to learn the treatment effect of each investment on customers' software usage. 
+ +In an ideal world, the startup would run several randomized experiments where each customer would receive a random assortment of investments. However, this can be logistically prohibitive or strategically unsound: the startup might not have the resources to design such experiments or they might not want to risk losing out on big opportunities due to lack of incentives. + +In this customer scenario walkthrough, we show how the SynapseML causal package can use historical investment data to learn the investment effect. + +## Background +In this scenario, a startup that sells software provides discount incentives to its customers. A customer might or might not be given a discount. + +The startup has historical data on these investments for 2,000 customers, as well as how much revenue these customers generated in the year after the investments were made. They would like to use this data to learn the optimal incentive policy for each existing or new customer in order to maximize the return on investment (ROI). + +The startup faces a challenge: the dataset is biased because historically the larger customers received the most incentives. Thus, they need a causal model that can remove the bias. + +## Data +The data* contains ~2,000 customers and is comprised of: + +* Customer features: details about the industry, size, revenue, and technology profile of each customer. +* Interventions: information about which incentive was given to a customer. +* Outcome: the amount of product the customer bought in the year after the incentives were given. 
+ + +| Feature Name | Type | Details | +|-----------------|------|---------------------------------------------------------------------------------------------------------------------------------------------| +| Global Flag | W | whether the customer has global offices | +| Major Flag | W | whether the customer is a large consumer in their industry (as opposed to SMC - Small Medium Corporation - or SMB - Small Medium Business) | +| SMC Flag | W | whether the customer is a Small Medium Corporation (SMC, as opposed to major and SMB) | +| Commercial Flag | W | whether the customer's business is commercial (as opposed to public sector) | +| IT Spend | W | $ spent on IT-related purchases | +| Employee Count | W | number of employees | +| PC Count | W | number of PCs used by the customer | +| Discount | T | whether the customer was given a discount (binary) | +| Revenue | Y | $ Revenue from customer given by the amount of software purchased | + + + +```python +# Import the sample multi-attribution data +data = ( + spark.read.format("csv") + .option("inferSchema", True) + .option("header", True) + .load( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/multi_attribution_sample.csv" + ) +) +``` + +## Get Causal Effects with SynapseML DoubleMLEstimator + + +```python +from synapse.ml.causal import * +from pyspark.ml.classification import LogisticRegression +from pyspark.ml.regression import LinearRegression + +treatmentColumn = "Discount" +outcomeColumn = "Revenue" + +dml = ( + DoubleMLEstimator() + .setTreatmentModel(LogisticRegression()) + .setTreatmentCol(treatmentColumn) + .setOutcomeModel(LinearRegression()) + .setOutcomeCol(outcomeColumn) + .setMaxIter(20) +) + +model = dml.fit(data) +``` + + +```python +# Get average treatment effect, it returns a numeric value, e.g. 
5166.78324 +# It means, on average, customers who received a discount spent $5,166 more on software +model.getAvgTreatmentEffect() +``` + + +```python +# Get treatment effect's confidence interval, e.g. [4765.826181160708, 5371.2817538168965] +model.getConfidenceInterval() +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Quickstart - Measure Heterogeneous Effects.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Quickstart - Measure Heterogeneous Effects.md new file mode 100644 index 0000000000..b10f9c3592 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Quickstart - Measure Heterogeneous Effects.md @@ -0,0 +1,143 @@ +--- +title: Quickstart - Measure Heterogeneous Effects +hide_title: true +status: stable +--- +# Startup Investment Attribution - Understand Outreach Effort's Effect + +**This sample notebook aims to show the application of using SynapseML's OrthoForestDMLEstimator for inferring causality using observational data.** + +A startup that sells software would like to know whether its outreach efforts were successful in attracting new customers or boosting consumption among existing customers. In other words, they would like to learn the treatment effect of each investment on customers' software usage. + +In an ideal world, the startup would run several randomized experiments where each customer would receive a random assortment of investments. However, this can be logistically prohibitive or strategically unsound: the startup might not have the resources to design such experiments or they might not want to risk losing out on big opportunities due to lack of incentives. + +In this customer scenario walkthrough, we show how the SynapseML causal package can use historical investment data to learn the investment effect. + +## Background +In this scenario, a startup that sells software provides discount incentives to its customers. 
A customer might or might not be given a discount. + +The startup has historical data on these investments for 2,000 customers, as well as how much revenue these customers generated in the year after the investments were made. They would like to use this data to learn the optimal incentive policy for each existing or new customer in order to maximize the return on investment (ROI). + +The startup faces a challenge: the dataset is biased because historically the larger customers received the most incentives. Thus, they need a causal model that can remove the bias. + +## Data +The data* contains ~2,000 customers and is comprised of: + +* Customer features: details about the industry, size, revenue, and technology profile of each customer. +* Interventions: information about which incentive was given to a customer. +* Outcome: the amount of product the customer bought in the year after the incentives were given. + + +| Feature Name | Type | Details | +|-----------------|------|---------------------------------------------------------------------------------------------------------------------------------------------| +| Global Flag | W | whether the customer has global offices | +| Major Flag | W | whether the customer is a large consumer in their industry (as opposed to SMC - Small Medium Corporation - or SMB - Small Medium Business) | +| SMC Flag | W | whether the customer is a Small Medium Corporation (SMC, as opposed to major and SMB) | +| Commercial Flag | W | whether the customer's business is commercial (as opposed to public sector) | +| IT Spend | W | dollar spent on IT-related purchases | +| Employee Count | W | number of employees | +| PC Count | W | number of PCs used by the customer | +| Size | X | customer's size given by their yearly total revenue | +| Discount | T | whether the customer was given a discount (binary) | +| Revenue | Y | $ Revenue from customer given by the amount of software purchased | + + + +```python +# Import the sample multi-attribution data +data 
= ( + spark.read.format("csv") + .option("inferSchema", True) + .option("header", True) + .load( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/multi_attribution_sample.csv" + ) +) +``` + +# Get Heterogenous Causal Effects with SynapseML OrthoDML Estimator + + +```python +data.columns +``` + + +```python +from synapse.ml.causal import * +from pyspark.ml import Pipeline +from synapse.ml.causal import * +from pyspark.ml.feature import VectorAssembler +from pyspark.sql.types import IntegerType, BooleanType, DateType, DoubleType +import matplotlib.pyplot as plt +``` + + +```python +treatmentColumn = "Discount" +outcomeColumn = "Revenue" +confounderColumns = [ + "Global Flag", + "Major Flag", + "SMC Flag", + "Commercial Flag", + "Employee Count", + "PC Count", +] +heteroColumns = ["Size", "IT Spend"] +heterogeneityVecCol = "XVec" +confounderVecCol = "XWVec" + +data = data.withColumn(treatmentColumn, data.Discount.cast(DoubleType())) + +heterogeneityVector = VectorAssembler( + inputCols=heteroColumns, outputCol=heterogeneityVecCol +) + +confounderVector = VectorAssembler( + inputCols=confounderColumns, outputCol=confounderVecCol +) + +pipeline = Pipeline(stages=[heterogeneityVector, confounderVector]) + +ppfit = pipeline.fit(data).transform(data) +``` + + +```python +### Create the Ortho Forest DML Estimator Model +mtTransform = ( + OrthoForestDMLEstimator() + .setNumTrees(100) + .setTreatmentCol(treatmentColumn) + .setOutcomeCol(outcomeColumn) + .setHeterogeneityVecCol(heterogeneityVecCol) + .setConfounderVecCol(confounderVecCol) + .setMaxDepth(10) + .setMinSamplesLeaf(10) +) +``` + + +```python +### Fit the model for the data +finalModel = mtTransform.fit(ppfit) +``` + + +```python +### Transform the input data to see the model in action +finalPred = finalModel.transform(ppfit) +``` + + +```python +### Get the data in Pandas +pd_final = finalPred.toPandas() +``` + + +```python +### Plot and see the non-linear effects +plt.scatter("Size", 
mtTransform.getOutputCol(), data=pd_final) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Quickstart - Synthetic difference in differences.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Quickstart - Synthetic difference in differences.md new file mode 100644 index 0000000000..1341bbdfc2 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Causal Inference/Quickstart - Synthetic difference in differences.md @@ -0,0 +1,231 @@ +--- +title: Quickstart - Synthetic difference in differences +hide_title: true +status: stable +--- +# Scalable Synthetic Difference in Differences + +This sample notebook aims to show readers how to use SynapseML's `DiffInDiffEstimator`, `SyntheticControlEstimator` and `SyntheticDiffInDiffEstimator` to estimate the causal effect of a treatment on a particular outcome. + +In this sample notebook, we will use the California smoking cessation program example to demonstrate usage of the SyntheticDiffInDiff Estimator. The goal of the analysis is to estimate the effect of increased cigarette taxes on smoking in California. + + +```python +from pyspark.sql.types import * +from synapse.ml.causal import ( + DiffInDiffEstimator, + SyntheticControlEstimator, + SyntheticDiffInDiffEstimator, +) +from matplotlib import pyplot as plt +from matplotlib import style +import pandas as pd +import numpy as np + +spark.sparkContext.setLogLevel("INFO") +style.use("ggplot") +``` + +We will select 5 columns from the dataset: state, year, cigsale, california, after_treatment. + + +```python +df = ( + spark.read.option("header", True) + .option("inferSchema", True) + .csv("wasbs://publicwasb@mmlspark.blob.core.windows.net/smoking.csv") + .select("state", "year", "cigsale", "california", "after_treatment") +) +display(df) +``` + +First, we use the `DiffInDiffEstimator` to estimate the causal effect with regular difference in differences method. 
We set the treatment indicator column to "california", set post-treatment indicator column to "after_treatment", and set the outcome column to "cigsale". + + +```python +estimator1 = DiffInDiffEstimator( + treatmentCol="california", postTreatmentCol="after_treatment", outcomeCol="cigsale" +) + +model1 = estimator1.fit(df) + +print("[Diff in Diff] treatment effect: {}".format(model1.treatmentEffect)) +print("[Diff in Diff] standard error: {}".format(model1.standardError)) +``` + +The treatment effect estimated by difference in differences should be -27.349. + +Next, we use `SyntheticControlEstimator` to synthesize a control unit and use the synthetic control to estimate the causal effect. To create the synthetic control unit, we need to set the column which indicates the time when each outcome is measured, and the column which indicates the unit for which the outcome is measured. + + +```python +estimator2 = SyntheticControlEstimator( + timeCol="year", + unitCol="state", + treatmentCol="california", + postTreatmentCol="after_treatment", + outcomeCol="cigsale", + maxIter=5000, + numIterNoChange=50, + tol=1e-4, + stepSize=1.0, +) + +model2 = estimator2.fit(df) + +print("[Synthetic Control] treatment effect: {}".format(model2.treatmentEffect)) +print("[Synthetic Control] standard error: {}".format(model2.standardError)) +``` + +The treatment effect estimated by synthetic control should be about -19.354. + +Internally, a constrained least square regression is used to solve the unit weights for the synthetic control, and we can plot the loss history. + + +```python +lossHistory = pd.Series(np.array(model2.lossHistoryUnitWeights)) + +plt.plot(lossHistory[2000:]) +plt.title("loss history - unit weights") +plt.xlabel("Iteration") +plt.ylabel("Loss") +plt.show() + +print("Mimimal loss: {}".format(lossHistory.min())) +``` + +We can also visualize the synthetic control and compare it with the treated unit. 
+ + +```python +sc_weights = model2.unitWeights.toPandas().set_index("state") +pdf = df.toPandas() +sc = ( + pdf.query("~california") + .pivot(index="year", columns="state", values="cigsale") + .dot(sc_weights) +) +plt.plot(sc, label="Synthetic Control") +plt.plot(sc.index, pdf.query("california")["cigsale"], label="California", color="C1") + +plt.title("Synthetic Control Estimation") +plt.ylabel("Cigarette Sales") +plt.vlines( + x=1988, + ymin=40, + ymax=140, + linestyle=":", + lw=2, + label="Proposition 99", + color="black", +) +plt.legend() +``` + +Lastly, we use `SyntheticDiffInDiffEstimator` to estimate the causal effect. + + +```python +estimator3 = SyntheticDiffInDiffEstimator( + timeCol="year", + unitCol="state", + treatmentCol="california", + postTreatmentCol="after_treatment", + outcomeCol="cigsale", + maxIter=5000, + numIterNoChange=50, + tol=1e-4, + stepSize=1.0, +) + +model3 = estimator3.fit(df) + +print("[Synthetic Diff in Diff] treatment effect: {}".format(model3.treatmentEffect)) +print("[Synthetic Diff in Diff] standard error: {}".format(model3.standardError)) +``` + +The treatment effect estimated by synthetic difference in differences should be about -15.554. + +Again, we can plot the loss history from the optimizer used to solve the unit weights and the time weights. + + +```python +lossHistory = pd.Series(np.array(model3.lossHistoryUnitWeights)) + +plt.plot(lossHistory[1000:]) +plt.title("loss history - unit weights") +plt.xlabel("Iteration") +plt.ylabel("Loss") +plt.show() + +print("Mimimal loss: {}".format(lossHistory.min())) +``` + + +```python +lossHistory = pd.Series(np.array(model3.lossHistoryTimeWeights)) + +plt.plot(lossHistory[1000:]) +plt.title("loss history - time weights") +plt.xlabel("Iteration") +plt.ylabel("Loss") +plt.show() + +print("Mimimal loss: {}".format(lossHistory.min())) +``` + +Here we plot the synthetic diff in diff estimate together with the time weights. 
+ + +```python +unit_weights = model3.unitWeights.toPandas().set_index("state") +unit_intercept = model3.unitIntercept + +time_weights = model3.timeWeights.toPandas().set_index("year") +time_intercept = model3.timeIntercept + +pdf = df.toPandas() +pivot_df_control = pdf.query("~california").pivot( + index="year", columns="state", values="cigsale" +) +pivot_df_treat = pdf.query("california").pivot( + index="year", columns="state", values="cigsale" +) +sc_did = pivot_df_control.values @ unit_weights.values +treated_mean = pivot_df_treat.mean(axis=1) +``` + + +```python +fig, (ax1, ax2) = plt.subplots( + 2, 1, figsize=(15, 8), sharex=True, gridspec_kw={"height_ratios": [3, 1]} +) +fig.suptitle("Synthetic Diff in Diff Estimation") + +ax1.plot( + pivot_df_control.mean(axis=1), lw=3, color="C1", ls="dashed", label="Control Avg." +) +ax1.plot(treated_mean, lw=3, color="C0", label="California") +ax1.plot( + pivot_df_control.index, + sc_did, + label="Synthetic Control (SDID)", + color="C1", + alpha=0.8, +) +ax1.set_ylabel("Cigarette Sales") +ax1.vlines( + 1989, + treated_mean.min(), + treated_mean.max(), + color="black", + ls="dotted", + label="Prop. 99", +) +ax1.legend() + +ax2.bar(time_weights.index, time_weights["value"], color="skyblue") +ax2.set_ylabel("Time Weights") +ax2.set_xlabel("Time") +ax2.vlines(1989, 0, 1, color="black", ls="dotted") +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - SparkML vs SynapseML.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - SparkML vs SynapseML.md new file mode 100644 index 0000000000..c7f9845616 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - SparkML vs SynapseML.md @@ -0,0 +1,216 @@ +--- +title: Quickstart - SparkML vs SynapseML +hide_title: true +status: stable +--- +# Classification - SparkML vs SynapseML + +


+ +In this article, you perform the same classification task in two +different ways: once using plain **`pyspark`** and once using the +**`synapseml`** library. The two methods yield the same performance, +but the comparison highlights the simplicity of using `synapseml` compared to `pyspark`. + +The task is to predict whether a customer's review of a book sold on +Amazon is good (rating > 3) or bad based on the text of the review. You +accomplish it by training LogisticRegression learners with different +hyperparameters and choosing the best model. + +## Setup +Import necessary Python libraries and get a Spark session. + +## Read the data + +Download and read in the data. + + +```python +rawData = spark.read.parquet( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet" +) +rawData.show(5) +``` + +## Extract features and process data + +Real data is more complex than the above dataset. It's common +for a dataset to have features of multiple types, such as text, numeric, and +categorical. To illustrate how difficult it is to work with these +datasets, add two numerical features to the dataset: the **word count** of the review and the **mean word length**. 
+ + +```python +from pyspark.sql.functions import udf +from pyspark.sql.types import * + + +def wordCount(s): + return len(s.split()) + + +def wordLength(s): + import numpy as np + + ss = [len(w) for w in s.split()] + return round(float(np.mean(ss)), 2) + + +wordLengthUDF = udf(wordLength, DoubleType()) +wordCountUDF = udf(wordCount, IntegerType()) +``` + + +```python +from synapse.ml.stages import UDFTransformer + +wordLength = "wordLength" +wordCount = "wordCount" +wordLengthTransformer = UDFTransformer( + inputCol="text", outputCol=wordLength, udf=wordLengthUDF +) +wordCountTransformer = UDFTransformer( + inputCol="text", outputCol=wordCount, udf=wordCountUDF +) +``` + + +```python +from pyspark.ml import Pipeline + +data = ( + Pipeline(stages=[wordLengthTransformer, wordCountTransformer]) + .fit(rawData) + .transform(rawData) + .withColumn("label", rawData["rating"] > 3) + .drop("rating") +) +``` + + +```python +data.show(5) +``` + +## Classify using pyspark + +To choose the best LogisticRegression classifier using the `pyspark` +library, we need to *explicitly* perform the following steps: + +1. Process the features: + - Tokenize the text column + - Hash the tokenized column into a vector using hashing + - Merge the numeric features with the vector +2. Process the label column: cast it into the proper type. +3. Train multiple LogisticRegression algorithms on the `train` dataset + with different hyperparameters +4. Compute the area under the ROC curve for each of the trained models + and select the model with the highest metric as computed on the + `test` dataset +5. 
Evaluate the best model on the `validation` set + + +```python +from pyspark.ml.feature import Tokenizer, HashingTF +from pyspark.ml.feature import VectorAssembler + +# Featurize text column +tokenizer = Tokenizer(inputCol="text", outputCol="tokenizedText") +numFeatures = 10000 +hashingScheme = HashingTF( + inputCol="tokenizedText", outputCol="TextFeatures", numFeatures=numFeatures +) +tokenizedData = tokenizer.transform(data) +featurizedData = hashingScheme.transform(tokenizedData) + +# Merge text and numeric features in one feature column +featureColumnsArray = ["TextFeatures", "wordCount", "wordLength"] +assembler = VectorAssembler(inputCols=featureColumnsArray, outputCol="features") +assembledData = assembler.transform(featurizedData) + +# Select only columns of interest +# Convert rating column from boolean to int +processedData = assembledData.select("label", "features").withColumn( + "label", assembledData.label.cast(IntegerType()) +) +``` + + +```python +from pyspark.ml.evaluation import BinaryClassificationEvaluator +from pyspark.ml.classification import LogisticRegression + +# Prepare data for learning +train, test, validation = processedData.randomSplit([0.60, 0.20, 0.20], seed=123) + +# Train the models on the 'train' data +lrHyperParams = [0.05, 0.1, 0.2, 0.4] +logisticRegressions = [ + LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams +] +evaluator = BinaryClassificationEvaluator( + rawPredictionCol="rawPrediction", metricName="areaUnderROC" +) +metrics = [] +models = [] + +# Select the best model +for learner in logisticRegressions: + model = learner.fit(train) + models.append(model) + scoredData = model.transform(test) + metrics.append(evaluator.evaluate(scoredData)) +bestMetric = max(metrics) +bestModel = models[metrics.index(bestMetric)] + +# Get AUC on the validation dataset +scoredVal = bestModel.transform(validation) +print(evaluator.evaluate(scoredVal)) +``` + +## Classify using SynapseML + +The steps needed with 
`synapseml` are simpler: + +1. The **`TrainClassifier`** Estimator featurizes the data internally, + as long as the columns selected in the `train`, `test`, `validation` + dataset represent the features + +2. The **`FindBestModel`** Estimator finds the best model from a pool of + trained models by finding the model that performs best on the `test` + dataset given the specified metric + +3. The **`ComputeModelStatistics`** Transformer computes the different + metrics on a scored dataset (in our case, the `validation` dataset) + at the same time + + +```python +from synapse.ml.train import TrainClassifier, ComputeModelStatistics +from synapse.ml.automl import FindBestModel + +# Prepare data for learning +train, test, validation = data.randomSplit([0.60, 0.20, 0.20], seed=123) + +# Train the models on the 'train' data +lrHyperParams = [0.05, 0.1, 0.2, 0.4] +logisticRegressions = [ + LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams +] +lrmodels = [ + TrainClassifier(model=lrm, labelCol="label", numFeatures=10000).fit(train) + for lrm in logisticRegressions +] + +# Select the best model +bestModel = FindBestModel(evaluationMetric="AUC", models=lrmodels).fit(test) + + +# Get AUC on the validation dataset +predictions = bestModel.transform(validation) +metrics = ComputeModelStatistics().transform(predictions) +print( + "Best model's AUC on validation set = " + + "{0:.2f}%".format(metrics.first()["AUC"] * 100) +) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - Train Classifier.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - Train Classifier.md new file mode 100644 index 0000000000..d598de664f --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - Train Classifier.md @@ -0,0 +1,55 @@ +--- +title: Quickstart - Train Classifier +hide_title: true +status: stable +--- +## Classification - Adult Census + +In this 
example, we try to predict incomes from the *Adult Census* dataset. + +First, we import the packages (use `help(synapse)` to view contents). + +Now let's read the data and split it into train and test sets: + + +```python +data = spark.read.parquet( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet" +) +data = data.select(["education", "marital-status", "hours-per-week", "income"]) +train, test = data.randomSplit([0.75, 0.25], seed=123) +train.limit(10).toPandas() +``` + +`TrainClassifier` can be used to initialize and fit a model; it wraps SparkML classifiers. +You can use `help(synapse.ml.train.TrainClassifier)` to view the different parameters. + +Note that it implicitly converts the data into the format expected by the algorithm: it tokenizes +and hashes strings, one-hot encodes categorical variables, assembles the features into a vector, +and so on. The parameter `numFeatures` controls the number of hashed features. + + +```python +from synapse.ml.train import TrainClassifier +from pyspark.ml.classification import LogisticRegression + +model = TrainClassifier( + model=LogisticRegression(), labelCol="income", numFeatures=256 +).fit(train) +``` + +Finally, we save the model so it can be used in a scoring program. 
+ + +```python +from synapse.ml.core.platform import * + +if running_on_synapse() or running_on_synapse_internal(): + model.write().overwrite().save("Files/models/AdultCensus.mml") +elif running_on_databricks(): + model.write().overwrite().save("dbfs:/AdultCensus.mml") +elif running_on_binder(): + model.write().overwrite().save("/tmp/AdultCensus.mml") +else: + print(f"{current_platform()} platform not supported") +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Tabular Data.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Tabular Data.md new file mode 100644 index 0000000000..07d35d1984 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Tabular Data.md @@ -0,0 +1,82 @@ +--- +title: Quickstart - Vowpal Wabbit on Tabular Data +hide_title: true +status: stable +--- +# Classification - Adult Census using Vowpal Wabbit in SynapseML + +In this example, we predict incomes from the *Adult Census* dataset using Vowpal Wabbit (VW) classifier in SynapseML. +First, we read the data and split it into train and test sets as in this [example](https://github.com/Microsoft/SynapseML/blob/master/notebooks/Classification%20-%20Adult%20Census.ipynb +). + + +```python +data = spark.read.parquet( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet" +) +data = data.select(["education", "marital-status", "hours-per-week", "income"]) +train, test = data.randomSplit([0.75, 0.25], seed=123) +train.limit(10).toPandas() +``` + +Next, we define a pipeline that includes feature engineering and training of a VW classifier. We use a featurizer provided by VW that hashes the feature names. +Note that VW expects classification labels being -1 or 1. Thus, the income category is mapped to this space before feeding training data into the pipeline. 
+ + +```python +from pyspark.sql.functions import when, col +from pyspark.ml import Pipeline +from synapse.ml.vw import VowpalWabbitFeaturizer, VowpalWabbitClassifier + +# Define classification label +train = ( + train.withColumn("label", when(col("income").contains("<"), 0.0).otherwise(1.0)) + .repartition(1) + .cache() +) +print(train.count()) + +# Specify featurizer +vw_featurizer = VowpalWabbitFeaturizer( + inputCols=["education", "marital-status", "hours-per-week"], outputCol="features" +) + +# Define VW classification model +args = "--loss_function=logistic --quiet --holdout_off" +vw_model = VowpalWabbitClassifier( + featuresCol="features", labelCol="label", passThroughArgs=args, numPasses=10 +) + +# Create a pipeline +vw_pipeline = Pipeline(stages=[vw_featurizer, vw_model]) +``` + +Then, we are ready to train the model by fitting the pipeline with the training data. + + +```python +# Train the model +vw_trained = vw_pipeline.fit(train) +``` + +After the model is trained, we apply it to predict the income of each sample in the test set. + + +```python +# Making predictions +test = test.withColumn("label", when(col("income").contains("<"), 0.0).otherwise(1.0)) +prediction = vw_trained.transform(test) +prediction.limit(10).toPandas() +``` + +Finally, we evaluate the model performance using `ComputeModelStatistics` function which will compute confusion matrix, accuracy, precision, recall, and AUC by default for classification models. 
+ + +```python +from synapse.ml.train import ComputeModelStatistics + +metrics = ComputeModelStatistics( + evaluationMetric="classification", labelCol="label", scoredLabelsCol="prediction" +).transform(prediction) +metrics.toPandas() +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Text Data.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Text Data.md new file mode 100644 index 0000000000..6671a1b347 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Classification/Quickstart - Vowpal Wabbit on Text Data.md @@ -0,0 +1,213 @@ +--- +title: Quickstart - Vowpal Wabbit on Text Data +hide_title: true +status: stable +--- +# Twitter Sentiment Classification using Vowpal Wabbit in SynapseML + +In this example, we show how to build a sentiment classification model using Vowpal Wabbit (VW) in SynapseML. The data set we use to train and evaluate the model is [Sentiment140](http://help.sentiment140.com/for-students/?source=post_page---------------------------) twitter data. First, we import a few packages that we need. 
+ + +```python +import os +import urllib.request +import pandas as pd +from zipfile import ZipFile +from pyspark.sql.functions import udf, rand, when, col +from pyspark.ml import Pipeline +from pyspark.ml.feature import CountVectorizer, RegexTokenizer +from synapse.ml.vw import VowpalWabbitClassifier +from synapse.ml.train import ComputeModelStatistics +from pyspark.mllib.evaluation import BinaryClassificationMetrics +import matplotlib.pyplot as plt +``` + + +```python +# URL to download the sentiment140 dataset and data file names +DATA_URL = "https://mmlspark.blob.core.windows.net/publicwasb/twittersentimenttrainingandtestdata.zip" +TRAIN_FILENAME = "training.1600000.processed.noemoticon.csv" +TEST_FILENAME = "testdata.manual.2009.06.14.csv" +# Folder for storing the downloaded data +DATA_FOLDER = "data" +# Data column names +COL_NAMES = ["label", "id", "date", "query_string", "user", "text"] +# Text encoding type of the data +ENCODING = "iso-8859-1" +``` + +## Data Preparation + +We use [Sentiment140](http://help.sentiment140.com/for-students/?source=post_page---------------------------) twitter data which originated from a Stanford research project to train and evaluate VW classification model on Spark. The same dataset has been used in a previous [Azure Machine Learning sample](https://github.com/Azure-Samples/MachineLearningSamples-TwitterSentimentPrediction) on twitter sentiment prediction. Before using the data to build the classification model, we first download and clean up the data. 
+ + +```python +def download_data(url, data_folder=DATA_FOLDER, filename="downloaded_data.zip"): + """Download and extract data from url""" + + data_dir = "./" + DATA_FOLDER + if not os.path.exists(data_dir): + os.makedirs(data_dir) + downloaded_filepath = os.path.join(data_dir, filename) + print("Downloading data...") + urllib.request.urlretrieve(url, downloaded_filepath) + print("Extracting data...") + zipfile = ZipFile(downloaded_filepath) + zipfile.extractall(data_dir) + zipfile.close() + print("Finished data downloading and extraction.") + + +download_data(DATA_URL) +``` + +Let's read the training data into a Spark DataFrame. + + +```python +df_train = pd.read_csv( + os.path.join(".", DATA_FOLDER, TRAIN_FILENAME), + header=None, + names=COL_NAMES, + encoding=ENCODING, +) +df_train = spark.createDataFrame(df_train, verifySchema=False) +``` + +We can take a look at the training data and check how many samples it has. We should see that there are 1.6 million samples in the training data. There are 6 fields in the training data: +* label: the sentiment of the tweet (0.0 = negative, 2.0 = neutral, 4.0 = positive) +* id: the id of the tweet +* date: the date of the tweet +* query_string: The query used to extract the data. If there is no query, then this value is NO_QUERY. +* user: the user that tweeted +* text: the text of the tweet + + +```python +df_train.limit(10).toPandas() +``` + + +```python +print("Number of training samples: ", df_train.count()) +``` + +Before training the model, we randomly permute the data to mix negative and positive samples. This is helpful for properly training online learning algorithms like VW. To speed up model training, we use a subset of the data to train the model. If training with the full training set, typically you will see better performance of the model on the test set. 
+ + +```python +df_train = ( + df_train.orderBy(rand()) + .limit(100000) + .withColumn("label", when(col("label") > 0, 1.0).otherwise(0.0)) + .select(["label", "text"]) +) +``` + +## VW SynapseML Training + +Now we are ready to define a pipeline which consists of feature engineering steps and the VW model. + + +```python +# Specify featurizers +tokenizer = RegexTokenizer(inputCol="text", outputCol="words") + +count_vectorizer = CountVectorizer(inputCol="words", outputCol="features") + +# Define VW classification model +args = "--loss_function=logistic --quiet --holdout_off" +vw_model = VowpalWabbitClassifier( + featuresCol="features", labelCol="label", passThroughArgs=args, numPasses=10 +) + +# Create a pipeline +vw_pipeline = Pipeline(stages=[tokenizer, count_vectorizer, vw_model]) +``` + +With the prepared training data, we can fit the model pipeline as follows. + + +```python +vw_trained = vw_pipeline.fit(df_train) +``` + +## Model Performance Evaluation + +After training the model, we evaluate the performance of the model using the test set which is manually labeled. + + +```python +df_test = pd.read_csv( + os.path.join(".", DATA_FOLDER, TEST_FILENAME), + header=None, + names=COL_NAMES, + encoding=ENCODING, +) +df_test = spark.createDataFrame(df_test, verifySchema=False) +``` + +We only use positive and negative tweets in the test set to evaluate the model, since our model is a binary classification model trained with only positive and negative tweets. 
+ + +```python +print("Number of test samples before filtering: ", df_test.count()) +df_test = ( + df_test.filter(col("label") != 2.0) + .withColumn("label", when(col("label") > 0, 1.0).otherwise(0.0)) + .select(["label", "text"]) +) +print("Number of test samples after filtering: ", df_test.count()) +``` + + +```python +# Make predictions +predictions = vw_trained.transform(df_test) +predictions.limit(10).toPandas() +``` + + +```python +# Compute model performance metrics +metrics = ComputeModelStatistics( + evaluationMetric="classification", labelCol="label", scoredLabelsCol="prediction" +).transform(predictions) +metrics.toPandas() +``` + + +```python +# Utility class for plotting ROC curve (https://stackoverflow.com/questions/52847408/pyspark-extract-roc-curve) +class CurveMetrics(BinaryClassificationMetrics): + def __init__(self, *args): + super(CurveMetrics, self).__init__(*args) + + def get_curve(self, method): + rdd = getattr(self._java_model, method)().toJavaRDD() + points = [] + for row in rdd.collect(): + points += [(float(row._1()), float(row._2()))] + return points + + +preds = predictions.select("label", "probability").rdd.map( + lambda row: (float(row["probability"][1]), float(row["label"])) +) +roc_points = CurveMetrics(preds).get_curve("roc") + +# Plot ROC curve +fig = plt.figure() +x_val = [x[0] for x in roc_points] +y_val = [x[1] for x in roc_points] +plt.title("ROC curve on test set") +plt.xlabel("False positive rate") +plt.ylabel("True positive rate") +plt.plot(x_val, y_val) +# Use display() if you're on Azure Databricks or you can do plt.show() +plt.show() +``` + +You should see an ROC curve like the following after the above cell is executed. 
+ + diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Distributed Training.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Distributed Training.md new file mode 100644 index 0000000000..8b9fd9d2ea --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Distributed Training.md @@ -0,0 +1,76 @@ +--- +title: Distributed Training +sidebar_label: Distributed Training +--- + +### Why Simple Deep Learning +Creating a Spark-compatible deep learning system can be challenging for users who may not have a +thorough understanding of deep learning and distributed systems. Additionally, writing custom deep learning +scripts may be a cumbersome and time-consuming task. +SynapseML aims to simplify this process by building on top of the [Horovod](https://github.com/horovod/horovod) Estimator, a general-purpose +distributed deep learning model that is compatible with SparkML, and [Pytorch-lightning](https://github.com/Lightning-AI/lightning), +a lightweight wrapper around the popular PyTorch deep learning framework. + +SynapseML's simple deep learning toolkit makes it easy to use modern deep learning methods in Apache Spark. +By providing a collection of Estimators, SynapseML enables users to perform distributed transfer learning on +spark clusters to solve custom machine learning tasks without requiring in-depth domain expertise. +Whether you're a data scientist, data engineer, or business analyst this project aims to make modern deep-learning methods easy to use for new domain-specific problems. + +### SynapseML's Simple DNN +SynapseML goes beyond the limited support for deep networks in SparkML and provides out-of-the-box solutions for various common scenarios: +- Visual Classification: Users can apply transfer learning for image classification tasks, using pretrained models and fine-tuning them to solve custom classification problems. 
+- Text Classification: SynapseML simplifies the process of implementing natural language processing tasks such as sentiment analysis, text classification, and language modeling by providing prebuilt models and tools. +- And more coming soon + +### Why Horovod +Horovod is a distributed deep learning framework developed by Uber, which has become popular for its ability to scale +deep learning tasks across multiple GPUs and compute nodes efficiently. It's designed to work with TensorFlow, Keras, PyTorch, and Apache MXNet. +- Scalability: Horovod uses efficient communication algorithms like ring-allreduce and hierarchical all reduce, which allow it to scale the training process across multiple GPUs and nodes without significant performance degradation. +- Easy Integration: Horovod can be easily integrated into existing deep learning codebases with minimal changes, making it a popular choice for distributed training. +- Fault Tolerance: Horovod provides fault tolerance features like elastic training. It can dynamically adapt to changes in the number of workers or recover from failures. +- Community Support: Horovod has an active community and is widely used in the industry, which ensures that the framework is continually updated and improved. + +### Why Pytorch Lightning +PyTorch Lightning is a lightweight wrapper around the popular PyTorch deep learning framework, designed to make it +easier to write clean, modular, and scalable deep learning code. PyTorch Lightning has several advantages that +make it an excellent choice for SynapseML's Simple Deep Learning: +- Code Organization: PyTorch Lightning promotes a clean and organized code structure by separating the research code from the engineering code. This property makes it easier to maintain, debug, and share deep learning models. +- Flexibility: PyTorch Lightning retains the flexibility and expressiveness of PyTorch while adding useful abstractions to simplify the training loop and other boilerplate code. 
+- Built-in Best Practices: PyTorch Lightning incorporates many best practices for deep learning, such as automatic optimization, gradient clipping, and learning rate scheduling, making it easier for users to achieve optimal performance. +- Compatibility: PyTorch Lightning is compatible with a wide range of popular tools and frameworks, including Horovod, which allows users to easily use distributed training capabilities. +- Rapid Development: With PyTorch Lightning, users can quickly experiment with different model architectures and training strategies without worrying about low-level implementation details. + +### Sample usage with DeepVisionClassifier +DeepVisionClassifier incorporates all models supported by [torchvision](https://github.com/pytorch/vision). +:::note +The current version is based on pytorch_lightning v1.5.0 and torchvision v0.12.0 +::: +By providing a Spark DataFrame that contains an 'imageCol' and 'labelCol', you could directly apply the 'fit' function +on it with DeepVisionClassifier. +```python +train_df = spark.createDataFrame([ + ("PATH_TO_IMAGE_1.jpg", 1), + ("PATH_TO_IMAGE_2.jpg", 2) +], ["image", "label"]) + +deep_vision_classifier = DeepVisionClassifier( + backbone="resnet50", # Put your backbone here + store=store, # Corresponding store + callbacks=callbacks, # Optional callbacks + num_classes=17, + batch_size=16, + epochs=epochs, + validation=0.1, +) + +deep_vision_model = deep_vision_classifier.fit(train_df) +``` +DeepVisionClassifier does distributed-training on spark with Horovod under the hood, after this fitting process it returns +a DeepVisionModel. 
With this code you could use the model for inference directly: +```python +pred_df = deep_vision_model.transform(test_df) +``` + +## Examples +- [Quickstart - Fine-tune a Text Classifier](../Quickstart%20-%20Fine-tune%20a%20Text%20Classifier) +- [Quickstart - Fine-tune a Vision Classifier](../Quickstart%20-%20Fine-tune%20a%20Vision%20Classifier) diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Getting Started.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Getting Started.md new file mode 100644 index 0000000000..3ae4371d32 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Getting Started.md @@ -0,0 +1,42 @@ +--- +title: Getting Started +sidebar_label: Getting Started +--- + +:::note +This is a sample with databricks 10.4.x-gpu-ml-scala2.12 runtime +::: + +## 1. Reinstall horovod using our prepared script + +We build on top of torchvision, horovod and pytorch_lightning, so we need to reinstall horovod by building on specific versions of those packages. +Download our [horovod installation script](https://mmlspark.blob.core.windows.net/publicwasb/horovod_installation.sh) and upload +it to databricks dbfs. + +Add the path of this script to `Init Scripts` section when configuring the spark cluster. +Restarting the cluster automatically installs horovod v0.25.0 with pytorch_lightning v1.5.0 and torchvision v0.12.0. + +## 2. Install SynapseML Deep Learning Component + +You could install the single synapseml-deep-learning wheel package to get the full functionality of deep vision classification. 
+Run the following command: +```powershell +pip install synapseml==1.0.7 +``` + +An alternative is installing the SynapseML jar package in library management section, by adding: +``` +Coordinate: com.microsoft.azure:synapseml_2.12:1.0.7 +Repository: https://mmlspark.azureedge.net/maven +``` +:::note +If you install the jar package, follow the first two cells of this [sample](../Quickstart%20-%20Fine-tune%20a%20Vision%20Classifier#environment-setup----reinstall-horovod-based-on-new-version-of-pytorch) +to ensure horovod recognizes SynapseML. +::: + +## 3. Try our sample notebook + +You could follow the rest of this [sample](../Quickstart%20-%20Fine-tune%20a%20Vision%20Classifier) and have a try on your own dataset. + +Supported models (`backbone` parameter for `DeepVisionClassifier`) should be string format of [Torchvision-supported models](https://github.com/pytorch/vision/blob/v0.12.0/torchvision/models/__init__.py); +You could also check by running `backbone in torchvision.models.__dict__`. diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/ONNX.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/ONNX.md new file mode 100644 index 0000000000..5d45e38679 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/ONNX.md @@ -0,0 +1,108 @@ +--- +title: ONNX +hide_title: true +sidebar_label: ONNX +description: Learn how to use the ONNX model transformer to run inference for an ONNX model on Spark. +--- + +# ONNX model inferencing on Spark + +## ONNX + +[ONNX](https://onnx.ai/) is an open format to represent both deep learning and traditional machine learning models. With ONNX, AI developers can more easily move models between state-of-the-art tools and choose the combination that is best for them. + +SynapseML now includes a Spark transformer to bring a trained ONNX model to Apache Spark, so you can run inference on your data with Spark's large-scale data processing power. 
+ +## ONNXHub +Although you can use your own local model, many popular existing models are provided through the ONNXHub. You can use +a model's ONNXHub name (for example "MNIST") to download the bytes of the model and some metadata about the model. You can also list +available models, optionally filtering by name or tags. + +```scala + // List models + val hub = new ONNXHub() + val models = hub.listModels(model = Some("mnist"), tags = Some(Seq("vision"))) + + // Retrieve and transform with a model + val info = hub.getModelInfo("resnet50") + val bytes = hub.load("resnet50") + val model = new ONNXModel() + .setModelPayload(bytes) + .setFeedDict(Map("data" -> "features")) + .setFetchDict(Map("rawPrediction" -> "resnetv24_dense0_fwd")) + .setSoftMaxDict(Map("rawPrediction" -> "probability")) + .setArgMaxDict(Map("rawPrediction" -> "prediction")) + .setMiniBatchSize(1) + + val (probability, _) = model.transform({YOUR_DATAFRAME}) + .select("probability", "prediction") + .as[(Vector, Double)] + .head +``` + +## Usage + +1. Create a `com.microsoft.azure.synapse.ml.onnx.ONNXModel` object and use `setModelLocation` or `setModelPayload` to load the ONNX model. + + For example: + + ```scala + val onnx = new ONNXModel().setModelLocation("/path/to/model.onnx") + ``` + + Optionally, create the model from the ONNXHub. + + ```scala + val onnx = new ONNXModel().setModelPayload(hub.load("MNIST")) + ``` +2. Use an ONNX visualization tool (for example, [Netron](https://netron.app/)) to inspect the ONNX model's input and output nodes. + + ![Screenshot that illustrates an ONNX model's input and output nodes](https://mmlspark.blob.core.windows.net/graphics/ONNXModelInputsOutputs.png) + +3. Set the parameters properly to the `ONNXModel` object. + + The `com.microsoft.azure.synapse.ml.onnx.ONNXModel` class provides a set of parameters to control the behavior of the inference. 
+ + | Parameter | Description | Default Value | + |:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------| + | feedDict | Map the ONNX model's expected input node names to the input DataFrame's column names. Make sure the input DataFrame's column schema matches with the corresponding input's shape of the ONNX model. For example, an image classification model may have an input node of shape `[1, 3, 224, 224]` with type Float. It's assumed that the first dimension (1) is the batch size. Then the input DataFrame's corresponding column's type should be `ArrayType(ArrayType(ArrayType(FloatType)))`. | None | + | fetchDict | Map the output DataFrame's column names to the ONNX model's output node names. NOTE: If you put outputs that are intermediate in the model, transform will automatically slice at those outputs. See the section on [Slicing](#slicing). | None | + | miniBatcher | Specify the MiniBatcher to use. 
| `FixedMiniBatchTransformer` with batch size 10 | + | softMaxDict | A map between output DataFrame columns, where the value column will be computed from taking the softmax of the key column. If the 'rawPrediction' column contains logits outputs, then one can set softMaxDict to `Map("rawPrediction" -> "probability")` to obtain the probability outputs. | None | + | argMaxDict | A map between output DataFrame columns, where the value column will be computed from taking the argmax of the key column. This parameter can be used to convert probability or logits output to the predicted label. | None | + | deviceType | Specify a device type the model inference runs on. Supported types are: CPU or CUDA. If not specified, auto detection will be used. | None | + | optimizationLevel | Specify the [optimization level](https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html#graph-optimization-levels) for the ONNX graph optimizations. Supported values are: `NO_OPT`, `BASIC_OPT`, `EXTENDED_OPT`, `ALL_OPT`. | `ALL_OPT` | + +4. Call `transform` method to run inference on the input DataFrame. + +## Model Slicing +By default, an ONNX model is treated as a black box with inputs and outputs. +If you want to use intermediate nodes of a model, you can slice the model at particular nodes. Slicing will create a new model, +keeping only parts of the model that are needed for those nodes. This new model's outputs will be the outputs from +the intermediate nodes. You can save the sliced model and use it to transform just like any other ONNXModel. + +This slicing feature is used implicitly by the ImageFeaturizer, which uses ONNX models. The OnnxHub manifest entry for each model +includes which intermediate node outputs should be used for featurization, so the ImageFeaturizer will automatically slice at the correct nodes. + +The below example shows how to perform the slicing manually with a direct ONNXModel. 
+ +```scala + // create a df: DataFrame with image data + val hub = new ONNXHub() + val info = hub.getModelInfo("resnet50") + val bytes = hub.load("resnet50") + val intermediateOutputName = "resnetv24_pool1_fwd" + val slicedModel = new ONNXModel() + .setModelPayload(bytes) + .setFeedDict(Map("data" -> "features")) + .setFetchDict(Map("rawFeatures" -> intermediateOutputName)) // automatic slicing based on fetch dictionary + // -- or -- + // .sliceAtOutput(intermediateOutputName) // manual slicing + + val slicedModelDf = slicedModel.transform(df) +``` + +## Example + +- [Image Explainers](../../Responsible%20AI/Image%20Explainers) +- [Quickstart - ONNX Model Inference](../Quickstart%20-%20ONNX%20Model%20Inference) diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Text Classifier.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Text Classifier.md new file mode 100644 index 0000000000..446a037fbd --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Text Classifier.md @@ -0,0 +1,105 @@ +--- +title: Quickstart - Fine-tune a Text Classifier +hide_title: true +status: stable +--- +## Deep Learning - Deep Text Classifier + +### Environment Setup on databricks + + +```python +# install cloudpickle 2.0.0 to add synapse module for usage of horovod +%pip install cloudpickle==2.0.0 --force-reinstall --no-deps +``` + + +```python +import synapse +import cloudpickle + +cloudpickle.register_pickle_by_value(synapse) +``` + + +```python +! 
horovodrun --check-build +``` + +### Read Dataset + + +```python +import urllib + +urllib.request.urlretrieve( + "https://mmlspark.blob.core.windows.net/publicwasb/text_classification/Emotion_classification.csv", + "/tmp/Emotion_classification.csv", +) + +import pandas as pd +from pyspark.ml.feature import StringIndexer + +df = pd.read_csv("/tmp/Emotion_classification.csv") +df = spark.createDataFrame(df) + +indexer = StringIndexer(inputCol="Emotion", outputCol="label") +indexer_model = indexer.fit(df) +df = indexer_model.transform(df).drop(("Emotion")) + +train_df, test_df = df.randomSplit([0.85, 0.15], seed=1) +display(train_df) +``` + +### Training + + +```python +from horovod.spark.common.store import DBFSLocalStore +from pytorch_lightning.callbacks import ModelCheckpoint +from synapse.ml.dl import * +import uuid + +checkpoint = "bert-base-uncased" +run_output_dir = f"/dbfs/FileStore/test/{checkpoint}/{str(uuid.uuid4())[:8]}" +store = DBFSLocalStore(run_output_dir) + +epochs = 1 + +callbacks = [ModelCheckpoint(filename="{epoch}-{train_loss:.2f}")] +``` + + +```python +deep_text_classifier = DeepTextClassifier( + checkpoint=checkpoint, + store=store, + callbacks=callbacks, + num_classes=6, + batch_size=16, + epochs=epochs, + validation=0.1, + text_col="Text", +) + +deep_text_model = deep_text_classifier.fit(train_df.limit(6000).repartition(50)) +``` + +### Prediction + + +```python +from pyspark.ml.evaluation import MulticlassClassificationEvaluator + +pred_df = deep_text_model.transform(test_df.limit(500)) +evaluator = MulticlassClassificationEvaluator( + predictionCol="prediction", labelCol="label", metricName="accuracy" +) +print("Test accuracy:", evaluator.evaluate(pred_df)) +``` + + +```python +# Cleanup the output dir for test +dbutils.fs.rm(run_output_dir, True) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Vision Classifier.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep 
Learning/Quickstart - Fine-tune a Vision Classifier.md new file mode 100644 index 0000000000..f4710c5a68 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Vision Classifier.md @@ -0,0 +1,139 @@ +--- +title: Quickstart - Fine-tune a Vision Classifier +hide_title: true +status: stable +--- +## Deep Learning - Deep Vision Classifier + +### Environment Setup on databricks +### -- reinstall horovod based on new version of pytorch + + +```python +# install cloudpickle 2.0.0 to add synapse module for usage of horovod +%pip install cloudpickle==2.0.0 --force-reinstall --no-deps +``` + + +```python +import synapse +import cloudpickle +import os +import urllib.request +import zipfile + +cloudpickle.register_pickle_by_value(synapse) +``` + + +```python +! horovodrun --check-build +``` + + +```python +from pyspark.sql.functions import udf, col, regexp_replace +from pyspark.sql.types import IntegerType +from pyspark.ml.evaluation import MulticlassClassificationEvaluator +``` + +### Read Dataset + + +```python +folder_path = "/tmp/flowers_prepped" +zip_url = "https://mmlspark.blob.core.windows.net/datasets/Flowers/flowers_prepped.zip" +zip_path = "/dbfs/tmp/flowers_prepped.zip" + +if not os.path.exists("/dbfs" + folder_path): + urllib.request.urlretrieve(zip_url, zip_path) + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall("/dbfs/tmp") + os.remove(zip_path) +``` + + +```python +def assign_label(path): + num = int(path.split("/")[-1].split(".")[0].split("_")[1]) + return num // 81 + + +assign_label_udf = udf(assign_label, IntegerType()) +``` + + +```python +# These files are already uploaded for build test machine +train_df = ( + spark.read.format("binaryFile") + .option("pathGlobFilter", "*.jpg") + .load(folder_path + "/train") + .withColumn("image", regexp_replace("path", "dbfs:", "/dbfs")) + .withColumn("label", assign_label_udf(col("path"))) + .select("image", "label") +) + 
+display(train_df.limit(100)) +``` + + +```python +test_df = ( + spark.read.format("binaryFile") + .option("pathGlobFilter", "*.jpg") + .load(folder_path + "/test") + .withColumn("image", regexp_replace("path", "dbfs:", "/dbfs")) + .withColumn("label", assign_label_udf(col("path"))) + .select("image", "label") +) +``` + +### Training + + +```python +from horovod.spark.common.store import DBFSLocalStore +from pytorch_lightning.callbacks import ModelCheckpoint +from synapse.ml.dl import * +import uuid + +run_output_dir = f"/dbfs/FileStore/test/resnet50/{str(uuid.uuid4())[:8]}" +store = DBFSLocalStore(run_output_dir) + +epochs = 10 + +callbacks = [ModelCheckpoint(filename="{epoch}-{train_loss:.2f}")] +``` + + +```python +deep_vision_classifier = DeepVisionClassifier( + backbone="resnet50", + store=store, + callbacks=callbacks, + num_classes=17, + batch_size=16, + epochs=epochs, + validation=0.1, +) + +deep_vision_model = deep_vision_classifier.fit(train_df) +``` + +### Prediction + + +```python +pred_df = deep_vision_model.transform(test_df) +evaluator = MulticlassClassificationEvaluator( + predictionCol="prediction", labelCol="label", metricName="accuracy" +) +print("Test accuracy:", evaluator.evaluate(pred_df)) +``` + + +```python +# Cleanup the output dir for test +dbutils.fs.rm(run_output_dir, True) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - ONNX Model Inference.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - ONNX Model Inference.md new file mode 100644 index 0000000000..4c19677577 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - ONNX Model Inference.md @@ -0,0 +1,172 @@ +--- +title: Quickstart - ONNX Model Inference +hide_title: true +status: stable +--- +# ONNX Inference on Spark + +In this example, you train a LightGBM model and convert the model to [ONNX](https://onnx.ai/) format. 
Once converted, you use the model to infer some testing data on Spark. + +This example uses the following Python packages and versions: + +- `onnxmltools==1.7.0` +- `lightgbm==3.2.1` + + +## Load the example data + +To load the example data, add the following code examples to cells in your notebook and then run the cells: + + +```python +%pip install lightgbm onnxmltools==1.7.0 +``` + + +```python +df = ( + spark.read.format("csv") + .option("header", True) + .option("inferSchema", True) + .load( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv" + ) +) + +display(df) +``` + +The output should look similar to the following table, though the values and number of rows may differ: + +| Interest Coverage Ratio | Net Income Flag | Equity to Liability | +| ----- | ----- | ----- | +| 0.5641 | 1.0 | 0.0165 | +| 0.5702 | 1.0 | 0.0208 | +| 0.5673 | 1.0 | 0.0165 | + +## Use LightGBM to train a model + + +```python +from pyspark.ml.feature import VectorAssembler +from synapse.ml.lightgbm import LightGBMClassifier + +feature_cols = df.columns[1:] +featurizer = VectorAssembler(inputCols=feature_cols, outputCol="features") + +train_data = featurizer.transform(df)["Bankrupt?", "features"] + +model = ( + LightGBMClassifier(featuresCol="features", labelCol="Bankrupt?") + .setEarlyStoppingRound(300) + .setLambdaL1(0.5) + .setNumIterations(1000) + .setNumThreads(-1) + .setMaxDeltaStep(0.5) + .setNumLeaves(31) + .setMaxDepth(-1) + .setBaggingFraction(0.7) + .setFeatureFraction(0.7) + .setBaggingFreq(2) + .setObjective("binary") + .setIsUnbalance(True) + .setMinSumHessianInLeaf(20) + .setMinGainToSplit(0.01) +) + +model = model.fit(train_data) +``` + +## Convert the model to ONNX format + +The following code exports the trained model to a LightGBM booster and then converts it to ONNX format: + + +```python +from synapse.ml.core.platform import running_on_binder + +if running_on_binder(): + from IPython import get_ipython +``` + + +```python 
+import lightgbm as lgb +from lightgbm import Booster, LGBMClassifier + + +def convertModel(lgbm_model: LGBMClassifier or Booster, input_size: int) -> bytes: + from onnxmltools.convert import convert_lightgbm + from onnxconverter_common.data_types import FloatTensorType + + initial_types = [("input", FloatTensorType([-1, input_size]))] + onnx_model = convert_lightgbm( + lgbm_model, initial_types=initial_types, target_opset=9 + ) + return onnx_model.SerializeToString() + + +booster_model_str = model.getLightGBMBooster().modelStr().get() +booster = lgb.Booster(model_str=booster_model_str) +model_payload_ml = convertModel(booster, len(feature_cols)) +``` + +After conversion, load the ONNX payload into an `ONNXModel` and inspect the model inputs and outputs: + + +```python +from synapse.ml.onnx import ONNXModel + +onnx_ml = ONNXModel().setModelPayload(model_payload_ml) + +print("Model inputs:" + str(onnx_ml.getModelInputs())) +print("Model outputs:" + str(onnx_ml.getModelOutputs())) +``` + +Map the model input to the input dataframe's column name (FeedDict), and map the output dataframe's column names to the model outputs (FetchDict). + + +```python +onnx_ml = ( + onnx_ml.setDeviceType("CPU") + .setFeedDict({"input": "features"}) + .setFetchDict({"probability": "probabilities", "prediction": "label"}) + .setMiniBatchSize(5000) +) +``` + +## Use the model for inference + +To perform inference with the model, the following code creates test data and transforms the data through the ONNX model. 
+ + +```python +from pyspark.ml.feature import VectorAssembler +import pandas as pd +import numpy as np + +n = 1000 * 1000 +m = 95 +test = np.random.rand(n, m) +testPdf = pd.DataFrame(test) +cols = list(map(str, testPdf.columns)) +testDf = spark.createDataFrame(testPdf) +testDf = testDf.union(testDf).repartition(200) +testDf = ( + VectorAssembler() + .setInputCols(cols) + .setOutputCol("features") + .transform(testDf) + .drop(*cols) + .cache() +) + +display(onnx_ml.transform(testDf)) +``` + +The output should look similar to the following table, though the values and number of rows may differ: + +| Index | Features | Prediction | Probability | +| ----- | ----- | ----- | ----- | +| 1 | `"{"type":1,"values":[0.105...` | 0 | `"{"0":0.835...` | +| 2 | `"{"type":1,"values":[0.814...` | 0 | `"{"0":0.658...` | diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - Transfer Learn for Image Classification.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - Transfer Learn for Image Classification.md new file mode 100644 index 0000000000..be32797ddc --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Deep Learning/Quickstart - Transfer Learn for Image Classification.md @@ -0,0 +1,139 @@ +--- +title: Quickstart - Transfer Learn for Image Classification +hide_title: true +status: stable +--- +## Deep Learning - Flower Image Classification + + +```python +from pyspark.ml import Transformer, Estimator, Pipeline +from pyspark.ml.classification import LogisticRegression +import sys, time +``` + + +```python +# Load the images +# use flowers_and_labels.parquet on larger cluster in order to get better results +imagesWithLabels = ( + spark.read.parquet( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/flowers_and_labels2.parquet" + ) + .withColumnRenamed("bytes", "image") + .sample(0.1) +) + +imagesWithLabels.printSchema() +``` + +![Smiley face](https://i.imgur.com/p2KgdYL.jpg) + + 
+```python +from synapse.ml.opencv import ImageTransformer +from synapse.ml.image import UnrollImage +from synapse.ml.onnx import ImageFeaturizer +from synapse.ml.stages import * + +# Make some featurizers +it = ImageTransformer().setOutputCol("scaled").resize(size=(60, 60)) + +ur = UnrollImage().setInputCol("scaled").setOutputCol("features") + +dc1 = DropColumns().setCols(["scaled", "image"]) + +lr1 = ( + LogisticRegression().setMaxIter(8).setFeaturesCol("features").setLabelCol("labels") +) + +dc2 = DropColumns().setCols(["features"]) + +basicModel = Pipeline(stages=[it, ur, dc1, lr1, dc2]) +``` + + +```python +resnet = ( + ImageFeaturizer().setInputCol("image").setOutputCol("features").setModel("ResNet50") +) + +dc3 = DropColumns().setCols(["image"]) + +lr2 = ( + LogisticRegression().setMaxIter(8).setFeaturesCol("features").setLabelCol("labels") +) + +dc4 = DropColumns().setCols(["features"]) + +deepModel = Pipeline(stages=[resnet, dc3, lr2, dc4]) +``` + +![Resnet 18](https://i.imgur.com/Mb4Dyou.png) + +### How does it work? + +![Convolutional network weights](http://i.stack.imgur.com/Hl2H6.png) + +### Run the experiment + + +```python +def timedExperiment(model, train, test): + start = time.time() + result = model.fit(train).transform(test).toPandas() + print("Experiment took {}s".format(time.time() - start)) + return result +``` + + +```python +train, test = imagesWithLabels.randomSplit([0.8, 0.2]) +train.count(), test.count() +``` + + +```python +basicResults = timedExperiment(basicModel, train, test) +``` + + +```python +deepResults = timedExperiment(deepModel, train, test) +``` + +### Plot confusion matrix. 
+ + +```python +import matplotlib.pyplot as plt +from sklearn.metrics import confusion_matrix +import numpy as np + + +def evaluate(results, name): + y, y_hat = results["labels"], results["prediction"] + y = [int(l) for l in y] + + accuracy = np.mean([1.0 if pred == true else 0.0 for (pred, true) in zip(y_hat, y)]) + cm = confusion_matrix(y, y_hat) + cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] + + plt.text( + 40, 10, "$Accuracy$ $=$ ${}\%$".format(round(accuracy * 100, 1)), fontsize=14 + ) + plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues) + plt.colorbar() + plt.xlabel("$Predicted$ $label$", fontsize=18) + plt.ylabel("$True$ $Label$", fontsize=18) + plt.title("$Normalized$ $CM$ $for$ ${}$".format(name)) + + +plt.figure(figsize=(12, 5)) +plt.subplot(1, 2, 1) +evaluate(deepResults, "CNTKModel + LR") +plt.subplot(1, 2, 2) +evaluate(basicResults, "LR") +# Note that on the larger dataset the accuracy will bump up from 44% to >90% +display(plt.show()) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Hyperparameter Tuning/HyperOpt.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Hyperparameter Tuning/HyperOpt.md new file mode 100644 index 0000000000..63b915e4ac --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Hyperparameter Tuning/HyperOpt.md @@ -0,0 +1,335 @@ +--- +title: HyperOpt +hide_title: true +status: stable +--- +# Hyperparameter tuning: SynapseML with Hyperopt + +[SynapseML](https://github.com/microsoft/SynapseML) is an open-source library that simplifies the creation of massively scalable machine learning (ML) pipelines. SynapseML provides simple, composable, and distributed APIs for a wide variety of different machine learning tasks such as text analytics, vision, anomaly detection, and many others. 
+ +[Hyperopt](https://github.com/hyperopt/hyperopt), on the other hand, is a Python library for serial and parallel optimization over complex search spaces, including real-valued, discrete, and conditional dimensions. + +This guide showcases the process of tuning a distributed algorithm in Spark with SynapseML and Hyperopt. + +The use case of this guide is for distributed machine learning in Python that requires hyperparameter tuning. It provides a demo on how to tune hyperparameters for a machine learning workflow in SynapseML and can be used as a reference to tune other distributed machine learning algorithms from Spark MLlib or other libraries. + +The guide includes two sections: +* Running distributed training with SynapseML without hyperparameter tuning. +* Using Hyperopt to tune hyperparameters in the distributed training workflow. +## Prerequisites + - If you are running it on Synapse, you'll need to [create an AML workspace and set up linked Service](../../../Use%20with%20MLFlow/Overview/). + +## Requirements + - Install HyperOpt + + +```python +%pip install hyperopt mlflow +``` + +## MLflow autologging + +To track model training and tuning with MLflow, you could enable MLflow autologging by running `mlflow.pyspark.ml.autolog()`. + + +```python +from synapse.ml.core.platform import * + +if running_on_synapse_internal(): + experiment_name = "hyperopt-synapseml" +elif running_on_synapse(): + experiment_name = "hyperopt-synapseml" +else: + experiment_name = "/Shared/hyperopt-synapseml" +``` + + +```python +import mlflow + +mlflow.__version__ +``` + + +```python +# Set pyspark autologging logModelAllowlist to include SynapseML models +spark.sparkContext._conf.set( + "spark.mlflow.pysparkml.autolog.logModelAllowlistFile", + "https://mmlspark.blob.core.windows.net/publicwasb/log_model_allowlist.txt", +) +``` + + +```python +# enable autologging +mlflow.pyspark.ml.autolog() +``` + +### Set experiment name for tracking + + +```python +# Set MLflow experiment. 
+ +if running_on_synapse(): + from notebookutils.mssparkutils import azureML + + linked_service = "AzureMLService1" # use your linked service name + ws = azureML.getWorkspace(linked_service) + mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri()) +mlflow.set_experiment(experiment_name) +``` + +## Part 1. Run distributed training using MLlib + +This section shows a simple example of distributed training using SynapseML. For more information and examples, visit the official [website](https://microsoft.github.io/SynapseML/) + +## Prepare Dataset +We use [*California Housing* dataset](https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset). +The data was derived from the 1990 U.S. census. It consists of 20640 entries with 8 features. +We use `sklearn.datasets` module to download it easily, then split the set into training and testing by 75/25. + + +```python +import numpy as np +import pandas as pd +from sklearn.datasets import fetch_california_housing +import time +``` + + +```python +try: + california = fetch_california_housing() +except EOFError: + print("Encountered EOFError while downloading, retrying once...") + time.sleep(5) + california = fetch_california_housing() + +feature_cols = ["f" + str(i) for i in range(california.data.shape[1])] +header = ["target"] + feature_cols +df = spark.createDataFrame( + pd.DataFrame( + data=np.column_stack((california.target, california.data)), columns=header + ) +).repartition(1) + +print("Dataframe has {} rows".format(df.count())) +display(df) +``` + +Following is the summary of the data set. + + +```python +display(df.summary().toPandas()) +``` + +### Create a function to train a model + +In this section, you define a function to train a gradient boosting model with SynapseML LightgbmRegressor. Wrapping the training code in a function is important for passing the function to Hyperopt for tuning later. 
+ +We evaluate the prediction result by using `synapse.ml.train.ComputeModelStatistics`, which returns four metrics: +* [MSE (Mean Squared Error)](https://en.wikipedia.org/wiki/Mean_squared_error) +* [RMSE (Root Mean Squared Error)](https://en.wikipedia.org/wiki/Root-mean-square_deviation) = sqrt(MSE) +* [R Squared](https://en.wikipedia.org/wiki/Coefficient_of_determination) +* [MAE (Mean Absolute Error)](https://en.wikipedia.org/wiki/Mean_absolute_error) + + +```python +from pyspark.ml.feature import VectorAssembler + +# Convert features into a single vector column +featurizer = VectorAssembler(inputCols=feature_cols, outputCol="features") +data = featurizer.transform(df)["target", "features"] + +train_data, test_data = data.randomSplit([0.75, 0.25], seed=42) +train_data, validation_data = train_data.randomSplit([0.85, 0.15], seed=42) + +display(train_data) + +# Using one partition since the training dataset is very small +repartitioned_data = train_data.repartition(1).cache() +``` + + +```python +from synapse.ml.lightgbm import LightGBMRegressor +from synapse.ml.train import ComputeModelStatistics + + +def train_tree(alpha, learningRate, numLeaves, numIterations): + """ + This train_tree() function: + - takes hyperparameters as inputs (for tuning later) + - returns the trained model and its R^2 score on the validation dataset + + Wrapping code as a function makes it easier to reuse the code later with Hyperopt. + """ + # Use MLflow to track training. + # Specify "nested=True" since this single model will be logged as a child run of Hyperopt's run. + with mlflow.start_run(nested=True): + + lgr = LightGBMRegressor( + objective="quantile", + alpha=alpha, + learningRate=learningRate, + numLeaves=numLeaves, + labelCol="target", + numIterations=numIterations, + ) + + model = lgr.fit(repartitioned_data) + + cms = ComputeModelStatistics( + evaluationMetric="regression", labelCol="target", scoresCol="prediction" + ) + + # Define an evaluation metric and evaluate the model on the validation dataset.
+ predictions = model.transform(validation_data) + metrics = cms.transform(predictions).collect()[0].asDict() + + # log metrics with mlflow + mlflow.log_metric("MSE", metrics["mean_squared_error"]) + mlflow.log_metric("RMSE", metrics["root_mean_squared_error"]) + mlflow.log_metric("R^2", metrics["R^2"]) + mlflow.log_metric("MAE", metrics["mean_absolute_error"]) + + return model, metrics["R^2"] +``` + +Run the training function to make sure it works. +It's a good idea to make sure training code runs before adding in tuning. + + +```python +initial_model, val_metric = train_tree( + alpha=0.2, learningRate=0.3, numLeaves=31, numIterations=50 +) +print( + f"The trained decision tree achieved a R^2 of {val_metric} on the validation data" +) +``` + +## Part 2. Use Hyperopt to tune hyperparameters + +In the second section, the Hyperopt workflow is created by: +* Defining a function to minimize +* Defining a search space over hyperparameters +* Specifying the search algorithm and using `fmin()` for tuning the model. + +For more information about the Hyperopt APIs, see the [Hyperopt documentation](http://hyperopt.github.io/hyperopt/). + +### Define a function to minimize + +* Input: hyperparameters +* Internally: Reuse the training function defined above. +* Output: loss + + +```python +from hyperopt import fmin, tpe, hp, Trials, STATUS_OK + + +def train_with_hyperopt(params): + """ + An example train method that calls into MLlib. + This method is passed to hyperopt.fmin(). + + :param params: hyperparameters as a dict. Its structure is consistent with how search space is defined. See below. + :return: dict with fields 'loss' (scalar loss) and 'status' (success/failure status of run) + """ + # For integer parameters, make sure to convert them to int type if Hyperopt is searching over a continuous range of values. 
+ alpha = params["alpha"] + learningRate = params["learningRate"] + numLeaves = int(params["numLeaves"]) + numIterations = int(params["numIterations"]) + + model, r_squared = train_tree(alpha, learningRate, numLeaves, numIterations) + + # Hyperopt expects you to return a loss (for which lower is better), so take the negative of the R^2 (for which higher is better). + loss = -r_squared + + return {"loss": loss, "status": STATUS_OK} +``` + +### Define the search space over hyperparameters + +This example tunes four hyperparameters: `alpha`, `learningRate`, `numLeaves` and `numIterations`. See the [Hyperopt documentation](https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions) for details on defining a search space and parameter expressions. + + +```python +space = { + "alpha": hp.uniform("alpha", 0, 1), + "learningRate": hp.uniform("learningRate", 0, 1), + "numLeaves": hp.uniformint("numLeaves", 30, 50), + "numIterations": hp.uniformint("numIterations", 20, 100), +} +``` + +### Tune the model using Hyperopt `fmin()` + +For tuning the model with Hyperopt's `fmin()`, the following steps are taken: +- Setting `max_evals` to the maximum number of points in the hyperparameter space to be tested. +- Specifying the search algorithm, either `hyperopt.tpe.suggest` or `hyperopt.rand.suggest`. + - `hyperopt.tpe.suggest`: Tree of Parzen Estimators, a Bayesian approach which iteratively and adaptively selects new hyperparameter settings to explore based on previous results + - `hyperopt.rand.suggest`: Random search, a non-adaptive approach that randomly samples the search space + +**Important:** +When using Hyperopt with SynapseML and other distributed training algorithms, do not pass a `trials` argument to `fmin()`. When you do not include the `trials` argument, Hyperopt uses the default `Trials` class, which runs on the cluster driver. Hyperopt needs to evaluate each trial on the driver node so that each trial can initiate distributed training jobs. 
+ +Do not use the `SparkTrials` class with SynapseML. `SparkTrials` is designed to distribute trials for algorithms that are not themselves distributed. SynapseML uses distributed computing already and is not compatible with `SparkTrials`. + + +```python +algo = tpe.suggest + +with mlflow.start_run(): + best_params = fmin(fn=train_with_hyperopt, space=space, algo=algo, max_evals=8) +``` + + +```python +# Print out the parameters that produced the best model +best_params +``` + +### Retrain the model on the full training dataset + +For tuning, this workflow split the training dataset into training and validation subsets. Now, retrain the model using the "best" hyperparameters on the full training dataset. + + +```python +best_alpha = best_params["alpha"] +best_learningRate = best_params["learningRate"] +best_numIterations = int(best_params["numIterations"]) +best_numLeaves = int(best_params["numLeaves"]) + +final_model, val_r_squared = train_tree( + best_alpha, best_learningRate, best_numLeaves, best_numIterations +) +``` + +Use the test dataset to compare evaluation metrics for the initial and "best" models. + + +```python +# Define an evaluation metric and evaluate the model on the test dataset. +cms = ComputeModelStatistics( + evaluationMetric="regression", labelCol="target", scoresCol="prediction" +) + +initial_model_predictions = initial_model.transform(test_data) +initial_model_test_metric = ( + cms.transform(initial_model_predictions).collect()[0].asDict()["R^2"] +) + +final_model_predictions = final_model.transform(test_data) +final_model_test_metric = ( + cms.transform(final_model_predictions).collect()[0].asDict()["R^2"] +) + +print( + f"On the test data, the initial (untuned) model achieved R^2 {initial_model_test_metric}, and the final (tuned) model achieved {final_model_test_metric}." 
+) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Hyperparameter Tuning/Quickstart - Random Search.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Hyperparameter Tuning/Quickstart - Random Search.md new file mode 100644 index 0000000000..c9a09114cc --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Hyperparameter Tuning/Quickstart - Random Search.md @@ -0,0 +1,99 @@ +--- +title: Quickstart - Random Search +hide_title: true +status: stable +--- +# HyperParameterTuning - Fighting Breast Cancer + +This tutorial shows how SynapseML can be used to identify the best combination of hyperparameters for your chosen classifiers, ultimately resulting in more accurate and reliable models. In order to demonstrate this, we'll show how to perform distributed randomized grid search hyperparameter tuning to build a model to identify breast cancer. + +## 1 - Set up dependencies +Start by importing pandas and setting up our Spark session. + +Next, read the data and split it into tuning and test sets. + + +```python +data = spark.read.parquet( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/BreastCancer.parquet" +).cache() +tune, test = data.randomSplit([0.80, 0.20]) +tune.limit(10).toPandas() +``` + +Define the models to be used. + + +```python +from synapse.ml.automl import TuneHyperparameters +from synapse.ml.train import TrainClassifier +from pyspark.ml.classification import ( + LogisticRegression, + RandomForestClassifier, + GBTClassifier, +) + +logReg = LogisticRegression() +randForest = RandomForestClassifier() +gbt = GBTClassifier() +smlmodels = [logReg, randForest, gbt] +mmlmodels = [TrainClassifier(model=model, labelCol="Label") for model in smlmodels] +``` + +## 2 - Find the best model using AutoML + +Import SynapseML's AutoML classes from `synapse.ml.automl`. +Specify the hyperparameters using the `HyperparamBuilder`. Add either `DiscreteHyperParam` or `RangeHyperParam` hyperparameters. 
`TuneHyperparameters` will randomly choose values from a uniform distribution: + + +```python +from synapse.ml.automl import * + +paramBuilder = ( + HyperparamBuilder() + .addHyperparam(logReg, logReg.regParam, RangeHyperParam(0.1, 0.3)) + .addHyperparam(randForest, randForest.numTrees, DiscreteHyperParam([5, 10])) + .addHyperparam(randForest, randForest.maxDepth, DiscreteHyperParam([3, 5])) + .addHyperparam(gbt, gbt.maxBins, RangeHyperParam(8, 16)) + .addHyperparam(gbt, gbt.maxDepth, DiscreteHyperParam([3, 5])) +) +searchSpace = paramBuilder.build() +# The search space is a list of params to tuples of estimator and hyperparam +print(searchSpace) +randomSpace = RandomSpace(searchSpace) +``` + +Next, run TuneHyperparameters to get the best model. + + +```python +bestModel = TuneHyperparameters( + evaluationMetric="accuracy", + models=mmlmodels, + numFolds=2, + numRuns=len(mmlmodels) * 2, + parallelism=1, + paramSpace=randomSpace.space(), + seed=0, +).fit(tune) +``` + +## 3 - Evaluate the model +We can view the best model's parameters and retrieve the underlying best model pipeline + + +```python +print(bestModel.getBestModelInfo()) +print(bestModel.getBestModel()) +``` + +We can score against the test set and view metrics. 
+ + +```python +from synapse.ml.train import ComputeModelStatistics + +prediction = bestModel.transform(test) +metrics = ComputeModelStatistics().transform(prediction) +metrics.limit(10).toPandas() +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/LightGBM/Overview.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/LightGBM/Overview.md new file mode 100644 index 0000000000..f5979e1072 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/LightGBM/Overview.md @@ -0,0 +1,262 @@ +--- +title: Overview +hide_title: true +sidebar_label: Overview +--- + +# LightGBM on Apache Spark + +### LightGBM + +[LightGBM](https://github.com/Microsoft/LightGBM) is an open-source, +distributed, high-performance gradient boosting (GBDT, GBRT, GBM, or +MART) framework. This framework specializes in creating high-quality and +GPU enabled decision tree algorithms for ranking, classification, and +many other machine learning tasks. LightGBM is part of Microsoft's +[DMTK](http://github.com/microsoft/dmtk) project. + +### Advantages of LightGBM through SynapseML + +- **Composability**: LightGBM models can be incorporated into existing + SparkML Pipelines, and used for batch, streaming, and serving + workloads. +- **Performance**: LightGBM on Spark is 10-30% faster than SparkML on + the Higgs dataset, and achieves a 15% increase in AUC. [Parallel + experiments](https://github.com/Microsoft/LightGBM/blob/master/docs/Experiments.rst#parallel-experiment) + have verified that LightGBM can achieve a linear speed-up by using + multiple machines for training in specific settings. +- **Functionality**: LightGBM offers a wide array of [tunable + parameters](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst), + that one can use to customize their decision tree system. LightGBM on + Spark also supports new types of problems such as quantile regression. 
+- **Cross platform** LightGBM on Spark is available on Spark, PySpark, and SparklyR + +### Usage + +In PySpark, you can run the `LightGBMClassifier` via: + +```python +from synapse.ml.lightgbm import LightGBMClassifier +model = LightGBMClassifier(learningRate=0.3, + numIterations=100, + numLeaves=31).fit(train) +``` + +Similarly, you can run the `LightGBMRegressor` by setting the +`application` and `alpha` parameters: + +```python +from synapse.ml.lightgbm import LightGBMRegressor +model = LightGBMRegressor(application='quantile', + alpha=0.3, + learningRate=0.3, + numIterations=100, + numLeaves=31).fit(train) +``` + +For an end to end application, check out the LightGBM [notebook +example](../Quickstart%20-%20Classification,%20Ranking,%20and%20Regression). + +### Arguments/Parameters + +SynapseML exposes getters/setters for many common LightGBM parameters. +In python, you can use property-value pairs, or in Scala use +fluent setters. Examples of both are shown in this section. + +```scala +import com.microsoft.azure.synapse.ml.lightgbm.LightGBMClassifier +val classifier = new LightGBMClassifier() + .setLearningRate(0.2) + .setNumLeaves(50) +``` + +LightGBM has far more parameters than SynapseML exposes. For cases where you +need to set some parameters that SynapseML doesn't expose a setter for, use +passThroughArgs. This argument is just a free string that you can use to add extra parameters +to the command SynapseML sends to configure LightGBM. 
+ +In Python: +```python +from synapse.ml.lightgbm import LightGBMClassifier +model = LightGBMClassifier(passThroughArgs="force_row_wise=true min_sum_hessian_in_leaf=2e-3", + numIterations=100, + numLeaves=31).fit(train) +``` + +In Scala: +```scala +import com.microsoft.azure.synapse.ml.lightgbm.LightGBMClassifier +val classifier = new LightGBMClassifier() + .setPassThroughArgs("force_row_wise=true min_sum_hessian_in_leaf=2e-3") + .setLearningRate(0.2) + .setNumLeaves(50) +``` + +For formatting options and specific argument documentation, see +[LightGBM docs](https://lightgbm.readthedocs.io/en/v3.3.2/Parameters.html). SynapseML sets some +parameters specifically for the Spark distributed environment, and these +shouldn't be changed. Some parameters are for CLI mode only, and don't work within +Spark. + +You can mix *passThroughArgs* and explicit args, as shown in the example. SynapseML +merges them to create one argument string to send to LightGBM. If you set a parameter in +both places, *passThroughArgs* takes precedence. + +### Architecture + +LightGBM on Spark uses the Simple Wrapper and Interface Generator (SWIG) +to add Java support for LightGBM. These Java bindings use the Java Native +Interface to call into the [distributed C++ +API](https://github.com/Microsoft/LightGBM/blob/master/include/LightGBM/c_api.h). + +We initialize LightGBM by calling +[`LGBM_NetworkInit`](https://github.com/Microsoft/LightGBM/blob/master/include/LightGBM/c_api.h) +with the Spark executors within a MapPartitions call. We then pass each +worker's partitions into LightGBM to create the in-memory distributed +dataset for LightGBM. We can then train LightGBM to produce a model +that can then be used for inference. + +The `LightGBMClassifier` and `LightGBMRegressor` use the SparkML API, +inherit from the same base classes, integrate with SparkML pipelines, +and can be tuned with [SparkML's cross +validators](https://spark.apache.org/docs/latest/ml-tuning.html). 
+ +Models built can be saved as SparkML pipeline with native LightGBM model +using `saveNativeModel()`. Additionally, they're fully compatible with [PMML](https://en.wikipedia.org/wiki/Predictive_Model_Markup_Language) and +can be converted to PMML format through the +[JPMML-SparkML-LightGBM](https://github.com/alipay/jpmml-sparkml-lightgbm) plugin. + +#### Dynamic Allocation Limitations +The native LightGBM library has a *distributed mode* that allows the algorithm to work over multiple *machines*. SynapseML +uses this mode to call LightGBM from Spark. SynapseML first gathers all the Spark executor networking information, passes that to LightGBM, and then +waits for LightGBM to complete its work. However, the native LightGBM algorithm implementation assumes all networking is constant over the time period of a single +training or scoring session. The native LightGBM distributed mode was designed this way and isn't a limitation of SynapseML by itself. + +Dynamic compute changes can cause LightGBM problems if the Spark executors change during data processing. Spark can naturally +take advantage of cluster autoscaling and can also dynamically replace any failed executor with another, but LightGBM can't +handle these networking changes. Large datasets are affected in particular since they're more likely to cause executor scaling +or have a single executor fail during a single processing pass. + +If you're experiencing problems with LightGBM as exposed through SynapseML due to executor changes (for example, occasional Task failures or networking hangs), +there are several options. +1. In the Spark platform, turn off any autoscaling on the cluster you have provisioned. +2. Set *numTasks* manually to be smaller so that fewer executors are used (reducing probability of single executor failure). +3. Turn off dynamic executor scaling with configuration in a notebook cell. 
In Synapse and Fabric, you can use: + +```python + %%configure + { + "conf": + { + "spark.dynamicAllocation.enabled": "false" + } + } +``` +Note: setting any custom configuration can affect cluster startup time if your compute platform takes advantage of "live pools" +to improve notebook performance. + +If you still have problems, you can consider splitting your data into smaller segments using *numBatches*. Splitting into multiple +batches increases total processing time, but can potentially be used to increase reliability. + +### Data Transfer Mode + +SynapseML must pass data from Spark partitions to LightGBM native Datasets before turning over control to +the actual LightGBM execution code for training and inference. SynapseML has two modes +that control how this data is transferred: *streaming* and *bulk*. +This mode doesn't affect training but can affect memory usage and overall fit/transform time. + +#### Bulk Execution mode +The "Bulk" mode is older and requires accumulating all data in executor memory before creating Datasets. This mode can cause +OOM errors for large data, especially since the data must be accumulated in its original uncompressed double-format size. +For now, "bulk" mode is the default since "streaming" is new, but SynapseML will eventually make streaming the default. + +For bulk mode, native LightGBM Datasets can either be created per partition (useSingleDatasetMode=false), or +per executor (useSingleDatasetMode=true). Generally, one Dataset per executor is more efficient since it reduces LightGBM network size and complexity during training or fitting. It also avoids using slow network protocols on partitions +that are actually on the same executor node. + +#### Streaming Execution Mode +The "streaming" execution mode uses new native LightGBM APIs created just for SynapseML that don't require loading extra copies of the data into memory. 
In particular, data is passed directly +from partitions to Datasets in small "micro-batches", similar to Spark streaming. The `microBatchSize` parameter controls the size of these micro-batches. +Smaller micro-batch sizes reduce memory overhead, but larger sizes avoid overhead from repeatedly transferring data to the native layer. The default, +100, uses far less memory than bulk mode since only 100 rows of data will be loaded at a time. If your dataset has +few columns, you can increase the batch size. Alternatively, if +your dataset has a large number of columns you can decrease the micro-batch size to avoid OOM issues. + +These new streaming APIs in LightGBM are thread-safe, and allow all partitions in the same executor +to push data into a shared Dataset in parallel. Because of this, streaming mode always uses the more efficient +"useSingleDatasetMode=true", creating only one Dataset per executor. + +You can explicitly specify Execution Mode and MicroBatch size as parameters. + + val lgbm = new LightGBMClassifier() + .setExecutionMode("streaming") + .setMicroBatchSize(100) + .setLabelCol(labelColumn) + .setObjective("binary") + ... + + +For streaming mode, only one Dataset is created per executor, so *useSingleDatasetMode* has no effect. It's effectively always true. + +### Data Sampling + +In order for the LightGBM algorithm to work, it must first create a set of bin boundaries for optimization. It does this calculation by +first sampling the data before any training or inferencing starts ([LightGBM docs](https://github.com/Microsoft/LightGBM)). The number of +samples to use is set using *binSampleCount*, which must cover a minimum percentage of the data or LightGBM rejects it. + +For *bulk* mode, this sampling is automatically done over the entire data, and each executor uses its own partitions to calculate samples for only +a subset of the features. This distributed sampling can have subtle effects since partitioning can affect the calculated bins. 
+Also, all data is sampled no matter what. + +For *streaming* mode, there are more explicit user controls for this sampling, and it's all done from the driver. +The *samplingMode* property controls the behavior. The efficiency of these methods increases from first to last. +- *global* - Like bulk mode, the random sample is calculated by iterating over the entire data (hence data is traversed twice) +- *subset* - (default) Samples only from the first *samplingSubsetSize* elements. Assumes this subset is representative. +- *fixed* - There's no random sample. The first *binSampleCount* rows are used. Assumes randomized data. +For large row counts, *subset* and *fixed* modes can save a first iteration over the entire data. + +#### Reference Dataset +The sampling of the data to calculate bin boundaries happens on every *fit* call. +If repeating a fit many times (for example, hyperparameter tuning), this calculation is duplicated effort. + +For *streaming* mode, there's an optimization that a client can set to use the previously calculated bin boundaries. The +sampling calculation results in a *reference dataset*, which can be reused. After a fit, there will be a *referenceDataset* property +on the estimator that was calculated and used for that fit. If that is set on the next estimator (or you reuse the same one), +it will use that instead of resampling the data. + +```python +from synapse.ml.lightgbm import LightGBMClassifier +classifier = LightGBMClassifier(learningRate=0.3, + numIterations=100, + numLeaves=31) +model1 = classifier.fit(train) + +classifier.setLearningRate(0.4) +model2 = classifier.fit(train) +``` +The 'model2' call to 'fit' doesn't resample the data and uses the same bin boundaries as 'model1'. + +*Caution*: Some parameters actually affect the bin boundary calculation and require the use of a new reference dataset every time. +These parameters include *isEnableSparse*, *useMissing*, and *zeroAsMissing* that you can set from SynapseML. 
If you manually set +some parameters with *passThroughArgs*, you should look at LightGBM docs to see if they affect bin boundaries. If you're setting +any parameter that affects bin boundaries and reusing the same estimator, you should set referenceDataset to an empty array between calls. + +### Barrier Execution Mode + +By default LightGBM uses the regular spark paradigm for launching tasks and communicates with the driver to coordinate task execution. +The driver thread aggregates all task host:port information and then communicates the full list back to the workers in order for NetworkInit to be called. +This procedure requires the driver to know how many tasks there are, and a mismatch between the expected number of tasks and the actual number causes +the initialization to deadlock. + +If you're experiencing network issues, you can try using Spark's *barrier* execution mode. SynapseML provides a `UseBarrierExecutionMode` flag, +to use Apache Spark's `barrier()` stage to ensure all tasks execute at the same time. +Barrier execution mode changes the logic to aggregate `host:port` information across all tasks in a synchronized way. +To use it in scala, you can call setUseBarrierExecutionMode(true), for example: + + val lgbm = new LightGBMClassifier() + .setLabelCol(labelColumn) + .setObjective(binaryObjective) + .setUseBarrierExecutionMode(true) + ... + +Note: barrier execution mode can also cause complicated issues, so use it only if needed. 
\ No newline at end of file diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/LightGBM/Quickstart - Classification, Ranking, and Regression.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/LightGBM/Quickstart - Classification, Ranking, and Regression.md new file mode 100644 index 0000000000..58d1a46c7d --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/LightGBM/Quickstart - Classification, Ranking, and Regression.md @@ -0,0 +1,302 @@ +--- +title: Quickstart - Classification, Ranking, and Regression +hide_title: true +status: stable +--- +# LightGBM + +## What is LightGBM + +[LightGBM](https://github.com/Microsoft/LightGBM) is an open-source, +distributed, high-performance gradient boosting (GBDT, GBRT, GBM, or +MART) framework. This framework specializes in creating high-quality and +GPU-enabled decision tree algorithms for ranking, classification, and +many other machine learning tasks. LightGBM is part of Microsoft's +[DMTK](https://github.com/microsoft/dmtk) project. + +### Advantages of LightGBM + +- **Composability**: LightGBM models can be incorporated into existing + SparkML pipelines and used for batch, streaming, and serving + workloads. +- **Performance**: LightGBM on Spark is 10-30% faster than SparkML on + the [Higgs dataset](https://archive.ics.uci.edu/dataset/280/higgs) and achieves a 15% increase in AUC. [Parallel + experiments](https://github.com/Microsoft/LightGBM/blob/master/docs/Experiments.rst#parallel-experiment) + have verified that LightGBM can achieve a linear speed-up by using + multiple machines for training in specific settings. +- **Functionality**: LightGBM offers a wide array of [tunable + parameters](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst), + that one can use to customize their decision tree system. LightGBM on + Spark also supports new types of problems such as quantile regression. 
+- **Cross platform**: LightGBM on Spark is available on Spark, PySpark, and SparklyR. + +### LightGBM Usage + +- **LightGBMClassifier**: used for building classification models. For example, to predict whether a company bankrupts or not, we could build a binary classification model with `LightGBMClassifier`. +- **LightGBMRegressor**: used for building regression models. For example, to predict housing price, we could build a regression model with `LightGBMRegressor`. +- **LightGBMRanker**: used for building ranking models. For example, to predict the relevance of website search results, we could build a ranking model with `LightGBMRanker`. + +## Use `LightGBMClassifier` to train a classification model + +In this example, we use LightGBM to build a classification model in order to predict bankruptcy. + +### Read dataset + + +```python +from synapse.ml.core.platform import * +``` + + +```python +df = ( + spark.read.format("csv") + .option("header", True) + .option("inferSchema", True) + .load( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv" + ) +) +# print dataset size +print("records read: " + str(df.count())) +print("Schema: ") +df.printSchema() +``` + + +```python +display(df) +``` + +### Split the dataset into train and test sets + + +```python +train, test = df.randomSplit([0.85, 0.15], seed=1) +``` + +### Add a featurizer to convert features into vectors + + +```python +from pyspark.ml.feature import VectorAssembler + +feature_cols = df.columns[1:] +featurizer = VectorAssembler(inputCols=feature_cols, outputCol="features") +train_data = featurizer.transform(train)["Bankrupt?", "features"] +test_data = featurizer.transform(test)["Bankrupt?", "features"] +``` + +### Check if the data is unbalanced + + +```python +display(train_data.groupBy("Bankrupt?").count()) +``` + +### Model Training + + +```python +from synapse.ml.lightgbm import LightGBMClassifier + +model = LightGBMClassifier( + objective="binary", 
featuresCol="features", labelCol="Bankrupt?", isUnbalance=True +) +``` + + +```python +model = model.fit(train_data) +``` + +"saveNativeModel" allows you to extract the underlying lightGBM model for fast deployment after you train on Spark. + + +```python +from synapse.ml.lightgbm import LightGBMClassificationModel + +if running_on_synapse(): + model.saveNativeModel("/models/lgbmclassifier.model") + model = LightGBMClassificationModel.loadNativeModelFromFile( + "/models/lgbmclassifier.model" + ) +if running_on_synapse_internal(): + model.saveNativeModel("Files/models/lgbmclassifier.model") + model = LightGBMClassificationModel.loadNativeModelFromFile( + "Files/models/lgbmclassifier.model" + ) +else: + model.saveNativeModel("/tmp/lgbmclassifier.model") + model = LightGBMClassificationModel.loadNativeModelFromFile( + "/tmp/lgbmclassifier.model" + ) +``` + +### Visualize feature importance + + +```python +import pandas as pd +import matplotlib.pyplot as plt + +feature_importances = model.getFeatureImportances() +fi = pd.Series(feature_importances, index=feature_cols) +fi = fi.sort_values(ascending=True) +f_index = fi.index +f_values = fi.values + +# print feature importances +print("f_index:", f_index) +print("f_values:", f_values) + +# plot +x_index = list(range(len(fi))) +x_index = [x / len(fi) for x in x_index] +plt.rcParams["figure.figsize"] = (20, 20) +plt.barh( + x_index, f_values, height=0.028, align="center", color="tan", tick_label=f_index +) +plt.xlabel("importances") +plt.ylabel("features") +plt.show() +``` + +### Generate predictions with the model + + +```python +predictions = model.transform(test_data) +predictions.limit(10).toPandas() +``` + + +```python +from synapse.ml.train import ComputeModelStatistics + +metrics = ComputeModelStatistics( + evaluationMetric="classification", + labelCol="Bankrupt?", + scoredLabelsCol="prediction", +).transform(predictions) +display(metrics) +``` + +## Use `LightGBMRegressor` to train a quantile regression model + +In 
this example, we show how to use LightGBM to build a regression model. + +### Read dataset + + +```python +triazines = spark.read.format("libsvm").load( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight" +) +``` + + +```python +# print some basic info +print("records read: " + str(triazines.count())) +print("Schema: ") +triazines.printSchema() +display(triazines.limit(10)) +``` + +### Split dataset into train and test sets + + +```python +train, test = triazines.randomSplit([0.85, 0.15], seed=1) +``` + +### Train the model using `LightGBMRegressor` + + +```python +from synapse.ml.lightgbm import LightGBMRegressor + +model = LightGBMRegressor( + objective="quantile", alpha=0.2, learningRate=0.3, numLeaves=31 +).fit(train) +``` + + +```python +print(model.getFeatureImportances()) +``` + +### Generate predictions with the model + + +```python +scoredData = model.transform(test) +display(scoredData) +``` + + +```python +from synapse.ml.train import ComputeModelStatistics + +metrics = ComputeModelStatistics( + evaluationMetric="regression", labelCol="label", scoresCol="prediction" +).transform(scoredData) +display(metrics) +``` + +## Use `LightGBMRanker` to train a ranking model + +### Read the dataset + + +```python +df = spark.read.format("parquet").load( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_train.parquet" +) +# print some basic info +print("records read: " + str(df.count())) +print("Schema: ") +df.printSchema() +display(df.limit(10)) +``` + +### Train the ranking model using `LightGBMRanker`. 
+ + +```python +from synapse.ml.lightgbm import LightGBMRanker + +features_col = "features" +query_col = "query" +label_col = "labels" +lgbm_ranker = LightGBMRanker( + labelCol=label_col, + featuresCol=features_col, + groupCol=query_col, + predictionCol="preds", + leafPredictionCol="leafPreds", + featuresShapCol="importances", + repartitionByGroupingColumn=True, + numLeaves=32, + numIterations=200, + evalAt=[1, 3, 5], + metric="ndcg", +) +``` + + +```python +lgbm_ranker_model = lgbm_ranker.fit(df) +``` + +### Generate predictions with the model + + +```python +dt = spark.read.format("parquet").load( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_test.parquet" +) +predictions = lgbm_ranker_model.transform(dt) +predictions.limit(10).toPandas() +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Langchain.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Langchain.md new file mode 100644 index 0000000000..9c25ecfa3c --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Langchain.md @@ -0,0 +1,236 @@ +--- +title: Langchain +hide_title: true +status: stable +--- +# Using the LangChain Transformer + +LangChain is a software development framework designed to simplify the creation of applications using large language models (LLMs). Chains in LangChain go beyond just a single LLM call and are sequences of calls (can be a call to an LLM or a different utility), automating the execution of a series of calls and actions. +To make it easier to scale up the LangChain execution on a large dataset, we have integrated LangChain with the distributed machine learning library [SynapseML](https://www.microsoft.com/en-us/research/blog/synapseml-a-simple-multilingual-and-massively-parallel-machine-learning-library/). This integration makes it easy to use the [Apache Spark](https://spark.apache.org/) distributed computing framework to process millions of data with the LangChain Framework. 
+ +This tutorial shows how to apply LangChain at scale for paper summarization and organization. We start with a table of arxiv links and apply the LangChain Transformer to automatically extract the corresponding paper title, authors, summary, and some related works. + +## Step 1: Prerequisites + +The key prerequisites for this quickstart include a working Azure OpenAI resource, and an Apache Spark cluster with SynapseML installed. We suggest creating a Synapse workspace, but an Azure Databricks, HDInsight, or Spark on Kubernetes, or even a python environment with the `pyspark` package will work. + +1. An Azure OpenAI resource – request access [here](https://customervoice.microsoft.com/Pages/ResponsePage.aspx?id=v4j5cvGGr0GRqy180BHbR7en2Ais5pxKtso_Pz4b1_xUOFA5Qk1UWDRBMjg0WFhPMkIzTzhKQ1dWNyQlQCN0PWcu) before [creating a resource](https://docs.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource) +1. [Create a Synapse workspace](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-create-workspace) +1. [Create a serverless Apache Spark pool](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-analyze-spark#create-a-serverless-apache-spark-pool) + +## Step 2: Import this guide as a notebook + +The next step is to add this code into your Spark cluster. You can either create a notebook in your Spark platform and copy the code into this notebook to run the demo, or download the notebook and import it into Synapse Analytics. + +1. Import the notebook into [Microsoft Fabric](https://learn.microsoft.com/en-us/fabric/data-engineering/how-to-use-notebook), [Synapse Workspace](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-development-using-notebooks#create-a-notebook) or if using Databricks into the [Databricks Workspace](https://docs.microsoft.com/en-us/azure/databricks/notebooks/notebooks-manage#create-a-notebook). +1. Install SynapseML on your cluster.
Please see the installation instructions for Synapse at the bottom of [the SynapseML website](https://microsoft.github.io/SynapseML/). Note that this requires pasting an additional cell at the top of the notebook you just imported. +1. Connect your notebook to a cluster and follow along, editing and running the cells below. + + +```python +%pip install openai==0.28.1 langchain==0.0.331 pdf2image pdfminer.six unstructured==0.10.24 pytesseract numpy==1.22.4 +``` + + +```python +import os, openai, langchain, uuid +from langchain.llms import AzureOpenAI, OpenAI +from langchain.agents import load_tools, initialize_agent, AgentType +from langchain.chains import TransformChain, LLMChain, SimpleSequentialChain +from langchain.document_loaders import OnlinePDFLoader +from langchain.tools.bing_search.tool import BingSearchRun, BingSearchAPIWrapper +from langchain.prompts import PromptTemplate +from synapse.ml.services.langchain import LangchainTransformer +from synapse.ml.core.platform import running_on_synapse, find_secret +``` + +## Step 3: Fill in the service information and construct the LLM +Next, please edit the cell in the notebook to point to your service. In particular set the `model_name`, `deployment_name`, `openai_api_base`, and `openai_api_key` variables to match those for your OpenAI service. Please feel free to replace `find_secret` with your key as follows: + +`openai_api_key = "99sj2w82o...."` + +`bing_subscription_key = "..."` + +Note that you also need to set up your Bing search to gain access to your [Bing Search subscription key](https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/create-bing-search-service-resource).
+ + +```python +openai_api_key = find_secret( + secret_name="openai-api-key-2", keyvault="mmlspark-build-keys" +) +openai_api_base = "https://synapseml-openai-2.openai.azure.com/" +openai_api_version = "2022-12-01" +openai_api_type = "azure" +deployment_name = "gpt-35-turbo" +bing_search_url = "https://api.bing.microsoft.com/v7.0/search" +bing_subscription_key = find_secret( + secret_name="bing-search-key", keyvault="mmlspark-build-keys" +) + +os.environ["BING_SUBSCRIPTION_KEY"] = bing_subscription_key +os.environ["BING_SEARCH_URL"] = bing_search_url +os.environ["OPENAI_API_TYPE"] = openai_api_type +os.environ["OPENAI_API_VERSION"] = openai_api_version +os.environ["OPENAI_API_BASE"] = openai_api_base +os.environ["OPENAI_API_KEY"] = openai_api_key + +llm = AzureOpenAI( + deployment_name=deployment_name, + model_name=deployment_name, + temperature=0.1, + verbose=True, +) +``` + +## Step 4: Basic Usage of LangChain Transformer + +### Create a chain +We will start by demonstrating the basic usage with a simple chain that creates definitions for input words + + +```python +copy_prompt = PromptTemplate( + input_variables=["technology"], + template="Define the following word: {technology}", +) + +chain = LLMChain(llm=llm, prompt=copy_prompt) +transformer = ( + LangchainTransformer() + .setInputCol("technology") + .setOutputCol("definition") + .setChain(chain) + .setSubscriptionKey(openai_api_key) + .setUrl(openai_api_base) +) +``` + +### Create a dataset and apply the chain + + +```python +# construction of test dataframe +df = spark.createDataFrame( + [(0, "docker"), (1, "spark"), (2, "python")], ["label", "technology"] +) +display(transformer.transform(df)) +``` + +### Save and load the LangChain transformer +LangChain Transformers can be saved and loaded. Note that LangChain serialization only works for chains that don't have memory. 
+ + +```python +temp_dir = "tmp" +if not os.path.exists(temp_dir): + os.mkdir(temp_dir) +path = os.path.join(temp_dir, "langchainTransformer") +transformer.save(path) +loaded = LangchainTransformer.load(path) +display(loaded.transform(df)) +``` + +## Step 5: Using LangChain for Large scale literature review + +### Create a Sequential Chain for paper summarization + +We will now construct a Sequential Chain for extracting structured information from an arxiv link. In particular, we will ask langchain to extract the title, author information, and a summary of the paper content. After that, we use a web search tool to find the recent papers written by the first author. + +To summarize, our sequential chain contains the following steps: + +1. **Transform Chain**: Extract Paper Content from arxiv Link **=>** +1. **LLMChain**: Summarize the Paper, extract paper title and authors **=>** +1. **Transform Chain**: to generate the prompt **=>** +1. **Agent with Web Search Tool**: Use Web Search to find the recent papers by the first author + + +```python +def paper_content_extraction(inputs: dict) -> dict: + arxiv_link = inputs["arxiv_link"] + loader = OnlinePDFLoader(arxiv_link) + pages = loader.load_and_split() + return {"paper_content": pages[0].page_content + pages[1].page_content} + + +def prompt_generation(inputs: dict) -> dict: + output = inputs["Output"] + prompt = ( + "find the paper title, author, summary in the paper description below, output them. After that, Use websearch to find out 3 recent papers of the first author in the author section below (first author is the first name separated by comma) and list the paper titles in bullet points: \n" + + output + + "." 
+ ) + return {"prompt": prompt} + + +paper_content_extraction_chain = TransformChain( + input_variables=["arxiv_link"], + output_variables=["paper_content"], + transform=paper_content_extraction, + verbose=False, +) + +paper_summarizer_template = """You are a paper summarizer, given the paper content, it is your job to summarize the paper into a short summary, and extract authors and paper title from the paper content. +Here is the paper content: +{paper_content} +Output: +paper title, authors and summary. +""" +prompt = PromptTemplate( + input_variables=["paper_content"], template=paper_summarizer_template +) +summarize_chain = LLMChain(llm=llm, prompt=prompt, verbose=False) + +prompt_generation_chain = TransformChain( + input_variables=["Output"], + output_variables=["prompt"], + transform=prompt_generation, + verbose=False, +) + +bing = BingSearchAPIWrapper(k=3) +tools = [BingSearchRun(api_wrapper=bing)] +web_search_agent = initialize_agent( + tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False +) + +sequential_chain = SimpleSequentialChain( + chains=[ + paper_content_extraction_chain, + summarize_chain, + prompt_generation_chain, + web_search_agent, + ] +) +``` + +### Apply the LangChain transformer to perform this workload at scale + +We can now use our chain at scale using the `LangchainTransformer` + + +```python +paper_df = spark.createDataFrame( + [ + (0, "https://arxiv.org/pdf/2107.13586.pdf"), + (1, "https://arxiv.org/pdf/2101.00190.pdf"), + (2, "https://arxiv.org/pdf/2103.10385.pdf"), + (3, "https://arxiv.org/pdf/2110.07602.pdf"), + ], + ["label", "arxiv_link"], +) + +# construct langchain transformer using the paper summarizer chain define above +paper_info_extractor = ( + LangchainTransformer() + .setInputCol("arxiv_link") + .setOutputCol("paper_info") + .setChain(sequential_chain) + .setSubscriptionKey(openai_api_key) + .setUrl(openai_api_base) +) + + +# extract paper information from arxiv links, the paper information needs to 
include: +# paper title, paper authors, brief paper summary, and recent papers published by the first author +display(paper_info_extractor.transform(paper_df)) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/OpenAI.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/OpenAI.md new file mode 100644 index 0000000000..792c402ad1 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/OpenAI.md @@ -0,0 +1,294 @@ +--- +title: OpenAI +hide_title: true +status: stable +--- +# Azure OpenAI for big data + +The Azure OpenAI service can be used to solve a large number of natural language tasks through prompting the completion API. To make it easier to scale your prompting workflows from a few examples to large datasets of examples, we have integrated the Azure OpenAI service with the distributed machine learning library [SynapseML](https://www.microsoft.com/en-us/research/blog/synapseml-a-simple-multilingual-and-massively-parallel-machine-learning-library/). This integration makes it easy to use the [Apache Spark](https://spark.apache.org/) distributed computing framework to process millions of prompts with the OpenAI service. This tutorial shows how to apply large language models at a distributed scale using Azure OpenAI. + +## Prerequisites + +The key prerequisites for this quickstart include a working Azure OpenAI resource, and an Apache Spark cluster with SynapseML installed. We suggest creating a Synapse workspace, but an Azure Databricks, HDInsight, or Spark on Kubernetes, or even a python environment with the `pyspark` package will work. + +1. 
An Azure OpenAI resource – request access [here](https://customervoice.microsoft.com/Pages/ResponsePage.aspx?id=v4j5cvGGr0GRqy180BHbR7en2Ais5pxKtso_Pz4b1_xUOFA5Qk1UWDRBMjg0WFhPMkIzTzhKQ1dWNyQlQCN0PWcu) before [creating a resource](https://docs.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource) +1. [Create a Synapse workspace](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-create-workspace) +1. [Create a serverless Apache Spark pool](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-analyze-spark#create-a-serverless-apache-spark-pool) + + +## Import this guide as a notebook + +The next step is to add this code into your Spark cluster. You can either create a notebook in your Spark platform and copy the code into this notebook to run the demo, or download the notebook and import it into Synapse Analytics. + +- [Download this demo as a notebook](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/OpenAI.ipynb) (select **Raw**, then save the file) +- Import the notebook. + * If you are using Synapse Analytics [into the Synapse Workspace](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-development-using-notebooks#create-a-notebook) + * If you are using Databricks [import into the Databricks Workspace](https://docs.microsoft.com/en-us/azure/databricks/notebooks/notebooks-manage#create-a-notebook). + * If you are using Fabric [import into the Fabric Workspace](https://learn.microsoft.com/en-us/fabric/data-engineering/how-to-use-notebook) +- Install SynapseML on your cluster. See the installation instructions for Synapse at the bottom of [the SynapseML website](https://microsoft.github.io/SynapseML/). + * If you are using Fabric, please check [Installation Guide](https://learn.microsoft.com/en-us/fabric/data-science/install-synapseml). This requires pasting an extra cell at the top of the notebook you imported.
+- Connect your notebook to a cluster and follow along, editing and running the cells. + +## Fill in service information + +Next, edit the cell in the notebook to point to your service. In particular set the `service_name`, `deployment_name`, `location`, and `key` variables to match them to your OpenAI service: + + +```python +from synapse.ml.core.platform import find_secret + +# Fill in the following lines with your service information +# Learn more about selecting which embedding model to choose: https://openai.com/blog/new-and-improved-embedding-model +service_name = "synapseml-openai-2" +deployment_name = "gpt-35-turbo" +deployment_name_embeddings = "text-embedding-ada-002" + +key = find_secret( + secret_name="openai-api-key-2", keyvault="mmlspark-build-keys" +) # please replace this line with your key as a string + +assert key is not None and service_name is not None +``` + +## Create a dataset of prompts + +Next, create a dataframe consisting of a series of rows, with one prompt per row. + +You can also load data directly from ADLS or other databases. For more information on loading and preparing Spark dataframes, see the [Apache Spark data loading guide](https://spark.apache.org/docs/latest/sql-data-sources.html). + + +```python +df = spark.createDataFrame( + [ + ("Hello my name is",), + ("The best code is code thats",), + ("SynapseML is ",), + ] +).toDF("prompt") +``` + +## Create the OpenAICompletion Apache Spark Client + +To apply the OpenAI Completion service to your dataframe you created, create an OpenAICompletion object, which serves as a distributed client. Parameters of the service can be set either with a single value, or by a column of the dataframe with the appropriate setters on the `OpenAICompletion` object. Here we're setting `maxTokens` to 200. A token is around four characters, and this limit applies to the sum of the prompt and the result. We're also setting the `promptCol` parameter with the name of the prompt column in the dataframe. 
+ + +```python +from synapse.ml.services.openai import OpenAICompletion + +completion = ( + OpenAICompletion() + .setSubscriptionKey(key) + .setDeploymentName(deployment_name) + .setCustomServiceName(service_name) + .setMaxTokens(200) + .setPromptCol("prompt") + .setErrorCol("error") + .setOutputCol("completions") +) +``` + +## Transform the dataframe with the OpenAICompletion Client + +After creating the dataframe and the completion client, you can transform your input dataset and add a column called `completions` with all of the information the service adds. Select just the text for simplicity. + + +```python +from pyspark.sql.functions import col + +completed_df = completion.transform(df).cache() +display( + completed_df.select( + col("prompt"), + col("error"), + col("completions.choices.text").getItem(0).alias("text"), + ) +) +``` + +Your output should look something like this. The completion text will be different from the sample. + +| **prompt** | **error** | **text** | +|:----------------------------: |:----------: |:-------------------------------------------------------------------------------------------------------------------------------------: | +| Hello my name is | null | Makaveli I'm eighteen years old and I want to be a rapper when I grow up I love writing and making music I'm from Los Angeles, CA | +| The best code is code thats | null | understandable This is a subjective statement, and there is no definitive answer. | +| SynapseML is | null | A machine learning algorithm that is able to learn how to predict the future outcome of events. | + +## More Usage Examples + +### Generating Text Embeddings + +In addition to completing text, we can also embed text for use in downstream algorithms or vector retrieval architectures. Creating embeddings allows you to search and retrieve documents from large collections and can be used when prompt engineering isn't sufficient for the task. 
+ +For more information on using `OpenAIEmbedding` see our [embedding guide](./Quickstart%20-%20OpenAI%20Embedding). + + +```python +from synapse.ml.services.openai import OpenAIEmbedding + +embedding = ( + OpenAIEmbedding() + .setSubscriptionKey(key) + .setDeploymentName(deployment_name_embeddings) + .setCustomServiceName(service_name) + .setTextCol("prompt") + .setErrorCol("error") + .setOutputCol("embeddings") +) + +display(embedding.transform(df)) +``` + +### Chat Completion + +Models such as ChatGPT and GPT-4 are capable of understanding chats instead of single prompts. The `OpenAIChatCompletion` transformer exposes this functionality at scale. + + +```python +from synapse.ml.services.openai import OpenAIChatCompletion +from pyspark.sql import Row +from pyspark.sql.types import * + + +def make_message(role, content): + return Row(role=role, content=content, name=role) + + +chat_df = spark.createDataFrame( + [ + ( + [ + make_message( + "system", "You are an AI chatbot with red as your favorite color" + ), + make_message("user", "Whats your favorite color"), + ], + ), + ( + [ + make_message("system", "You are very excited"), + make_message("user", "How are you today"), + ], + ), + ] +).toDF("messages") + + +chat_completion = ( + OpenAIChatCompletion() + .setSubscriptionKey(key) + .setDeploymentName(deployment_name) + .setCustomServiceName(service_name) + .setMessagesCol("messages") + .setErrorCol("error") + .setOutputCol("chat_completions") +) + +display( + chat_completion.transform(chat_df).select( + "messages", "chat_completions.choices.message.content" + ) +) +``` + +### Improve throughput with request batching + +The example makes several requests to the service, one for each prompt. To complete multiple prompts in a single request, use batch mode. First, in the OpenAICompletion object, instead of setting the Prompt column to "Prompt", specify "batchPrompt" for the BatchPrompt column. +To do so, create a dataframe with a list of prompts per row. 
+ +As of this writing there's currently a limit of 20 prompts in a single request, and a hard limit of 2048 "tokens", or approximately 1500 words. + + +```python + +``` + + +```python +batch_df = spark.createDataFrame( + [ + (["The time has come", "Pleased to", "Today stocks", "Here's to"],), + (["The only thing", "Ask not what", "Every litter", "I am"],), + ] +).toDF("batchPrompt") +``` + +Next we create the OpenAICompletion object. Rather than setting the prompt column, set the batchPrompt column if your column is of type `Array[String]`. + + +```python +batch_completion = ( + OpenAICompletion() + .setSubscriptionKey(key) + .setDeploymentName(deployment_name) + .setCustomServiceName(service_name) + .setMaxTokens(200) + .setBatchPromptCol("batchPrompt") + .setErrorCol("error") + .setOutputCol("completions") +) +``` + +In the call to transform, a request will be made per row. Since there are multiple prompts in a single row, each request is sent with all prompts in that row. The results contain a row for each row in the request. + + +```python +completed_batch_df = batch_completion.transform(batch_df).cache() +display(completed_batch_df) +``` + +### Using an automatic minibatcher + +If your data is in column format, you can transpose it to row format using SynapseML's `FixedMiniBatchTransformer`.
+ + +```python +from pyspark.sql.types import StringType +from synapse.ml.stages import FixedMiniBatchTransformer +from synapse.ml.core.spark import FluentAPI + +completed_autobatch_df = ( + df.coalesce( + 1 + ) # Force a single partition so that our little 4-row dataframe makes a batch of size 4, you can remove this step for large datasets + .mlTransform(FixedMiniBatchTransformer(batchSize=4)) + .withColumnRenamed("prompt", "batchPrompt") + .mlTransform(batch_completion) +) + +display(completed_autobatch_df) +``` + +### Prompt engineering for translation + +The Azure OpenAI service can solve many different natural language tasks through [prompt engineering](https://docs.microsoft.com/en-us/azure/cognitive-services/openai/how-to/completions). Here, we show an example of prompting for language translation: + + +```python +translate_df = spark.createDataFrame( + [ + ("Japanese: Ookina hako \nEnglish: Big box \nJapanese: Midori tako\nEnglish:",), + ( + "French: Quel heure et il au Montreal? \nEnglish: What time is it in Montreal? \nFrench: Ou est le poulet? 
\nEnglish:", + ), + ] +).toDF("prompt") + +display(completion.transform(translate_df)) +``` + +### Prompt for question answering + +Here, we prompt GPT-3 for general-knowledge question answering: + + +```python +qa_df = spark.createDataFrame( + [ + ( + "Q: Where is the Grand Canyon?\nA: The Grand Canyon is in Arizona.\n\nQ: What is the weight of the Burj Khalifa in kilograms?\nA:", + ) + ] +).toDF("prompt") + +display(completion.transform(qa_df)) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.md new file mode 100644 index 0000000000..abe540af25 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.md @@ -0,0 +1,215 @@ +--- +title: Quickstart - Custom Embeddings and Approximate KNN on GPU +hide_title: true +status: stable +--- +# Embedding Text with local (per node) NVIDIA TensorRT accelerator and GPU based Approximate Nearest Neighbor (ANN) + +This demo extends the existing [Azure OpenAI based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding%20and%20GPU%20based%20KNN.ipynb), in which encoding is processed by OpenAI requests and KNN uses GPU based brute force search. This tutorial shows how to perform fast local embeddings using [multilingual E5 text embeddings](https://arxiv.org/abs/2402.05672) and fast approximate Nearest Neighbor search using the IVFFlat algorithm. All tutorial stages are accelerated by NVIDIA GPU using [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) and [Spark Rapids ML](https://github.com/NVIDIA/spark-rapids-ml).
The tutorial folder contains two benchmark notebooks to demonstrate advantages of the presented GPU based approach compared to the [previous CPU based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding.ipynb) + +The key prerequisites for this quickstart include a working Azure OpenAI resource, and an Apache Spark cluster with SynapseML installed. We suggest creating a Synapse workspace, but currently the notebook was run on a Databricks GPU based cluster using Standard_NC24ads_A100_v4 with 6 workers. Databricks Runtime was 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) with related [init_script](https://github.com/microsoft/SynapseML/tree/master/tools/init_scripts) to install all required packages. + + +## Step 1: Prepare Environment + +This step imports the required libraries and gets the initial settings. + + +```python +import torch +import sys +import pyspark.sql.functions as F +from pyspark.sql.types import StructType, StructField, IntegerType, StringType +from pyspark.ml.linalg import Vectors +from pyspark.ml.linalg import VectorUDT +from spark_rapids_ml.knn import ( + ApproximateNearestNeighbors, + ApproximateNearestNeighborsModel, +) +from synapse.ml.hf import HuggingFaceSentenceEmbedder +from synapse.ml.nn import KNN +``` + +## Step 2: Load Input Data + +This step loads a public dataset and generates extra synthetic rows if requested by the size parameter. + +The loaded dataset has 1000 rows.
If you specify number_of_input_rows in [1..1000], extra rows are cut if needed + +If number_of_input_rows is in [1000..1000000], extra rows are generated using a cross join of the original data + + +```python +file_path = "wasbs://publicwasb@mmlspark.blob.core.windows.net/fine_food_reviews_1k.csv" + +df = spark.read.options(inferSchema="True", delimiter=",", header=True).csv(file_path) +df = df.withColumn( + "data", + F.format_string("Title: %s; Content: %s", F.trim(df.Summary), F.trim(df.Text)), +) + +# Size of DF +number_of_input_rows = 100 + + +# Check that the row count is positive and less than 1M +if number_of_input_rows <= 0 or number_of_input_rows >= 1000000: + raise ValueError(f"Limit is {number_of_input_rows}, which should be less than 1M.") + +if number_of_input_rows > 1000: + + # Cross-join the DataFrame with itself to create n x n pairs for string concatenation (synthetic data) + cross_joined_df = df.crossJoin(df.withColumnRenamed("data", "data_")) + + # Create a new column 'result_vector' by concatenating the two source strings + tmp_df = cross_joined_df.withColumn( + "result_vector", + F.concat(F.col("data"), F.lit(". \n"), F.col("data_")), + ) + + # Select only the necessary column + tmp_df = tmp_df.select("result_vector") + + # Shuffle the DataFrame with a fixed seed so that similar strings are spread apart + seed = 42 + + df = ( + tmp_df.withColumnRenamed("result_vector", "data") + .withColumn("id", F.monotonically_increasing_id()) + .orderBy(F.rand(seed)) + ) + +df = df.limit(number_of_input_rows).repartition(10).cache() + +print(f"Loaded: {number_of_input_rows} rows") +``` + +## Step 3: Generate Embeddings + +We will first generate embeddings using NVIDIA TensorRT optimized SentenceTransformer. 
In the demo you can use two different HF models: intfloat/e5-large-v2 or sentence-transformers/all-MiniLM-L6-v2 + + +```python +# To create the embedder with a different model, uncomment the following line +# embedder = HuggingFaceSentenceEmbedder(modelName="intfloat/e5-large-v2", inputCol="data", outputCol="embeddings", runtime="tensorrt") +embedder = HuggingFaceSentenceEmbedder( + modelName="sentence-transformers/all-MiniLM-L6-v2", + inputCol="data", + outputCol="embeddings", + runtime="tensorrt", +) + +embeddings = embedder.transform(df).select("id", "embeddings").cache() +``` + +## Step 4: Build the query against embeddings + +Get query embeddings running standard SentenceTransformer just on the driver. Convert embedding results to a data frame + + +```python +# Sample query +queries = ["desserts", "disgusting"] +ids = [1, 2] + +# Create DataFrame directly from the data and schema +query_df = spark.createDataFrame( + list(zip(ids, queries)), + StructType( + [ + StructField("id", IntegerType(), nullable=False), + StructField("data", StringType(), nullable=False), + ] + ), +) + +query_embeddings = embedder.transform(query_df).select("id", "embeddings").cache() +``` + +## Step 5: Build a fast vector index over the review embeddings + +We will use the fast NVIDIA Rapids indexer. This KNN implementation will work only on GPU. 
If you want to use CPU then switch to synapse.ml.nn CPU based KNN implementation + + +```python +RUN_ON_GPU = torch.cuda.is_available() +``` + + +```python +if RUN_ON_GPU: + rapids_knn_model = ( + ApproximateNearestNeighbors(k=5) + .setInputCol("embeddings") + .setIdCol("id") + .fit(embeddings) + ) +else: + array_to_vector_udf = udf(lambda array: Vectors.dense(array), VectorUDT()) + df_with_vectors = embeddings.withColumn( + "features", array_to_vector_udf(embeddings["embeddings"]) + ) + knn = ( + KNN() + .setFeaturesCol("features") + .setValuesCol("id") + .setOutputCol("output") + .setK(10) + ) + knn_model = knn.fit(df_with_vectors) +``` + +## Step 6: Find top k Nearest Neighbors ON GPU + +We will use fast ANN [IVFFlat algorithm](https://developer.nvidia.com/blog/accelerated-vector-search-approximating-with-rapids-raft-ivf-flat/) from Rapids + + +```python +if RUN_ON_GPU: + (_, _, knn_df) = rapids_knn_model.kneighbors(query_embeddings) +else: + array_to_vector_udf = udf(lambda array: Vectors.dense(array), VectorUDT()) + df_with_vectors = query_embeddings.withColumn( + "features", array_to_vector_udf(query_embeddings["embeddings"]) + ) + knn_df = knn_model.transform(df_with_vectors) +``` + +## Step 7: Collect and display results + + +```python +if RUN_ON_GPU: + result_df = ( + knn_df.withColumn( + "zipped", F.explode(F.arrays_zip(F.col("indices"), F.col("distances"))) + ) + .select( + F.col("query_id"), + F.col("zipped.indices").alias("id"), + F.col("zipped.distances").alias("distance"), + ) + .join(df, on="id", how="inner") + .select("query_id", "id", "data", "distance") + ) +else: + knn_df = knn_df.withColumnRenamed("data", "original_data") + result_df = ( + knn_df.withColumn("match", F.explode("output")) + .join(df, df["id"] == F.col("match.value")) + .select("original_data", F.col("data"), "match.distance") + ) + +display(result_df) +``` + +# Results + +The goal of this demo is to showcase two acceleration techniques: local (per node) embedding generation and 
approximate KNN. Compared to the original method, which relies on HTTP requests to the OpenAI model and CPU-based KNN, the new approach is significantly more scalable and provides substantial acceleration, especially for large input datasets. + +These are the duration comparison results on 10 T4 GPU nodes for both approaches: + +![KNN Comparison](https://mmlspark.blob.core.windows.net/graphics/Documentation/knn_comparison.png) + + + diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding and GPU based KNN.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding and GPU based KNN.md new file mode 100644 index 0000000000..f98609e2eb --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding and GPU based KNN.md @@ -0,0 +1,202 @@ +--- +title: Quickstart - OpenAI Embedding and GPU based KNN +hide_title: true +status: stable +--- +# Embedding Text with Azure OpenAI and GPU based KNN + +The Azure OpenAI service can be used to solve a large number of natural language tasks through prompting the completion API. To make it easier to scale your prompting workflows from a few examples to large datasets of examples we have integrated the Azure OpenAI service with the distributed machine learning library [Spark Rapids ML](https://github.com/NVIDIA/spark-rapids-ml/). This integration makes it easy to use the [Apache Spark](https://spark.apache.org/) distributed computing framework to process millions of prompts with the OpenAI service. This tutorial shows how to apply large language models to generate embeddings for large datasets of text. This demo is based on the "Quickstart - OpenAI Embedding" notebook, with NVIDIA GPU accelerated KNN. + +**Note**: Running the notebook with the demo dataset (Step 4) will generate the same results as the CPU based “Quickstart - OpenAI Embedding” notebook. 
To see GPU acceleration you need to run queries against a larger set of embeddings. +For example, running a 100K rows dataset will give 6x acceleration and consume less than 10x memory on a 2-node NVIDIA T4 cluster compared to a 2-node AMD EPYC (Rome) CPU cluster. + +## Step 1: Prerequisites + +The key prerequisites for this quickstart include a working Azure OpenAI resource, and an Apache Spark cluster with SynapseML installed. We suggest creating a Synapse workspace, but an Azure Databricks, HDInsight, or Spark on Kubernetes, or even a python environment with the `pyspark` package will work. + +1. An Azure OpenAI resource – request access [here](https://customervoice.microsoft.com/Pages/ResponsePage.aspx?id=v4j5cvGGr0GRqy180BHbR7en2Ais5pxKtso_Pz4b1_xUOFA5Qk1UWDRBMjg0WFhPMkIzTzhKQ1dWNyQlQCN0PWcu) before [creating a resource](https://docs.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource) +1. [Create a Synapse workspace](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-create-workspace) +1. [Create a serverless Apache Spark pool](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-analyze-spark#create-a-serverless-apache-spark-pool) + + +## Step 2: Import this guide as a notebook + +The next step is to add this code into your Spark cluster. You can either create a notebook in your Spark platform and copy the code into this notebook to run the demo, or download the notebook and import it into Synapse Analytics + +1. [Download this demo as a notebook](https://github.com/microsoft/SynapseML/blob/master/notebooks/features/cognitive_services/CognitiveServices%20-%20OpenAI%20Embedding.ipynb) (click Raw, then save the file) +1. 
Import the notebook [into the Synapse Workspace](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-development-using-notebooks#create-a-notebook) or if using Databricks [into the Databricks Workspace](https://docs.microsoft.com/en-us/azure/databricks/notebooks/notebooks-manage#create-a-notebook) +1. Install SynapseML on your cluster. Please see the installation instructions for Synapse at the bottom of [the SynapseML website](https://microsoft.github.io/SynapseML/). Note that this requires pasting an additional cell at the top of the notebook you just imported +3. To run the notebook on Databricks add the related init script (/tools/init_scripts/init-rapidsml-cuda-11.8.sh) +4. Connect your notebook to a cluster and follow along, editing and running the cells below. + +## Step 3: Fill in your service information + +Next, please edit the cell in the notebook to point to your service. In particular set the `service_name`, `deployment_name_embeddings`, and `key` variables to match those for your OpenAI service + + + +```python +from synapse.ml.core.platform import find_secret + +# Fill in the following lines with your service information +# Learn more about selecting which embedding model to choose: https://openai.com/blog/new-and-improved-embedding-model +service_name = "synapseml-openai-2" +deployment_name_embeddings = "text-embedding-ada-002" + +key = find_secret( + secret_name="openai-api-key-2", keyvault="mmlspark-build-keys" +) # please replace this with your key as a string + +assert key is not None and service_name is not None +``` + +## Step 4: Load Data + +In this demo we will explore a dataset of fine food reviews + + +```python +import pyspark.sql.functions as F + +df = ( + spark.read.options(inferSchema="True", delimiter=",", header=True) + .csv("wasbs://publicwasb@mmlspark.blob.core.windows.net/fine_food_reviews_1k.csv") + .repartition(5) +) + +df = df.withColumn( + "combined", + F.format_string("Title: %s; Content: %s", 
F.trim(df.Summary), F.trim(df.Text)), +) + +display(df) +``` + +## Step 5: Generate Embeddings + +We will first generate embeddings for the reviews using the SynapseML OpenAIEmbedding client. + + +```python +from synapse.ml.services.openai import OpenAIEmbedding + +embedding = ( + OpenAIEmbedding() + .setSubscriptionKey(key) + .setDeploymentName(deployment_name_embeddings) + .setCustomServiceName(service_name) + .setTextCol("combined") + .setErrorCol("error") + .setOutputCol("embeddings") +) + +completed_df = embedding.transform(df).cache() +display(completed_df) +``` + +## Step 6: Reduce Embedding dimensionality for Visualization +We reduce the dimensionality to 2 dimensions using t-SNE decomposition. + + +```python +import pandas as pd +from sklearn.manifold import TSNE +import numpy as np + +collected = list(completed_df.collect()) +matrix = np.array([[r["embeddings"]] for r in collected])[:, 0, :].astype(np.float64) +scores = np.array([[r["Score"]] for r in collected]).reshape(-1) + +tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="pca") +vis_dims = tsne.fit_transform(matrix) +vis_dims.shape +``` + +## Step 7: Plot the embeddings + +We now use t-SNE to reduce the dimensionality of the embeddings from 1536 to 2. Once the embeddings are reduced to two dimensions, we can plot them in a 2D scatter plot. We colour each review by its star rating, ranging from red for negative reviews, to green for positive reviews. We can observe a decent data separation even in the reduced 2 dimensions. 
+ + +```python +import matplotlib.pyplot as plt +import matplotlib +import numpy as np + +colors = ["red", "darkorange", "gold", "turquoise", "darkgreen"] +x = [x for x, y in vis_dims] +y = [y for x, y in vis_dims] +color_indices = scores - 1 + +colormap = matplotlib.colors.ListedColormap(colors) +plt.scatter(x, y, c=color_indices, cmap=colormap, alpha=0.3) +for score in [0, 1, 2, 3, 4]: + avg_x = np.array(x)[scores - 1 == score].mean() + avg_y = np.array(y)[scores - 1 == score].mean() + color = colors[score] + plt.scatter(avg_x, avg_y, marker="x", color=color, s=100) + +plt.title("Amazon ratings visualized in language using t-SNE") +``` + +## Step 8: Build the query against embeddings + +Note: The data types of the ID columns in the document and query dataframes should be the same. For some OpenAI models, users should use separate models for embedding documents and queries. These models are denoted by the "-doc" and "-query" suffixes respectively. + + +```python +from pyspark.ml import PipelineModel + +embedding_query = ( + OpenAIEmbedding() + .setSubscriptionKey(key) + .setDeploymentName(deployment_name_embeddings) + .setCustomServiceName(service_name) + .setTextCol("query") + .setErrorCol("error") + .setOutputCol("embeddings") +) + +query_df = ( + spark.createDataFrame( + [ + ( + 0, + "desserts", + ), + ( + 1, + "disgusting", + ), + ] + ) + .toDF("id", "query") + .withColumn("id", F.col("id").cast("int")) +) + +embedding_query_df = ( + embedding_query.transform(query_df).select("id", "embeddings").cache() +) +``` + +## Step 9: Fit KNN model +Build KNN model using fit method + + +```python +from spark_rapids_ml.knn import NearestNeighbors + +rapids_knn = NearestNeighbors(k=10) +rapids_knn.setInputCol("embeddings").setIdCol("id") + +rapids_knn_model = rapids_knn.fit(completed_df.select("id", "embeddings")) +``` + +## Step 10: Retrieve query results +Find k nearest neighbors + + +```python +(_, _, knn_df) = rapids_knn_model.kneighbors(embedding_query_df) + 
+display(knn_df) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding.md new file mode 100644 index 0000000000..c6ff048e18 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding.md @@ -0,0 +1,207 @@ +--- +title: Quickstart - OpenAI Embedding +hide_title: true +status: stable +--- +# Embedding Text with Azure OpenAI + +The Azure OpenAI service can be used to solve a large number of natural language tasks through prompting the completion API. To make it easier to scale your prompting workflows from a few examples to large datasets of examples we have integrated the Azure OpenAI service with the distributed machine learning library [SynapseML](https://www.microsoft.com/en-us/research/blog/synapseml-a-simple-multilingual-and-massively-parallel-machine-learning-library/). This integration makes it easy to use the [Apache Spark](https://spark.apache.org/) distributed computing framework to process millions of prompts with the OpenAI service. This tutorial shows how to apply large language models to generate embeddings for large datasets of text. + +## Step 1: Prerequisites + +The key prerequisites for this quickstart include a working Azure OpenAI resource, and an Apache Spark cluster with SynapseML installed. We suggest creating a Synapse workspace, but an Azure Databricks, HDInsight, or Spark on Kubernetes, or even a python environment with the `pyspark` package will work. + +1. An Azure OpenAI resource – request access [here](https://customervoice.microsoft.com/Pages/ResponsePage.aspx?id=v4j5cvGGr0GRqy180BHbR7en2Ais5pxKtso_Pz4b1_xUOFA5Qk1UWDRBMjg0WFhPMkIzTzhKQ1dWNyQlQCN0PWcu) before [creating a resource](https://docs.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource) +1. 
[Create a Synapse workspace](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-create-workspace) +1. [Create a serverless Apache Spark pool](https://docs.microsoft.com/en-us/azure/synapse-analytics/get-started-analyze-spark#create-a-serverless-apache-spark-pool) + + +## Step 2: Import this guide as a notebook + +The next step is to add this code into your Spark cluster. You can either create a notebook in your Spark platform and copy the code into this notebook to run the demo, or download the notebook and import it into Synapse Analytics + +1. [Download this demo as a notebook](https://github.com/microsoft/SynapseML/blob/master/notebooks/features/cognitive_services/CognitiveServices%20-%20OpenAI%20Embedding.ipynb) (click Raw, then save the file) +1. Import the notebook [into the Synapse Workspace](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-development-using-notebooks#create-a-notebook) or if using Databricks [into the Databricks Workspace](https://docs.microsoft.com/en-us/azure/databricks/notebooks/notebooks-manage#create-a-notebook) +1. Install SynapseML on your cluster. Please see the installation instructions for Synapse at the bottom of [the SynapseML website](https://microsoft.github.io/SynapseML/). Note that this requires pasting an additional cell at the top of the notebook you just imported +3. Connect your notebook to a cluster and follow along, editing and running the cells below. + +## Step 3: Fill in your service information + +Next, please edit the cell in the notebook to point to your service. 
In particular set the `service_name`, `deployment_name_embeddings`, and `key` variables to match those for your OpenAI service: + + +```python +from synapse.ml.core.platform import find_secret + +# Fill in the following lines with your service information +# Learn more about selecting which embedding model to choose: https://openai.com/blog/new-and-improved-embedding-model +service_name = "synapseml-openai-2" +deployment_name_embeddings = "text-embedding-ada-002" + +key = find_secret( + secret_name="openai-api-key-2", keyvault="mmlspark-build-keys" +) # please replace this with your key as a string + +assert key is not None and service_name is not None +``` + +## Step 4: Load Data + +In this demo we will explore a dataset of fine food reviews + + +```python +import pyspark.sql.functions as F + +df = ( + spark.read.options(inferSchema="True", delimiter=",", header=True) + .csv("wasbs://publicwasb@mmlspark.blob.core.windows.net/fine_food_reviews_1k.csv") + .repartition(5) +) + +df = df.withColumn( + "combined", + F.format_string("Title: %s; Content: %s", F.trim(df.Summary), F.trim(df.Text)), +) + +display(df) +``` + +## Step 5: Generate Embeddings + +We will first generate embeddings for the reviews using the SynapseML OpenAIEmbedding client. + + +```python +from synapse.ml.services.openai import OpenAIEmbedding + +embedding = ( + OpenAIEmbedding() + .setSubscriptionKey(key) + .setDeploymentName(deployment_name_embeddings) + .setCustomServiceName(service_name) + .setTextCol("combined") + .setErrorCol("error") + .setOutputCol("embeddings") +) + +completed_df = embedding.transform(df).cache() +display(completed_df) +``` + +## Step 6: Reduce Embedding dimensionality for Visualization +We reduce the dimensionality to 2 dimensions using t-SNE decomposition. 
+ + +```python +import pandas as pd +from sklearn.manifold import TSNE +import numpy as np + +collected = list(completed_df.collect()) +matrix = np.array([[r["embeddings"]] for r in collected])[:, 0, :].astype(np.float64) +scores = np.array([[r["Score"]] for r in collected]).reshape(-1) + +tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="pca") +vis_dims = tsne.fit_transform(matrix) +vis_dims.shape +``` + +## Step 7: Plot the embeddings + +We now use t-SNE to reduce the dimensionality of the embeddings from 1536 to 2. Once the embeddings are reduced to two dimensions, we can plot them in a 2D scatter plot. We colour each review by its star rating, ranging from red for negative reviews, to green for positive reviews. We can observe a decent data separation even in the reduced 2 dimensions. + + +```python +import matplotlib.pyplot as plt +import matplotlib +import numpy as np + +colors = ["red", "darkorange", "gold", "turquoise", "darkgreen"] +x = [x for x, y in vis_dims] +y = [y for x, y in vis_dims] +color_indices = scores - 1 + +colormap = matplotlib.colors.ListedColormap(colors) +plt.scatter(x, y, c=color_indices, cmap=colormap, alpha=0.3) +for score in [0, 1, 2, 3, 4]: + avg_x = np.array(x)[scores - 1 == score].mean() + avg_y = np.array(y)[scores - 1 == score].mean() + color = colors[score] + plt.scatter(avg_x, avg_y, marker="x", color=color, s=100) + +plt.title("Amazon ratings visualized in language using t-SNE") +``` + +## Step 8: Build a fast vector index over the review embeddings + +We will use SynapseML's KNN estimator to build a fast cosine-similarity retrieval engine. + + +```python +from synapse.ml.nn import * + +knn = ( + KNN() + .setFeaturesCol("embeddings") + .setValuesCol("id") + .setOutputCol("output") + .setK(10) +) + +knn_index = knn.fit(completed_df) +``` + +## Step 9: Build the retrieval model pipeline + +Note: The data types of the ID columns in the document and query dataframes should be the same. 
For some OpenAI models, users should use separate models for embedding documents and queries. These models are denoted by the "-doc" and "-query" suffixes respectively. + + +```python +from pyspark.ml import PipelineModel + +embedding_query = ( + OpenAIEmbedding() + .setSubscriptionKey(key) + .setDeploymentName(deployment_name_embeddings) + .setCustomServiceName(service_name) + .setTextCol("query") + .setErrorCol("error") + .setOutputCol("embeddings") +) + +retrieval_model = PipelineModel(stages=[embedding_query, knn_index]) +``` + +## Step 10: Retrieve results + + +```python +query_df = ( + spark.createDataFrame( + [ + ( + 0, + "desserts", + ), + ( + 1, + "disgusting", + ), + ] + ) + .toDF("id", "query") + .withColumn("id", F.col("id").cast("int")) +) + + +df_matches = retrieval_model.transform(query_df).cache() + +df_result = ( + df_matches.withColumn("match", F.explode("output")) + .join(df, df["id"] == F.col("match.value")) + .select("query", F.col("combined"), "match.distance") +) + +display(df_result) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.md new file mode 100644 index 0000000000..88de204189 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.md @@ -0,0 +1,376 @@ +--- +title: Quickstart - Understand and Search Forms +hide_title: true +status: stable +--- +# Tutorial: Create a custom search engine and question-answering system + +In this tutorial, learn how to index and query large data loaded from a Spark cluster. 
You set up a Jupyter Notebook that performs the following actions: + +> + Load various forms (invoices) into a data frame in an Apache Spark session +> + Analyze them to determine their features +> + Assemble the resulting output into a tabular data structure +> + Write the output to a search index hosted in Azure Cognitive Search +> + Explore and query over the content you created + +## 1 - Set up dependencies + +We start by importing packages and connecting to the Azure resources used in this workflow. + + +```python +%pip install openai==0.28.1 +``` + + +```python +from synapse.ml.core.platform import find_secret + +cognitive_key = find_secret( + secret_name="ai-services-api-key", keyvault="mmlspark-build-keys" +) # Replace the call to find_secret with your key as a python string. e.g. cognitive_key="27snaiw..." +cognitive_location = "eastus" + +translator_key = find_secret( + secret_name="translator-key", keyvault="mmlspark-build-keys" +) # Replace the call to find_secret with your key as a python string. +translator_location = "eastus" + +search_key = find_secret( + secret_name="azure-search-key", keyvault="mmlspark-build-keys" +) # Replace the call to find_secret with your key as a python string. +search_service = "mmlspark-azure-search" +search_index = "form-demo-index-5" + +openai_key = find_secret( + secret_name="openai-api-key-2", keyvault="mmlspark-build-keys" +) # Replace the call to find_secret with your key as a python string. +openai_service_name = "synapseml-openai-2" +openai_deployment_name = "gpt-35-turbo" +openai_url = f"https://{openai_service_name}.openai.azure.com/" +``` + +## 2 - Load data into Spark + +This code loads a few external files from an Azure storage account that's used for demo purposes. The files are various invoices, and they're read into a data frame. 
+ + +```python +from pyspark.sql.functions import udf +from pyspark.sql.types import StringType + + +def blob_to_url(blob): + [prefix, postfix] = blob.split("@") + container = prefix.split("/")[-1] + split_postfix = postfix.split("/") + account = split_postfix[0] + filepath = "/".join(split_postfix[1:]) + return "https://{}/{}/{}".format(account, container, filepath) + + +df2 = ( + spark.read.format("binaryFile") + .load("wasbs://publicwasb@mmlspark.blob.core.windows.net/form_subset/*") + .select("path") + .limit(10) + .select(udf(blob_to_url, StringType())("path").alias("url")) + .cache() +) + +display(df2) +``` + + + +## 3 - Apply form recognition + +This code loads the [AnalyzeInvoices transformer](https://microsoft.github.io/SynapseML/docs/Explore%20Algorithms/AI%20Services/Overview/#form-recognizer) and passes a reference to the data frame containing the invoices. It calls the pre-built invoice model of Azure Forms Analyzer. + + +```python +from synapse.ml.services.form import AnalyzeInvoices + +analyzed_df = ( + AnalyzeInvoices() + .setSubscriptionKey(cognitive_key) + .setLocation(cognitive_location) + .setImageUrlCol("url") + .setOutputCol("invoices") + .setErrorCol("errors") + .setConcurrency(5) + .transform(df2) + .cache() +) + +display(analyzed_df) +``` + +## 4 - Simplify form recognition output + +This code uses the [FormOntologyLearner](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.services.form.html#module-synapse.ml.services.form.FormOntologyTransformer), a transformer that analyzes the output of Form Recognizer transformers (for Azure AI Document Intelligence) and infers a tabular data structure. The output of AnalyzeInvoices is dynamic and varies based on the features detected in your content. + +FormOntologyLearner extends the utility of the AnalyzeInvoices transformer by looking for patterns that can be used to create a tabular data structure. 
Organizing the output into multiple columns and rows makes for simpler downstream analysis. + + +```python +from synapse.ml.services.form import FormOntologyLearner + +organized_df = ( + FormOntologyLearner() + .setInputCol("invoices") + .setOutputCol("extracted") + .fit(analyzed_df) + .transform(analyzed_df) + .select("url", "extracted.*") + .cache() +) + +display(organized_df) +``` + +With our nice tabular dataframe, we can flatten the nested tables found in the forms with some SparkSQL + + +```python +from pyspark.sql.functions import explode, col + +itemized_df = ( + organized_df.select("*", explode(col("Items")).alias("Item")) + .drop("Items") + .select("Item.*", "*") + .drop("Item") +) + +display(itemized_df) +``` + +## 5 - Add translations + +This code loads [Translate](https://microsoft.github.io/SynapseML/docs/Explore%20Algorithms/AI%20Services/Overview/#translation), a transformer that calls the Azure AI Translator service in Azure AI services. The original text, which is in English in the "Description" column, is machine-translated into various languages. All of the output is consolidated into "output.translations" array. + + +```python +from synapse.ml.services.translate import Translate + +translated_df = ( + Translate() + .setSubscriptionKey(translator_key) + .setLocation(translator_location) + .setTextCol("Description") + .setErrorCol("TranslationError") + .setOutputCol("output") + .setToLanguage(["zh-Hans", "fr", "ru", "cy"]) + .setConcurrency(5) + .transform(itemized_df) + .withColumn("Translations", col("output.translations")[0]) + .drop("output", "TranslationError") + .cache() +) + +display(translated_df) +``` + +## 6 - Translate products to emojis with OpenAI 🤯 + + +```python +from synapse.ml.services.openai import OpenAIPrompt +from pyspark.sql.functions import trim, split + +emoji_template = """ + Your job is to translate item names into emoji. 
Do not add anything but the emoji and end the translation with a comma + + Two Ducks: 🦆🦆, + Light Bulb: 💡, + Three Peaches: 🍑🍑🍑, + Two kitchen stoves: ♨️♨️, + A red car: 🚗, + A person and a cat: 🧍🐈, + A {Description}: """ + +prompter = ( + OpenAIPrompt() + .setSubscriptionKey(openai_key) + .setDeploymentName(openai_deployment_name) + .setUrl(openai_url) + .setMaxTokens(5) + .setPromptTemplate(emoji_template) + .setErrorCol("error") + .setOutputCol("Emoji") +) + +emoji_df = ( + prompter.transform(translated_df) + .withColumn("Emoji", trim(split(col("Emoji"), ",").getItem(0))) + .drop("error", "prompt") + .cache() +) +``` + + +```python +display(emoji_df.select("Description", "Emoji")) +``` + +## 7 - Infer vendor address continent with OpenAI + + +```python +continent_template = """ +Which continent does the following address belong to? + +Pick one value from Europe, Australia, North America, South America, Asia, Africa, Antarctica. + +Dont respond with anything but one of the above. If you don't know the answer or cannot figure it out from the text, return None. End your answer with a comma. 
+ +Address: "6693 Ryan Rd, North Whales", +Continent: Europe, +Address: "6693 Ryan Rd", +Continent: None, +Address: "{VendorAddress}", +Continent:""" + +continent_df = ( + prompter.setOutputCol("Continent") + .setPromptTemplate(continent_template) + .transform(emoji_df) + .withColumn("Continent", trim(split(col("Continent"), ",").getItem(0))) + .drop("error", "prompt") + .cache() +) +``` + + +```python +display(continent_df.select("VendorAddress", "Continent")) +``` + +## 8 - Create an Azure Search Index for the Forms + + +```python +from synapse.ml.services import * +from pyspark.sql.functions import monotonically_increasing_id, lit + +( + continent_df.withColumn("DocID", monotonically_increasing_id().cast("string")) + .withColumn("SearchAction", lit("upload")) + .writeToAzureSearch( + subscriptionKey=search_key, + actionCol="SearchAction", + serviceName=search_service, + indexName=search_index, + keyCol="DocID", + ) +) +``` + +## 9 - Try out a search query + + +```python +import requests + +search_url = "https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06".format( + search_service, search_index +) +requests.post( + search_url, json={"search": "door"}, headers={"api-key": search_key} +).json() +``` + +## 10 - Build a chatbot that can use Azure Search as a tool 🧠🔧 + + + + +```python +import json +import openai + +openai.api_type = "azure" +openai.api_base = openai_url +openai.api_key = openai_key +openai.api_version = "2023-03-15-preview" + +chat_context_prompt = f""" +You are a chatbot designed to answer questions with the help of a search engine that has the following information: + +{continent_df.columns} + +If you dont know the answer to a question say "I dont know". Do not lie or hallucinate information. Be brief. 
If you need to use the search engine to solve the please output a json in the form of {{"query": "example_query"}} +""" + + +def search_query_prompt(question): + return f""" +Given the search engine above, what would you search for to answer the following question? + +Question: "{question}" + +Please output a json in the form of {{"query": "example_query"}} +""" + + +def search_result_prompt(query): + search_results = requests.post( + search_url, json={"search": query}, headers={"api-key": search_key} + ).json() + return f""" + +You previously ran a search for "{query}" which returned the following results: + +{search_results} + +You should use the results to help you answer questions. If you dont know the answer to a question say "I dont know". Do not lie or hallucinate information. Be Brief and mention which query you used to solve the problem. +""" + + +def prompt_gpt(messages): + response = openai.ChatCompletion.create( + engine=openai_deployment_name, messages=messages, max_tokens=None, top_p=0.95 + ) + return response["choices"][0]["message"]["content"] + + +def custom_chatbot(question): + while True: + try: + query = json.loads( + prompt_gpt( + [ + {"role": "system", "content": chat_context_prompt}, + {"role": "user", "content": search_query_prompt(question)}, + ] + ) + )["query"] + + return prompt_gpt( + [ + {"role": "system", "content": chat_context_prompt}, + {"role": "system", "content": search_result_prompt(query)}, + {"role": "user", "content": question}, + ] + ) + except Exception as e: + raise e +``` + +## 11 - Asking our chatbot a question + + +```python +custom_chatbot("What did Luke Diaz buy?") +``` + +## 12 - A quick double check + + +```python +display( + continent_df.where(col("CustomerName") == "Luke Diaz") + .select("Description") + .distinct() +) +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenCV/Image Transformations.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenCV/Image Transformations.md 
new file mode 100644 index 0000000000..5421d182d0 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/OpenCV/Image Transformations.md @@ -0,0 +1,119 @@ +--- +title: Image Transformations +hide_title: true +status: stable +--- +## OpenCV - Pipeline Image Transformations + +This example shows how to manipulate the collection of images. +First, the images are downloaded to the local directory. +Second, they are copied to your cluster's attached HDFS. + +The images are loaded from the directory (for fast prototyping, consider loading a fraction of +images). Inside the dataframe, each image is a single field in the image column. The image has +sub-fields (path, height, width, OpenCV type and OpenCV bytes). + + +```python +from synapse.ml.opencv import toNDArray +from synapse.ml.io import * + +imageDir = "wasbs://publicwasb@mmlspark.blob.core.windows.net/sampleImages" +images = spark.read.image().load(imageDir).cache() +images.printSchema() +print(images.count()) +``` + +We can also alternatively stream the images with a similar api. +Check the [Structured Streaming Programming Guide](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) +for more details on streaming. + +When collected from the *DataFrame*, the image data are stored in a *Row*, which is Spark's way +to represent structures (in the current example, each dataframe row has a single Image, which +itself is a Row). It is possible to address image fields by name and use `toNDArray()` helper +function to convert the image into numpy array for further manipulations. 
+ + +```python +from synapse.ml.core.platform import running_on_binder + +if running_on_binder(): + from IPython import get_ipython +from PIL import Image +import matplotlib.pyplot as plt + +data = images.take(3) # take first three rows of the dataframe +im = data[2][0] # the image is in the first column of a given row + +print("image type: {}, number of fields: {}".format(type(im), len(im))) +print("image path: {}".format(im.origin)) +print("height: {}, width: {}, OpenCV type: {}".format(im.height, im.width, im.mode)) + +arr = toNDArray(im) # convert to numpy array +print(images.count()) +plt.imshow(Image.fromarray(arr, "RGB")) # display the image inside notebook +``` + +Use `ImageTransformer` for the basic image manipulation: resizing, cropping, etc. +Internally, operations are pipelined and backed by OpenCV implementation. + + +```python +from synapse.ml.opencv import ImageTransformer + +tr = ( + ImageTransformer() # images are resized and then cropped + .setOutputCol("transformed") + .resize(size=(200, 200)) + .crop(0, 0, height=180, width=180) +) + +small = tr.transform(images).select("transformed") + +im = small.take(3)[2][0] # take third image +plt.imshow(Image.fromarray(toNDArray(im), "RGB")) # display the image inside notebook +``` + +For the advanced image manipulations, use Spark UDFs. +The SynapseML package provides conversion function between *Spark Row* and +*ndarray* image representations. 
+ + +```python +from pyspark.sql.functions import udf +from synapse.ml.opencv import ImageSchema, toNDArray, toImage + + +def u(row): + array = toNDArray(row) # convert Image to numpy ndarray[height, width, 3] + array[:, :, 2] = 0 + return toImage(array) # numpy array back to Spark Row structure + + +noBlueUDF = udf(u, ImageSchema) + +noblue = small.withColumn("noblue", noBlueUDF(small["transformed"])).select("noblue") + +im = noblue.take(3)[2][0] # take third image +plt.imshow(Image.fromarray(toNDArray(im), "RGB")) # display the image inside notebook +``` + +Images could be unrolled into the dense 1D vectors suitable for CNTK evaluation. + + +```python +from synapse.ml.image import UnrollImage + +unroller = UnrollImage().setInputCol("noblue").setOutputCol("unrolled") + +unrolled = unroller.transform(noblue).select("unrolled") + +vector = unrolled.take(1)[0][0] +print(type(vector)) +len(vector.toArray()) +``` + + +```python + +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Cyber ML.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Cyber ML.md new file mode 100644 index 0000000000..fbe0144764 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Cyber ML.md @@ -0,0 +1,82 @@ +--- +title: CyberML +hide_title: true +sidebar_label: CyberML +--- + +# CyberML + +## access anomalies: [complement_access.py](https://github.com/microsoft/SynapseML/blob/master/core/src/main/python/synapse/ml/cyber/anomaly/complement_access.py) +- [Talk at European Spark Conference 2019](https://databricks.com/session_eu19/cybermltoolkit-anomaly-detection-as-a-scalable-generic-service-over-apache-spark) +- [(Internal Microsoft) Talk at MLADS November 2018](https://resnet.microsoft.com/video/42395) +- [(Internal Microsoft) Talk at MLADS June 2019](https://resnet.microsoft.com/video/43618) + +1.
[ComplementAccessTransformer](https://github.com/microsoft/SynapseML/blob/master/core/src/main/python/synapse/ml/cyber/anomaly/complement_access.py) + is a SparkML [Transformer](https://spark.apache.org/docs/2.2.0/api/java/index.html?org/apache/spark/ml/Transformer.html). + Given a dataframe, it returns a new dataframe comprised of access patterns sampled from + the set of possible access patterns not present in the original dataframe. + In other words, it returns a sample from the complement set. + +## feature engineering: [indexers.py](https://github.com/microsoft/SynapseML/blob/master/core/src/main/python/synapse/ml/cyber/feature/indexers.py) +1. [IdIndexer](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.cyber.feature.html#synapse.ml.cyber.feature.indexers.IdIndexer) + is a SparkML [Estimator](https://spark.apache.org/docs/2.2.0/api/java/index.html?org/apache/spark/ml/Estimator.html). + Given a dataframe, it creates an IdIndexerModel (described next) for categorical features. The model + maps each partition and column seen in the given dataframe to an ID, + either independently for each partition or in one consecutive range for all partition and column values. +2. [IdIndexerModel](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.cyber.feature.html#synapse.ml.cyber.feature.indexers.IdIndexerModel) + is a SparkML [Transformer](https://spark.apache.org/docs/2.2.0/api/java/index.html?org/apache/spark/ml/Transformer.html). + Given a dataframe, it maps each partition and column field to a consecutive integer ID. + Partitions or column values not encountered in the estimator are mapped to 0. + The model can operate in two modes: either create consecutive integer IDs independently for each partition, or use one consecutive range for all partition and column values. +3. [MultiIndexer](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.cyber.feature.html#synapse.ml.cyber.feature.indexers.MultiIndexer) + is a SparkML [Estimator](https://spark.apache.org/docs/2.2.0/api/java/index.html?org/apache/spark/ml/Estimator.html).
+ Uses multiple IdIndexers to generate a MultiIndexerModel (described next) for categorical features. The model + contains multiple IdIndexers for multiple partitions and columns. +4. [MultiIndexerModel](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.cyber.feature.html#synapse.ml.cyber.feature.indexers.MultiIndexerModel) + is a SparkML [Transformer](https://spark.apache.org/docs/2.2.0/api/java/index.html?org/apache/spark/ml/Transformer.html). + Given a dataframe, it maps each partition and column field to a consecutive integer ID. + Partitions or column values not encountered in the estimator are mapped to 0. + The model can operate in two modes: either create consecutive integer IDs independently for each partition, or use one consecutive range for all partition and column values. + +## feature engineering: [scalers.py](https://github.com/microsoft/SynapseML/blob/master/core/src/main/python/synapse/ml/cyber/feature/scalers.py) +1. [StandardScalarScaler](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.cyber.feature.html#synapse.ml.cyber.feature.scalers.StandardScalarScaler) + is a SparkML [Estimator](https://spark.apache.org/docs/2.2.0/api/java/index.html?org/apache/spark/ml/Estimator.html). + Given a dataframe it creates a StandardScalarScalerModel (described next) which normalizes + any given dataframe according to the mean and standard deviation calculated on the + dataframe given to the estimator. +2. [StandardScalarScalerModel](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.cyber.feature.html#synapse.ml.cyber.feature.scalers.StandardScalarScalerModel) + is a SparkML [Transformer](https://spark.apache.org/docs/2.2.0/api/java/index.html?org/apache/spark/ml/Transformer.html). + Given a dataframe with a value column x, the transformer changes its value as follows: + x'=(x-mean)/stddev. That is, if the transformer is given the same dataframe the estimator + was given then the value column will have a mean of 0.0 and a standard deviation of 1.0. +3.
[LinearScalarScaler](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.cyber.feature.html#synapse.ml.cyber.feature.scalers.LinearScalarScaler) + is a SparkML [Estimator](https://spark.apache.org/docs/2.2.0/api/java/index.html?org/apache/spark/ml/Estimator.html). + Given a dataframe it creates a LinearScalarScalerModel (described next) which normalizes + any given dataframe according to the minimum and maximum values calculated on the + dataframe given to the estimator. +4. [LinearScalarScalerModel](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.cyber.feature.html#synapse.ml.cyber.feature.scalers.LinearScalarScalerModel) + is a SparkML [Transformer](https://spark.apache.org/docs/2.2.0/api/java/index.html?org/apache/spark/ml/Transformer.html). + Given a dataframe with a value column x, the transformer changes its value such that + if the transformer is given the same dataframe the estimator + was given then the value column will be scaled linearly to the given ranges. + +## access anomalies: [collaborative_filtering.py](https://github.com/microsoft/SynapseML/blob/master/core/src/main/python/synapse/ml/cyber/anomaly/collaborative_filtering.py) +1. [AccessAnomaly](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.cyber.anomaly.html#synapse.ml.cyber.anomaly.collaborative_filtering.AccessAnomaly) + is a SparkML [Estimator](https://spark.apache.org/docs/2.2.0/api/java/index.html?org/apache/spark/ml/Estimator.html). + Given a dataframe, the estimator generates an AccessAnomalyModel (described next). The model + can detect anomalous access of users to resources where the access + is outside of the user's or resource's profile. For instance, a user from HR accessing + a resource from Finance. This result is based solely on access patterns rather than explicit features. + Internally, the code is based on Collaborative Filtering as implemented in Spark, using + Matrix Factorization with Alternating Least Squares. +2.
[AccessAnomalyModel](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.cyber.anomaly.html#synapse.ml.cyber.anomaly.collaborative_filtering.AccessAnomalyModel) + is a SparkML [Transformer](https://spark.apache.org/docs/2.2.0/api/java/index.html?org/apache/spark/ml/Transformer.html). + Given a dataframe the transformer computes a value between (-inf, inf) where positive + values indicate an anomaly score. Anomaly scores are computed to have a mean of 0.0 + and a standard deviation of 1.0 over the original dataframe given to the estimator. +3. [ModelNormalizeTransformer](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.cyber.anomaly.html#synapse.ml.cyber.anomaly.collaborative_filtering.ModelNormalizeTransformer) + is a SparkML [Transformer](https://spark.apache.org/docs/2.2.0/api/java/index.html?org/apache/spark/ml/Transformer.html). + This transformer is used internally by AccessAnomaly to normalize a model to generate + anomaly scores with mean 0.0 and standard deviation of 1.0. +4. [AccessAnomalyConfig](https://mmlspark.blob.core.windows.net/docs/1.0.7/pyspark/synapse.ml.cyber.anomaly.html#synapse.ml.cyber.anomaly.collaborative_filtering.AccessAnomalyConfig) + contains the default values for AccessAnomaly.
diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Quickstart - Anomalous Access Detection.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Quickstart - Anomalous Access Detection.md new file mode 100644 index 0000000000..8fa5224bdb --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Quickstart - Anomalous Access Detection.md @@ -0,0 +1,348 @@ +--- +title: Quickstart - Anomalous Access Detection +hide_title: true +status: stable +--- +# CyberML - Anomalous Access Detection + +Here we demonstrate a novel CyberML model which can learn user access patterns and then automatically detect anomalous user access based on learned behavior. +The model internally uses Collaborative Filtering for Implicit Feedback as published here: http://yifanhu.net/PUB/cf.pdf +and is based on Apache Spark's implementation of this: https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html. + +This notebook demonstrates a usage example of Anomalous Resource Access model. +All the model requires is a dataset in which there are 'users' which access 'resources'. +The model is based on Collaborative Filtering and it uses Machine Learning to learn access patterns of users and resources. +When a user accesses a resource which is outside of the user's learned profile then this access receives a high anomaly score. + +In this notebook we provide a usage example and a synthetic dataset in which there are 3 departments: +(1) Finance, (2) HR and (3) Engineering. +In the training data users access only a subset of resources from their own departments. +To evaluate the model we use two datasets. +The first contains access patterns unseen during training in which users access resources within their departments (again, resources they didn't access during training but within their department). +The latter contains users accessing resources from outside their department. 
+We then use the model to assign anomaly scores expecting that the first get low anomaly scores and the latter receive high anomaly scores. +This is what this example demonstrates. + +Note: the data does NOT contain information about departments, this information is implicitly learned by the model by analyzing the access patterns. + +# Create an Azure Databricks cluster and install the following libs + +1. In Cluster Libraries install from library source Maven: +Coordinates: com.microsoft.azure:synapseml_2.12:1.0.7 +Repository: https://mmlspark.azureedge.net/maven + +2. In Cluster Libraries install from PyPI the library called plotly + +# Setup & Initialization + + +```python +%pip install plotly +``` + + +```python +# this is used to produce the synthetic dataset for this test +from synapse.ml.cyber.dataset import DataFactory +from synapse.ml.cyber.anomaly.collaborative_filtering import AccessAnomaly +from pyspark.sql import functions as f +``` + +# Load up datasets + + +```python +from synapse.ml.core.platform import running_on_databricks, running_on_synapse + +if running_on_databricks(): + spark.sparkContext.setCheckpointDir("dbfs:/checkpoint_path/") +else: + spark.sparkContext.setCheckpointDir("./tmp/checkpoint_path/") + +factory = DataFactory( + num_hr_users=25, + num_hr_resources=50, + num_fin_users=35, + num_fin_resources=75, + num_eng_users=15, + num_eng_resources=25, + single_component=True, +) + +training_pdf = factory.create_clustered_training_data(ratio=0.4) + +# a tenant id is used when independent datasets originate from different tenants, in this example we set all tenants-ids to the same value +training_df = spark.createDataFrame(training_pdf).withColumn("tenant_id", f.lit(0)) +ingroup_df = spark.createDataFrame( + factory.create_clustered_intra_test_data(training_pdf) +).withColumn("tenant_id", f.lit(0)) +outgroup_df = spark.createDataFrame( + factory.create_clustered_inter_test_data() +).withColumn("tenant_id", f.lit(0)) +``` + + +```python 
+training_df.show() +``` + + +```python +print(training_df.count()) +print(ingroup_df.count()) +print(outgroup_df.count()) +``` + +# Model setup & training + + +```python +access_anomaly = AccessAnomaly( + tenantCol="tenant_id", + userCol="user", + resCol="res", + likelihoodCol="likelihood", + maxIter=200, +) +``` + + +```python +model = access_anomaly.fit(training_df) +``` + +# Apply model & show result stats + + +```python +ingroup_scored_df = model.transform(ingroup_df) +``` + + +```python +ingroup_scored_df.agg( + f.min("anomaly_score").alias("min_anomaly_score"), + f.max("anomaly_score").alias("max_anomaly_score"), + f.mean("anomaly_score").alias("mean_anomaly_score"), + f.stddev("anomaly_score").alias("stddev_anomaly_score"), +).show() +``` + + +```python +outgroup_scored_df = model.transform(outgroup_df) +``` + + +```python +outgroup_scored_df.agg( + f.min("anomaly_score").alias("min_anomaly_score"), + f.max("anomaly_score").alias("max_anomaly_score"), + f.mean("anomaly_score").alias("mean_anomaly_score"), + f.stddev("anomaly_score").alias("stddev_anomaly_score"), +).show() +``` + +# Examine results + + +```python +# +# Select a subset of results to send to Log Analytics +# + +full_res_df = outgroup_scored_df.orderBy(f.desc("anomaly_score")).cache() + +from pyspark.sql.window import Window + +w = Window.partitionBy("tenant_id", "user", "res").orderBy(f.desc("anomaly_score")) + +# select values above threshold +results_above_threshold = full_res_df.filter(full_res_df.anomaly_score > 1.0) + +# get distinct resource/user and corresponding timestamp and highest score +results_to_la = ( + results_above_threshold.withColumn("index", f.row_number().over(w)) + .orderBy(f.desc("anomaly_score")) + .select("tenant_id", f.col("user"), f.col("res"), "anomaly_score") + .where("index == 1") + .limit(100) + .cache() +) + +# add a fake timestamp to the results +results_to_la = results_to_la.withColumn("timestamp", f.current_timestamp()) + +display(results_to_la) +``` + +# 
Display all resource accesses by users with highest anomalous score + + +```python +from plotly import __version__ +from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot, offline + +import numpy as np +import pandas as pd + +print(__version__) # requires version >= 1.9.0 + +# run plotly in offline mode +offline.init_notebook_mode() +``` + + +```python +# Find all server accesses of users with high predicted scores +# For display, limit to top 25 results +results_to_display = results_to_la.orderBy(f.desc("anomaly_score")).limit(25).cache() +interesting_records = full_res_df.join(results_to_display, ["user"], "left_semi") +non_anomalous_records = interesting_records.join( + results_to_display, ["user", "res"], "left_anti" +) + +top_non_anomalous_records = ( + non_anomalous_records.groupBy("tenant_id", "user", "res") + .agg( + f.count("*").alias("count"), + ) + .select(f.col("tenant_id"), f.col("user"), f.col("res"), "count") +) + +# pick only a subset of non-anomalous record for UI +w = Window.partitionBy( + "tenant_id", + "user", +).orderBy(f.desc("count")) + +# pick top non-anomalous set +top_non_anomalous_accesses = ( + top_non_anomalous_records.withColumn("index", f.row_number().over(w)) + .orderBy(f.desc("count")) + .select("tenant_id", f.col("user"), f.col("res"), f.col("count")) + .where("index in (1,2,3,4,5)") + .limit(25) +) + +# add back anomalous record +fileShare_accesses = ( + top_non_anomalous_accesses.select("user", "res", "count") + .union(results_to_display.select("user", "res", f.lit(1).alias("count"))) + .cache() +) +``` + + +```python +# get unique users and file shares +high_scores_df = fileShare_accesses.toPandas() +unique_arr = np.append(high_scores_df.user.unique(), high_scores_df.res.unique()) + +unique_df = pd.DataFrame(data=unique_arr, columns=["name"]) +unique_df["index"] = range(0, len(unique_df.index)) + +# create index for source & target and color for the normal accesses +normal_line_color = "rgba(211, 211, 211, 
0.8)" +anomolous_color = "red" +x = ( + pd.merge(high_scores_df, unique_df, how="left", left_on="user", right_on="name") + .drop(["name"], axis=1) + .rename(columns={"index": "userIndex"}) +) +all_access_index_df = ( + pd.merge(x, unique_df, how="left", left_on="res", right_on="name") + .drop(["name"], axis=1) + .rename(columns={"index": "resIndex"}) +) +all_access_index_df["color"] = normal_line_color + +# results_to_display index, color and +y = results_to_display.toPandas().drop( + ["tenant_id", "timestamp", "anomaly_score"], axis=1 +) +y = ( + pd.merge(y, unique_df, how="left", left_on="user", right_on="name") + .drop(["name"], axis=1) + .rename(columns={"index": "userIndex"}) +) +high_scores_index_df = ( + pd.merge(y, unique_df, how="left", left_on="res", right_on="name") + .drop(["name"], axis=1) + .rename(columns={"index": "resIndex"}) +) +high_scores_index_df["count"] = 1 +high_scores_index_df["color"] = anomolous_color + +# subtract 1 for the red entries in all_access df +hsi_df = high_scores_index_df[["user", "res", "count"]].rename( + columns={"count": "hsiCount"} +) +all_access_updated_count_df = pd.merge( + all_access_index_df, + hsi_df, + how="left", + left_on=["user", "res"], + right_on=["user", "res"], +) +all_access_updated_count_df["count"] = np.where( + all_access_updated_count_df["hsiCount"] == 1, + all_access_updated_count_df["count"] - 1, + all_access_updated_count_df["count"], +) +all_access_updated_count_df = all_access_updated_count_df.loc[ + all_access_updated_count_df["count"] > 0 +] +all_access_updated_count_df = all_access_updated_count_df[ + ["user", "res", "count", "userIndex", "resIndex", "color"] +] + +# combine the two tables +frames = [all_access_updated_count_df, high_scores_index_df] +display_df = pd.concat(frames, sort=True) +# display_df.head() +``` + + +```python +data_trace = dict( + type="sankey", + domain=dict(x=[0, 1], y=[0, 1]), + orientation="h", + valueformat=".0f", + node=dict( + pad=10, + thickness=30, + 
line=dict(color="black", width=0), + label=unique_df["name"].dropna(axis=0, how="any"), + ), + link=dict( + source=display_df["userIndex"].dropna(axis=0, how="any"), + target=display_df["resIndex"].dropna(axis=0, how="any"), + value=display_df["count"].dropna(axis=0, how="any"), + color=display_df["color"].dropna(axis=0, how="any"), + ), +) + +layout = dict( + title="All resources accessed by users with highest anomalous scores", + height=772, + font=dict(size=10), +) + +fig = dict(data=[data_trace], layout=layout) + +p = plot(fig, output_type="div") + +if running_on_databricks(): + displayHTML(p) +else: + import IPython + + IPython.display.HTML(p) +``` + + +```python + +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Quickstart - Exploring Art Across Cultures.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Quickstart - Exploring Art Across Cultures.md new file mode 100644 index 0000000000..69f76dbed1 --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Quickstart - Exploring Art Across Cultures.md @@ -0,0 +1,221 @@ +--- +title: Quickstart - Exploring Art Across Cultures +hide_title: true +status: stable +--- +# Exploring Art across Culture and Medium with Fast, Conditional, k-Nearest Neighbors + +This article serves as a guideline for match-finding via k-nearest-neighbors. You set up code that allows queries involving cultures and mediums of art amassed from the Metropolitan Museum of Art in NYC and the Rijksmuseum in Amsterdam. + + + +## Overview of the BallTree +The structure functioning behind the KNN model is a BallTree, which is a recursive binary tree where each node (or "ball") contains a partition of the points of data to be queried. 
Building a BallTree involves assigning data points to the "ball" whose center they're closest to (with respect to a certain specified feature), resulting in a structure that allows binary-tree-like traversal and lends itself to finding k-nearest neighbors at a BallTree leaf. + +## Setup +Import necessary Python libraries and prepare dataset. + + +```python +from pyspark.sql.types import BooleanType +from pyspark.sql.types import * +from pyspark.sql.functions import lit, array, array_contains, udf, col, struct +from synapse.ml.nn import ConditionalKNN, ConditionalKNNModel +from PIL import Image +from io import BytesIO +import requests +import numpy as np +import matplotlib.pyplot as plt +``` + +Our dataset comes from a table containing artwork information from both the Met and Rijks museums. The schema is as follows: + +- **id**: A unique identifier for a piece of art + - Sample Met id: *388395* + - Sample Rijks id: *SK-A-2344* +- **Title**: Art piece title, as written in the museum's database +- **Artist**: Art piece artist, as written in the museum's database +- **Thumbnail_Url**: Location of a JPEG thumbnail of the art piece +- **Image_Url** Location of an image of the art piece hosted on the Met/Rijks website +- **Culture**: Category of culture that the art piece falls under + - Sample culture categories: *latin american*, *egyptian*, etc. +- **Classification**: Category of medium that the art piece falls under + - Sample medium categories: *woodwork*, *paintings*, etc. 
+- **Museum_Page**: Link to the work of art on the Met/Rijks website +- **Norm_Features**: Embedding of the art piece image +- **Museum**: Specifies which museum the piece originated from + + +```python +# loads the dataset and the two trained CKNN models for querying by medium and culture +df = spark.read.parquet( + "wasbs://publicwasb@mmlspark.blob.core.windows.net/met_and_rijks.parquet" +) +display(df.drop("Norm_Features")) +``` + +## Define categories to be queried on +Two KNN models are used: one for culture, and one for medium. + + +```python +# mediums = ['prints', 'drawings', 'ceramics', 'textiles', 'paintings', "musical instruments","glass", 'accessories', 'photographs', "metalwork", +# "sculptures", "weapons", "stone", "precious", "paper", "woodwork", "leatherwork", "uncategorized"] + +mediums = ["paintings", "glass", "ceramics"] + +# cultures = ['african (general)', 'american', 'ancient american', 'ancient asian', 'ancient european', 'ancient middle-eastern', 'asian (general)', +# 'austrian', 'belgian', 'british', 'chinese', 'czech', 'dutch', 'egyptian']#, 'european (general)', 'french', 'german', 'greek', +# 'iranian', 'italian', 'japanese', 'latin american', 'middle eastern', 'roman', 'russian', 'south asian', 'southeast asian', +# 'spanish', 'swiss', 'various'] + +cultures = ["japanese", "american", "african (general)"] + +# Uncomment the above for more robust and large scale searches! 
+ +classes = cultures + mediums + +medium_set = set(mediums) +culture_set = set(cultures) +selected_ids = {"AK-RBK-17525-2", "AK-MAK-1204", "AK-RAK-2015-2-9"} + +small_df = df.where( + udf( + lambda medium, culture, id_val: (medium in medium_set) + or (culture in culture_set) + or (id_val in selected_ids), + BooleanType(), + )("Classification", "Culture", "id") +) + +small_df.count() +``` + +## Define and fit ConditionalKNN models +Create ConditionalKNN models for both the medium and culture columns; each model takes in an output column, features column (feature vector), values column (cell values under the output column), and label column (the quality that the respective KNN is conditioned on). + + +```python +medium_cknn = ( + ConditionalKNN() + .setOutputCol("Matches") + .setFeaturesCol("Norm_Features") + .setValuesCol("Thumbnail_Url") + .setLabelCol("Classification") + .fit(small_df) +) +``` + + +```python +culture_cknn = ( + ConditionalKNN() + .setOutputCol("Matches") + .setFeaturesCol("Norm_Features") + .setValuesCol("Thumbnail_Url") + .setLabelCol("Culture") + .fit(small_df) +) +``` + +## Define matching and visualizing methods + +After the initial dataset and category setup, prepare methods that will query and visualize the conditional KNN's results. + +`add_matches()` creates a DataFrame with a handful of matches per category. + + +```python +def add_matches(classes, cknn, df): + results = df + for label in classes: + results = cknn.transform( + results.withColumn("conditioner", array(lit(label))) + ).withColumnRenamed("Matches", "Matches_{}".format(label)) + return results +``` + +`plot_urls()` calls `plot_img` to visualize top matches for each category into a grid.
+ + +```python +def plot_img(axis, url, title): + try: + response = requests.get(url) + img = Image.open(BytesIO(response.content)).convert("RGB") + axis.imshow(img, aspect="equal") + except: + pass + if title is not None: + axis.set_title(title, fontsize=4) + axis.axis("off") + + +def plot_urls(url_arr, titles, filename): + nx, ny = url_arr.shape + + plt.figure(figsize=(nx * 5, ny * 5), dpi=1600) + fig, axes = plt.subplots(ny, nx) + + # reshape required in the case of 1 image query + if len(axes.shape) == 1: + axes = axes.reshape(1, -1) + + for i in range(nx): + for j in range(ny): + if j == 0: + plot_img(axes[j, i], url_arr[i, j], titles[i]) + else: + plot_img(axes[j, i], url_arr[i, j], None) + + plt.savefig(filename, dpi=1600) # saves the results as a PNG + + display(plt.show()) +``` + +## Putting it all together +Define `test_all()` to take in the data, CKNN models, the art id values to query on, and the file path to save the output visualization to. The medium and culture models were previously trained and loaded. 
+ + +```python +# main method to test a particular dataset with two CKNN models and a set of art IDs, saving the result to filename.png + + +def test_all(data, cknn_medium, cknn_culture, test_ids, root): + is_nice_obj = udf(lambda obj: obj in test_ids, BooleanType()) + test_df = data.where(is_nice_obj("id")) + + results_df_medium = add_matches(mediums, cknn_medium, test_df) + results_df_culture = add_matches(cultures, cknn_culture, results_df_medium) + + results = results_df_culture.collect() + + original_urls = [row["Thumbnail_Url"] for row in results] + + culture_urls = [ + [row["Matches_{}".format(label)][0]["value"] for row in results] + for label in cultures + ] + culture_url_arr = np.array([original_urls] + culture_urls)[:, :] + plot_urls(culture_url_arr, ["Original"] + cultures, root + "matches_by_culture.png") + + medium_urls = [ + [row["Matches_{}".format(label)][0]["value"] for row in results] + for label in mediums + ] + medium_url_arr = np.array([original_urls] + medium_urls)[:, :] + plot_urls(medium_url_arr, ["Original"] + mediums, root + "matches_by_medium.png") + + return results_df_culture +``` + +## Demo +The following cell performs batched queries given desired image IDs and a filename to save the visualization. 
+ + + + +```python +# sample query +result_df = test_all(small_df, medium_cknn, culture_cknn, selected_ids, root=".") +``` diff --git a/website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Smart Adaptive Recommendations.md b/website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Smart Adaptive Recommendations.md new file mode 100644 index 0000000000..ac67359dda --- /dev/null +++ b/website/versioned_docs/version-1.0.7/Explore Algorithms/Other Algorithms/Smart Adaptive Recommendations.md @@ -0,0 +1,183 @@ +--- +title: Smart Adaptive Recommendations (SAR) Algorithm +hide_title: true +sidebar_label: SAR Algorithm +--- + + +# Smart Adaptive Recommendations (SAR) Algorithm + +The following document is a subset of the implemented logic. The original can be found [here](https://github.com/Microsoft/Product-Recommendations/blob/master/doc/sar.md) + +**SAR** is a fast scalable adaptive algorithm for personalized recommendations based on user transactions history and items description. It produces easily explainable / interpretable recommendations. + +The overall architecture of SAR is shown in the following diagram: + +![SAR Diagram](https://i.imgur.com/AMPShWl.jpg) + +## Input + +The input to SAR consists of: + +- transaction (usage) data +- catalog data + +**Transaction data**, also called **usage data**, contains information on interactions between users and items and has the following schema: + +`,,