From 0d76e672cab3c4ea3b6ab3bcf4aee026faf9ef5f Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Wed, 12 Jun 2024 11:52:01 -0700 Subject: [PATCH 01/83] Feat: Add custom embedder --- .../ml/dl/sentence_embedding_transformer.py | 498 +++++++++++ .../Quickstart - Custom Embeddins.ipynb | 781 ++++++++++++++++++ tools/init_scripts/init_retriever.sh | 56 ++ 3 files changed, 1335 insertions(+) create mode 100644 deep-learning/src/main/python/synapse/ml/dl/sentence_embedding_transformer.py create mode 100644 docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb create mode 100644 tools/init_scripts/init_retriever.sh diff --git a/deep-learning/src/main/python/synapse/ml/dl/sentence_embedding_transformer.py b/deep-learning/src/main/python/synapse/ml/dl/sentence_embedding_transformer.py new file mode 100644 index 0000000000..6b9743a695 --- /dev/null +++ b/deep-learning/src/main/python/synapse/ml/dl/sentence_embedding_transformer.py @@ -0,0 +1,498 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +class SuppressLogging: + def __init__(self): + self._original_stderr = None + + def start(self): + """Start suppressing logging by redirecting sys.stderr to /dev/null.""" + if self._original_stderr is None: + self._original_stderr = sys.stderr + sys.stderr = open('/dev/null', 'w') + + def stop(self): + """Stop suppressing logging and restore sys.stderr.""" + if self._original_stderr is not None: + sys.stderr.close() + sys.stderr = self._original_stderr + self._original_stderr = None + + +# Import necessary libraries +import numpy as np +import torch +import pyspark.sql.functions as F +import tensorrt as trt +import logging +import warnings +import sys +import datetime +import pytz +from tqdm import tqdm, trange +from numpy import ndarray +from torch import Tensor +from typing import List, Union + +import model_navigator as nav +from sentence_transformers import SentenceTransformer +from sentence_transformers.util import batch_to_device +from pyspark.ml.functions import predict_batch_udf +from faker import Faker + +from pyspark.ml import Transformer +from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params +from pyspark.sql.functions import col, struct, rand +from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + ArrayType, + FloatType, +) + +class EmbeddingTransformer(Transformer, HasInputCol, HasOutputCol): + """ + Custom transformer that extends PySpark's Transformer class to + perform sentence embedding using a model with optional TensorRT acceleration. 
+ """ + + # Define additional parameters + useTRT = Param(Params._dummy(), "useTRT", "True if use TRT acceleration") + driverOnly = Param( + Params._dummy(), + "driverOnly", + "True if run encode only on Driver for small queries", + ) + batchSize = Param(Params._dummy(), "batchSize", "Batch size for embeddings", int) + modelName = Param(Params._dummy(), "modelName", "Model Name parameter") + moduleName = Param(Params._dummy(), "moduleName", "Module Name parameter") + model = Param(Params._dummy(), "model", "Model used for embedding") + path = Param(Params._dummy(), "path", "Path to .csv file with data") + + class SentenceTransformerNavigator(SentenceTransformer): + """ + Inner class extending SentenceTransformer to override the encode method + with additional functionality and optimizations (mainly to eliminate RecursiveErrors). + """ + + def encode( + self, + sentences: Union[str, List[str]], + batch_size: int = 64, + sentence_length: int = 512, + show_progress_bar: bool = False, + output_value: str = "sentence_embedding", + convert_to_numpy: bool = True, + convert_to_tensor: bool = False, + device: str = None, + normalize_embeddings: bool = False, + ) -> Union[List[Tensor], ndarray, Tensor]: + """ + Encode sentences into embeddings with optional configurations. 
+ """ + self.eval() + show_progress_bar = ( + show_progress_bar if show_progress_bar is not None else True + ) + convert_to_numpy = convert_to_numpy and not convert_to_tensor + output_value = output_value or "sentence_embedding" + + # Handle input as a list of sentences + input_was_string = isinstance(sentences, str) or not hasattr( + sentences, "__len__" + ) + if input_was_string: + sentences = [sentences] + + # Determine the device to use for computation + device = device or self._target_device + self.to(device) + + # Initialize list for embeddings + all_embeddings = [] + length_sorted_idx = np.argsort( + [-self._text_length(sen) for sen in sentences] + ) + sentences_sorted = [sentences[idx] for idx in length_sorted_idx] + + # Process sentences in batches + for start_index in trange( + 0, + len(sentences), + batch_size, + desc="Batches", + disable=not show_progress_bar, + ): + sentences_batch = sentences_sorted[ + start_index : start_index + batch_size + ] + features = self.tokenize(sentences_batch) + features = batch_to_device(features, device) + + # Perform forward pass and gather embeddings + with torch.no_grad(): + out_features = self(features) + + if output_value == "token_embeddings": + embeddings = [] + for token_emb, attention in zip( + out_features[output_value], out_features["attention_mask"] + ): + last_mask_id = len(attention) - 1 + while ( + last_mask_id > 0 and attention[last_mask_id].item() == 0 + ): + last_mask_id -= 1 + embeddings.append(token_emb[0 : last_mask_id + 1]) + elif output_value is None: + embeddings = [] + for sent_idx in range(len(out_features["sentence_embedding"])): + row = { + name: out_features[name][sent_idx] + for name in out_features + } + embeddings.append(row) + else: + embeddings = out_features[output_value] + embeddings = embeddings.detach() + if normalize_embeddings: + embeddings = torch.nn.functional.normalize( + embeddings, p=2, dim=1 + ) + if convert_to_numpy: + embeddings = embeddings.cpu() + + 
all_embeddings.extend(embeddings) + + # Restore original order of sentences + all_embeddings = [ + all_embeddings[idx] for idx in np.argsort(length_sorted_idx) + ] + if convert_to_tensor: + all_embeddings = torch.stack(all_embeddings) + elif convert_to_numpy: + all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings]) + + if input_was_string: + all_embeddings = all_embeddings[0] + + return all_embeddings + + def __init__( + self, + inputCol=None, + outputCol=None, + useTRTFlag=None, + driverOnly=False, + batchSize=16, + model=None, + modelName="intfloat/e5-large-v2", + moduleName="e5-large-v2", + ): + """ + Initialize the EmbeddingTransformer with input/output columns and optional TRT flag. + """ + super(EmbeddingTransformer, self).__init__() + self._setDefault( + inputCol="combined", + outputCol="embeddings", + useTRT=True, + driverOnly=False, + modelName=modelName, + moduleName=moduleName, + model=None, + batchSize=16, + ) + self._set( + inputCol=inputCol, + outputCol=outputCol, + useTRT=useTRTFlag, + driverOnly=driverOnly, + batchSize=batchSize, + model=model, + modelName=modelName, + moduleName=moduleName, + ) + + # Setter method for batchSize + def setBatchSize(self, value): + self._set(batchSize=value) + return self + + # Getter method for batchSize + def getBatchSize(self): + return self.getOrDefault(self.batchSize) + + # Setter method for useTRT + def setUseTRT(self, value): + self._set(useTRT=value) + return self + + # Getter method for useTRT + def getUseTRT(self): + return self.getOrDefault(self.useTRT) + + # Setter method for driverOnly + def setDriverOnly(self, value): + self._set(driverOnly=value) + return self + + # Getter method for driverOnly + def getDriverOnly(self): + return self.getOrDefault(self.driverOnly) + + # Setter method for model + def setModel(self, value): + self._paramMap[self.model] = value + return self + + # Getter method for model + def getModel(self): + return self.getOrDefault(self.model) + + # Setter method for 
modelName + def setModelName(self, value): + self._set(modelName=value) + return self + + # Getter method for modelName + def getModelName(self): + return self.getOrDefault(self.modelName) + + # Setter method for moduleName + def setModuleName(self, value): + self._set(moduleName=value) + return self + + # Getter method for moduleName + def getModuleName(self): + return self.getOrDefault(self.moduleName) + + def _optimize(self, model): + """ + Optimize the model using Model Navigator with TensorRT configuration. + """ + conf = nav.OptimizeConfig( + target_formats=(nav.Format.TENSORRT,), + runners=("TensorRT",), + optimization_profile=nav.OptimizationProfile(max_batch_size=64), + custom_configs=[ + nav.TorchConfig(autocast=True), + nav.TorchScriptConfig(autocast=True), + nav.TensorRTConfig( + precision=(nav.TensorRTPrecision.FP16,), + onnx_parser_flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM.value], + ), + ], + ) + + def gen_size_chunk(): + """ + Generate chunks of different batch sizes and sentence lengths. + """ + for batch_size in [64, 32, 16, 8, 4, 2, 1]: + for sentence_length in [20, 300, 512]: + yield (batch_size, sentence_length) + + def get_dataloader(repeat_times: int = 2): + """ + Create a data loader with synthetic data using Faker. + """ + faker = Faker() + i = 0 + for batch_size, chunk_size in gen_size_chunk(): + for _ in range(repeat_times): + yield ( + i, + ( + [ + " ".join(faker.words(chunk_size)) + for _ in range(batch_size) + ], + {"show_progress_bar": False}, + ), + ) + i += 1 + + total_batches = len(list(gen_size_chunk())) + func = lambda x, **kwargs: self.SentenceTransformerNavigator.encode( + model, x, **kwargs + ) + nav.optimize( + func, dataloader=tqdm(get_dataloader(), total=total_batches), config=conf + ) + + def run_on_driver(self, queries, spark): + """ + Method to run on the driver to generate embeddings and create a DataFrame. 
+ """ + if spark is None: + raise ValueError("Spark context should be set") + + # Load the model on the driver + model = SentenceTransformer(self.getModelName()).eval() + + # Generate embeddings + with torch.no_grad(): + embeddings = [embedding.tolist() for embedding in model.encode(queries)] + + # Prepare data including IDs + data_with_ids = [ + (i + 1, query, embeddings[i]) for i, query in enumerate(queries) + ] + + # Define the schema for the DataFrame + schema = StructType( + [ + StructField("id", IntegerType(), nullable=False), + StructField("query", StringType(), nullable=False), + StructField( + "embeddings", + ArrayType(FloatType(), containsNull=False), + nullable=False, + ), + ] + ) + + # Create a DataFrame using the data with IDs and the schema + query_embeddings = spark.createDataFrame( + data=data_with_ids, schema=schema + ).cache() + + return query_embeddings + + def _predict_batch_fn(self): + """ + Create and return a function for batch prediction. + """ + if "model" not in globals(): + global model + if self.useTRT: + model = self.SentenceTransformerNavigator(self.getModelName()).eval() + model = nav.Module(model, name=self.getModuleName()) + try: + nav.load_optimized() + except Exception: + self._optimize(model) + nav.load_optimized() + print("create trt model") + else: + model = SentenceTransformer(self.getModelName()).eval() + print("create ST model") + + def predict(inputs): + """ + Predict method to encode inputs using the model. + """ + with torch.no_grad(): + output = model.encode( + inputs.tolist(), convert_to_tensor=False, show_progress_bar=False + ) + + return output + + return predict + + # Method to apply the transformation to the dataset + def _transform(self, dataset, spark): + """ + Apply the transformation to the input dataset. 
+ """ + driverOnly = self.getDriverOnly() + + if driverOnly is None or driverOnly is False: + input_col = self.getInputCol() + output_col = self.getOutputCol() + + encode = predict_batch_udf( + self._predict_batch_fn, + return_type=ArrayType(FloatType()), + batch_size=self.getBatchSize(), + ) + return dataset.withColumn(output_col, encode(input_col)) + else: + if spark is None: + raise ValueError("Spark context should be set") + return self.run_on_driver(dataset, spark=spark) + + def transform(self, dataset, spark=None): + """ + Public method to transform the dataset. + """ + return self._transform(dataset, spark) + + def copy(self, extra=None): + """ + Create a copy of the transformer. + """ + return self._defaultCopy(extra) + + def load_data_food_reviews(self, spark, path=None, limit=1000): + """ + Load data from public dataset and generate 1M rows dataset from 1K data. + """ + if path is None: + path = "wasbs://publicwasb@mmlspark.blob.core.windows.net/fine_food_reviews_1k.csv" + file_path = path + + # Check if the row count is less than 10 + if limit <= 0 or limit >= 1000000: + raise ValueError(f"Limit is {limit}, which should be less than 1M.") + + df = spark.read.options(inferSchema="True", delimiter=",", header=True).csv( + file_path + ) + df = df.withColumn( + "combined", + F.format_string( + "Title: %s; Content: %s", F.trim(df.Summary), F.trim(df.Text) + ), + ) + + rowCnt = df.count() + + # Check the conditions + if limit > rowCnt and rowCnt > 1000: + + # Cross-join the DataFrame with itself to create n x n pairs for string concatenation (synthetic data) + cross_joined_df = df.crossJoin( + df.withColumnRenamed("combined", "combined_2") + ) + + # Create a new column 'result_vector' by concatenating the two source vectors + tmp_df = cross_joined_df.withColumn( + "result_vector", + F.concat(F.col("combined"), F.lit(". 
\n"), F.col("combined_2")), + ) + + # Select only the necessary columns and show the result + tmp_df = tmp_df.select("result_vector") + df = tmp_df.withColumnRenamed("result_vector", "combined").withColumn( + "id", monotonically_increasing_id() + ) + + # Shuffle the DataFrame with a fixed seed + seed = 42 + shuffled_df = df.orderBy(rand(seed)) + + return shuffled_df.limit(limit) + + +# Example usage: +# data = input data frame +# transformer = EmbeddingTransformer(inputCol="combined", outputCol="embeddings") +# result = transformer.transform(data) +# result.show() diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb new file mode 100644 index 0000000000..ebd206f05d --- /dev/null +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb @@ -0,0 +1,781 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6166efcb-b7f8-424b-8015-cb646a764271", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Embedding Text with local (per node) NVIDIA TensorRT accelerator and GPU based Aproximate Nearest Neighbor (ANN)\n", + "\n", + "The demo extending existing [Azure OpenAI based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding%20and%20GPU%20based%20KNN.ipynb) when encoding is processed by OpenAI requests and KNN was using GPU based brute force search. This tutorial shows how to perform fast local embeddings using [multilingual E5 text embeddings](https://arxiv.org/abs/2402.05672) and fast aproximate Nearest Neighbor search using IVFFlat alcorithm. All tutorial stages accelerated by NVIDIA GPU using [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) and [Spark Rapids ML](https://github.com/NVIDIA/spark-rapids-ml). 
The tutorial folder contains two benchmark notebooks to demonstrate advantages of the presented GPU based approach compare to [previos CPU based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding.ipynb)\n", + "\n", + "The key prerequisites for this quickstart include a working Azure OpenAI resource, and an Apache Spark cluster with SynapseML installed. We suggest creating a Synapse workspace, but currently the notebook was running on Databricks GPU based cluster using Standard_NC24ads_A100_v4 with 6 workers. Databricks Runtime was 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) with related [init_script](https://github.com/microsoft/SynapseML/tree/master/tools/init_scripts) to install all required packages.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0444a03d-a701-4f59-b1a1-c4addb797d07", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Step 1: Prepare Environment\n", + "\n", + "It will imports required libraries and get initial settings" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d188d8ee-8913-4170-8d35-8490f833ae95", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Start demo run with 1000 input rows\nCurrent time in PST: 2024-06-11 14:51:24 PDT-0700\n" + ] + } + ], + "source": [ + "\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning, module='tritonclient.grpc')\n", + "import logging\n", + "logging.getLogger('py4j').setLevel(logging.ERROR)\n", + "import mlflow\n", + "import datetime\n", + "import 
pytz\n", + "from spark_rapids_ml.knn import ApproximateNearestNeighbors, ApproximateNearestNeighborsModel\n", + "from sentence_embedding_transformer import EmbeddingTransformer\n", + "\n", + "logging.getLogger('sentence_transformers.SentenceTransformer').setLevel(logging.ERROR)\n", + "mlflow.autolog(disable=True)\n", + "\n", + "# Define the PST timezone\n", + "pst_timezone = pytz.timezone('US/Pacific')\n", + "\n", + "# Get the current time in UTC and convert it to PST\n", + "current_start_time_utc = datetime.datetime.now(pytz.utc)\n", + "current_time_pst = current_start_time_utc.astimezone(pst_timezone)\n", + "\n", + "print(\"Current time in PST:\", current_time_pst.strftime('%Y-%m-%d %H:%M:%S %Z%z'))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "22603456-4f44-4b3a-9751-9ca5231b799b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Step 2: Load Data\n", + "\n", + "In this demo we will explore a dataset of fine food reviews" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a0065427-a0d5-4867-8209-61969dc48082", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "dataTransformer = EmbeddingTransformer(inputCol=\"combined\", outputCol=\"embeddings\", useTRTFlag=True, batchSize=16)\n", + "\n", + "# Load food revies with limiting number of rows until 1000000\n", + "df = dataTransformer.load_data_food_reviews(spark=spark, limit=1000).repartition(10).cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0c69ee56-172f-413b-a335-d15482fda55e", + 
"showTitle": false, + "title": "" + } + }, + "source": [ + "## Step 3: Generate Embeddings\n", + "\n", + "We will first generate embeddings using NVIDIA TensorRT optimized SentenceTransformer" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8d7bd9db-79a1-4d46-a849-ac49c3de7b49", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "all_embeddings = dataTransformer.transform(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6885033f-6eea-4338-a632-2837582d91a1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Step 4: Build the query against embeddings\n", + "\n", + "Get query embeddings running standard SentenceTransformer just on the driver. Convert embedding results to a data frame" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "23b83621-3f42-42ff-847e-97a4af2d3276", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "936b72a8576b4d95a93f9215dc0a998f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading .gitattributes: 0%| | 0.00/1.48k [00:00\n", + " .table-result-container {\n", + " max-height: 300px;\n", + " overflow: auto;\n", + " }\n", + " table, th, td {\n", + " border: 1px solid black;\n", + " border-collapse: collapse;\n", + " }\n", + " th, td {\n", + " padding: 5px;\n", + " }\n", + " th {\n", + " text-align: left;\n", + " }\n", + "
query_idindicesdistances
1List(737, 595, 308, 106, 591)List(0.6997133, 0.70115274, 0.7020032, 0.7047766, 0.7060983)
2List(58, 860, 194, 827, 614)List(0.6791535, 0.6887464, 0.702577, 0.70618147, 0.7084421)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 1, + [ + 737, + 595, + 308, + 106, + 591 + ], + [ + 0.6997133, + 0.70115274, + 0.7020032, + 0.7047766, + 0.7060983 + ] + ], + [ + 2, + [ + 58, + 860, + 194, + 827, + 614 + ], + [ + 0.6791535, + 0.6887464, + 0.702577, + 0.70618147, + 0.7084421 + ] + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "query_id", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "indices", + "type": "{\"type\":\"array\",\"elementType\":\"long\",\"containsNull\":true}" + }, + { + "metadata": "{}", + "name": "distances", + "type": "{\"type\":\"array\",\"elementType\":\"float\",\"containsNull\":true}" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Demo finished\nCurrent time in PST: 2024-06-11 14:56:00 PDT-0700\nDifference: h: 0, min: 4, sec: 36\n" + ] + } + ], + "source": [ + "display(knn_df)\n", + "\n", + "print(f\"Demo finished\")\n", + "\n", + "# Get the current time in UTC and convert it to PST\n", + "current_end_time_utc = datetime.datetime.now(pytz.utc)\n", + "current_time_pst = current_end_time_utc.astimezone(pst_timezone)\n", + "\n", + "print(\"Current time in PST:\", current_time_pst.strftime('%Y-%m-%d %H:%M:%S %Z%z'))\n", + "\n", + "dif = current_end_time_utc - current_start_time_utc\n", + "\n", + "# Extract hours, minutes, and seconds from the difference\n", + 
"total_seconds = int(dif.total_seconds())\n", + "hours, remainder = divmod(total_seconds, 3600)\n", + "minutes, seconds = divmod(remainder, 60)\n", + "\n", + "# Print the difference in the desired format\n", + "print(f\"Difference: h: {hours}, min: {minutes}, sec: {seconds}\")" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "environmentMetadata": null, + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": -1, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2, + "widgetLayout": [] + }, + "notebookName": "Fast E5-TRT Embeddings with Rapids ANN", + "widgets": {} + }, + "kernel_info": { + "name": "synapse_pyspark" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "save_output": true, + "synapse_widget": { + "state": { + "4bd0e60b-98ae-4bfe-98ee-6f0399ceb456": { + "persist_state": { + "view": { + "chartOptions": { + "aggregationType": "count", + "categoryFieldKeys": [ + "0" + ], + "chartType": "bar", + "isStacked": false, + "seriesFieldKeys": [ + "0" + ] + }, + "tableOptions": {}, + "type": "details" + } + }, + "sync_state": { + "isSummary": false, + "language": "scala", + "table": { + "rows": [ + { + "0": "Once upon a time", + "1": [ + " there was a girl who had a dream of becoming a writer.\n\nShe started writing short stories" + ] + }, + { + "0": "Hello my name is", + "1": [ + "***** and I have a question about my cat\n\nHello, thank you for bringing your question to" + ] + }, + { + "0": "The best code is code thats", + "1": [ + " not there\n\nCommenting your code is important. 
Not only does it help you remember what you" + ] + } + ], + "schema": [ + { + "key": "0", + "name": "prompt", + "type": "string" + }, + { + "key": "1", + "name": "text", + "type": "ArrayType(StringType,true)" + } + ], + "truncated": false + } + }, + "type": "Synapse.DataFrame" + } + }, + "version": "0.1" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tools/init_scripts/init_retriever.sh b/tools/init_scripts/init_retriever.sh new file mode 100644 index 0000000000..1204895eb1 --- /dev/null +++ b/tools/init_scripts/init_retriever.sh @@ -0,0 +1,56 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash + +echo "Starting init script execution..." 
+ +RAPIDS_VERSION=23.10.0 +SPARK_RAPIDS_VERSION=23.10.0 +SPARK_RAPIDSML_VERSION=24.04 +NAVIGATOR_CACHE=/dbfs/FileStore/model_navigator.zip +TORCH_CACHE=/dbfs/FileStore/torch.zip + +# To eliminate optimization stage +# Unzip optimized TRT Navigator model and Transformer +# unzip ${NAVIGATOR_CACHE} -d /root/.cache +# unzip ${TORCH_CACHE} -d /root/.cache + +# install cudatoolkit 11.8 via runfile approach +wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run +sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit + +# reset symlink and update library loading paths +rm /usr/local/cuda +ln -s /usr/local/cuda-11.8 /usr/local/cuda + +# upgrade pip +/databricks/python/bin/pip install --upgrade pip + +# install cudf, cuml and their rapids dependencies +# using ~= pulls in latest micro version patches +/databricks/python/bin/pip install --extra-index-url https://pypi.nvidia.com cudf-cu11~=${RAPIDS_VERSION} cuml-cu11~=${RAPIDS_VERSION} pylibraft-cu11~=${RAPIDS_VERSION} rmm-cu11~=${RAPIDS_VERSION} + +# install model navigator +/databricks/python/bin/pip install --extra-index-url https://pypi.nvidia.com onnxruntime-gpu==1.16.3 "tensorrt==9.3.0.post12.dev1" "triton-model-navigator<1" "sentence_transformers~=2.2.2" "faker" "urllib3<2" + +# install spark-rapids-ml +/databricks/python/bin/pip install spark-rapids-ml~=${SPARK_RAPIDSML_VERSION} + +# upgrade grpc +# /databricks/python/bin/pip uninstall -y grpcio grpcio-tools +# Install the specific version of grpcio +/databricks/python/bin/pip install grpcio==1.64.1 + +echo "Init script execution completed." 
\ No newline at end of file From 9afa431b105a260f9c5663cb749faa60b20226ea Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Thu, 13 Jun 2024 23:02:49 +0000 Subject: [PATCH 02/83] Corrected Names and file location --- ...rmer.py => HuggingFaceSentenceEmbedder.py} | 24 +- .../src/main/python/synapse/ml/__init__.py | 0 .../Quickstart - Custom Embeddins.ipynb | 340 +----------------- 3 files changed, 16 insertions(+), 348 deletions(-) rename deep-learning/src/main/python/synapse/ml/{dl/sentence_embedding_transformer.py => HuggingFaceSentenceEmbedder.py} (95%) create mode 100644 deep-learning/src/main/python/synapse/ml/__init__.py diff --git a/deep-learning/src/main/python/synapse/ml/dl/sentence_embedding_transformer.py b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py similarity index 95% rename from deep-learning/src/main/python/synapse/ml/dl/sentence_embedding_transformer.py rename to deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py index 6b9743a695..d834ed3e6e 100644 --- a/deep-learning/src/main/python/synapse/ml/dl/sentence_embedding_transformer.py +++ b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py @@ -12,24 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-class SuppressLogging: - def __init__(self): - self._original_stderr = None - - def start(self): - """Start suppressing logging by redirecting sys.stderr to /dev/null.""" - if self._original_stderr is None: - self._original_stderr = sys.stderr - sys.stderr = open('/dev/null', 'w') - - def stop(self): - """Stop suppressing logging and restore sys.stderr.""" - if self._original_stderr is not None: - sys.stderr.close() - sys.stderr = self._original_stderr - self._original_stderr = None - - # Import necessary libraries import numpy as np import torch @@ -63,7 +45,7 @@ def stop(self): FloatType, ) -class EmbeddingTransformer(Transformer, HasInputCol, HasOutputCol): +class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): """ Custom transformer that extends PySpark's Transformer class to perform sentence embedding using a model with optional TensorRT acceleration. @@ -203,9 +185,9 @@ def __init__( moduleName="e5-large-v2", ): """ - Initialize the EmbeddingTransformer with input/output columns and optional TRT flag. + Initialize the HuggingFaceSentenceEmbedder with input/output columns and optional TRT flag. 
""" - super(EmbeddingTransformer, self).__init__() + super(HuggingFaceSentenceEmbedder, self).__init__() self._setDefault( inputCol="combined", outputCol="embeddings", diff --git a/deep-learning/src/main/python/synapse/ml/__init__.py b/deep-learning/src/main/python/synapse/ml/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb index ebd206f05d..da7f816906 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb @@ -57,17 +57,14 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "Start demo run with 1000 input rows\nCurrent time in PST: 2024-06-11 14:51:24 PDT-0700\n" - ] - } - ], + "outputs": [], "source": [ + "import sys\n", + "# Assuming 'deep-learning' is at the root of your Databricks Repo\n", + "repo_path = '/Workspace/Repos/aspiridonov@nvidia.com/SynapseML-db1/deep-learning/src/main/python'\n", + "if repo_path not in sys.path:\n", + " sys.path.append(repo_path)\n", + "from synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\", category=UserWarning, module='tritonclient.grpc')\n", @@ -77,7 +74,6 @@ "import datetime\n", "import pytz\n", "from spark_rapids_ml.knn import ApproximateNearestNeighbors, ApproximateNearestNeighborsModel\n", - "from sentence_embedding_transformer import EmbeddingTransformer\n", "\n", "logging.getLogger('sentence_transformers.SentenceTransformer').setLevel(logging.ERROR)\n", "mlflow.autolog(disable=True)\n", @@ -129,10 +125,10 @@ }, "outputs": [], "source": [ - "dataTransformer = EmbeddingTransformer(inputCol=\"combined\", outputCol=\"embeddings\", useTRTFlag=True, batchSize=16)\n", + "dataTransformer = 
HuggingFaceSentenceEmbedder(inputCol=\"combined\", outputCol=\"embeddings\", useTRTFlag=True, batchSize=16)\n", "\n", "# Load food revies with limiting number of rows until 1000000\n", - "df = dataTransformer.load_data_food_reviews(spark=spark, limit=1000).repartition(10).cache()" + "df = dataTransformer.load_data_food_reviews(spark=spark, limit=100).repartition(10).cache()" ] }, { @@ -210,203 +206,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "936b72a8576b4d95a93f9215dc0a998f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading .gitattributes: 0%| | 0.00/1.48k [00:00\n", - " .table-result-container {\n", - " max-height: 300px;\n", - " overflow: auto;\n", - " }\n", - " table, th, td {\n", - " border: 1px solid black;\n", - " border-collapse: collapse;\n", - " }\n", - " th, td {\n", - " padding: 5px;\n", - " }\n", - " th {\n", - " text-align: left;\n", - " }\n", - "
query_idindicesdistances
1List(737, 595, 308, 106, 591)List(0.6997133, 0.70115274, 0.7020032, 0.7047766, 0.7060983)
2List(58, 860, 194, 827, 614)List(0.6791535, 0.6887464, 0.702577, 0.70618147, 0.7084421)
" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "addedWidgets": {}, - "aggData": [], - "aggError": "", - "aggOverflow": false, - "aggSchema": [], - "aggSeriesLimitReached": false, - "aggType": "", - "arguments": {}, - "columnCustomDisplayInfos": {}, - "data": [ - [ - 1, - [ - 737, - 595, - 308, - 106, - 591 - ], - [ - 0.6997133, - 0.70115274, - 0.7020032, - 0.7047766, - 0.7060983 - ] - ], - [ - 2, - [ - 58, - 860, - 194, - 827, - 614 - ], - [ - 0.6791535, - 0.6887464, - 0.702577, - 0.70618147, - 0.7084421 - ] - ] - ], - "datasetInfos": [], - "dbfsResultPath": null, - "isJsonSchema": true, - "metadata": {}, - "overflow": false, - "plotOptions": { - "customPlotOptions": {}, - "displayType": "table", - "pivotAggregation": null, - "pivotColumns": null, - "xColumns": null, - "yColumns": null - }, - "removedWidgets": [], - "schema": [ - { - "metadata": "{}", - "name": "query_id", - "type": "\"long\"" - }, - { - "metadata": "{}", - "name": "indices", - "type": "{\"type\":\"array\",\"elementType\":\"long\",\"containsNull\":true}" - }, - { - "metadata": "{}", - "name": "distances", - "type": "{\"type\":\"array\",\"elementType\":\"float\",\"containsNull\":true}" - } - ], - "type": "table" - } - }, - "output_type": "display_data" - }, - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "Demo finished\nCurrent time in PST: 2024-06-11 14:56:00 PDT-0700\nDifference: h: 0, min: 4, sec: 36\n" - ] - } - ], + "outputs": [], "source": [ "display(knn_df)\n", "\n", @@ -687,7 +373,7 @@ "pythonIndentUnit": 2, "widgetLayout": [] }, - "notebookName": "Fast E5-TRT Embeddings with Rapids ANN", + "notebookName": "Quickstart - Custom Embeddins", "widgets": {} }, "kernel_info": { From ee8f6f921db29662da9d9f8c01ee581e7ed2328b Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 14 Jun 2024 00:06:47 +0000 Subject: [PATCH 03/83] Code style corrections --- .../synapse/ml/HuggingFaceSentenceEmbedder.py | 261 ++++++++++-------- 
.../Quickstart - Custom Embeddins.ipynb | 65 +++++ 2 files changed, 210 insertions(+), 116 deletions(-) diff --git a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py index d834ed3e6e..b73f03eb71 100644 --- a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py @@ -52,19 +52,22 @@ class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): """ # Define additional parameters - useTRT = Param(Params._dummy(), "useTRT", "True if use TRT acceleration") - driverOnly = Param( - Params._dummy(), - "driverOnly", - "True if run encode only on Driver for small queries", - ) + # useTRT = Param(Params._dummy(), "useTRT", "True if use TRT acceleration") + + runtime = Param(Params._dummy(), "runtime", "Specifies the runtime environment: cpu, cuda, or tensorrt") + + # driverOnly = Param( + # Params._dummy(), + # "driverOnly", + # "True if run encode only on Driver for small queries", + # ) batchSize = Param(Params._dummy(), "batchSize", "Batch size for embeddings", int) modelName = Param(Params._dummy(), "modelName", "Model Name parameter") moduleName = Param(Params._dummy(), "moduleName", "Module Name parameter") model = Param(Params._dummy(), "model", "Model used for embedding") path = Param(Params._dummy(), "path", "Path to .csv file with data") - class SentenceTransformerNavigator(SentenceTransformer): + class _SentenceTransformerNavigator(SentenceTransformer): """ Inner class extending SentenceTransformer to override the encode method with additional functionality and optimizations (mainly to eliminate RecursiveErrors). 
@@ -177,36 +180,36 @@ def __init__( self, inputCol=None, outputCol=None, - useTRTFlag=None, - driverOnly=False, + runtime=None, + # driverOnly=False, batchSize=16, model=None, modelName="intfloat/e5-large-v2", - moduleName="e5-large-v2", + # moduleName= modelName.split('/')[1] ): """ Initialize the HuggingFaceSentenceEmbedder with input/output columns and optional TRT flag. """ super(HuggingFaceSentenceEmbedder, self).__init__() self._setDefault( - inputCol="combined", - outputCol="embeddings", - useTRT=True, - driverOnly=False, + # inputCol="combined", + # outputCol="embeddings", + runtime='cpu', + # driverOnly=False, modelName=modelName, - moduleName=moduleName, + # moduleName=moduleName, model=None, batchSize=16, ) self._set( inputCol=inputCol, outputCol=outputCol, - useTRT=useTRTFlag, - driverOnly=driverOnly, + runtime=runtime, + # driverOnly=driverOnly, batchSize=batchSize, model=model, modelName=modelName, - moduleName=moduleName, + # moduleName=moduleName, ) # Setter method for batchSize @@ -218,23 +221,37 @@ def setBatchSize(self, value): def getBatchSize(self): return self.getOrDefault(self.batchSize) - # Setter method for useTRT - def setUseTRT(self, value): - self._set(useTRT=value) - return self - - # Getter method for useTRT - def getUseTRT(self): - return self.getOrDefault(self.useTRT) - - # Setter method for driverOnly - def setDriverOnly(self, value): - self._set(driverOnly=value) - return self - - # Getter method for driverOnly - def getDriverOnly(self): - return self.getOrDefault(self.driverOnly) + # Sets the runtime environment for the model. + # Supported values: 'cpu', 'cuda', 'tensorrt' + def setRuntime(self, value): + """ + Sets the runtime environment for the model. + Supported values: 'cpu', 'cuda', 'tensorrt' + """ + if value not in ['cpu', 'cuda', 'tensorrt']: + raise ValueError("Invalid runtime specified. 
Choose from 'cpu', 'cuda', 'tensorrt'") + self.setOrDefault(self.runtime, value) + + def getRuntime(self): + return self.getOrDefault(self.runtime) + + # # Setter method for useTRT + # def setUseTRT(self, value): + # self._set(useTRT=value) + # return self + + # # Getter method for useTRT + # def getUseTRT(self): + # return self.getOrDefault(self.useTRT) + + # # Setter method for driverOnly + # def setDriverOnly(self, value): + # self._set(driverOnly=value) + # return self + + # # Getter method for driverOnly + # def getDriverOnly(self): + # return self.getOrDefault(self.driverOnly) # Setter method for model def setModel(self, value): @@ -254,14 +271,14 @@ def setModelName(self, value): def getModelName(self): return self.getOrDefault(self.modelName) - # Setter method for moduleName - def setModuleName(self, value): - self._set(moduleName=value) - return self + # # Setter method for moduleName + # def setModuleName(self, value): + # self._set(moduleName=value) + # return self - # Getter method for moduleName - def getModuleName(self): - return self.getOrDefault(self.moduleName) + # # Getter method for moduleName + # def getModuleName(self): + # return self.getOrDefault(self.moduleName) def _optimize(self, model): """ @@ -281,7 +298,7 @@ def _optimize(self, model): ], ) - def gen_size_chunk(): + def _gen_size_chunk(): """ Generate chunks of different batch sizes and sentence lengths. """ @@ -289,13 +306,13 @@ def gen_size_chunk(): for sentence_length in [20, 300, 512]: yield (batch_size, sentence_length) - def get_dataloader(repeat_times: int = 2): + def _get_dataloader(repeat_times: int = 2): """ Create a data loader with synthetic data using Faker. 
""" faker = Faker() i = 0 - for batch_size, chunk_size in gen_size_chunk(): + for batch_size, chunk_size in _gen_size_chunk(): for _ in range(repeat_times): yield ( i, @@ -310,14 +327,14 @@ def get_dataloader(repeat_times: int = 2): i += 1 total_batches = len(list(gen_size_chunk())) - func = lambda x, **kwargs: self.SentenceTransformerNavigator.encode( + func = lambda x, **kwargs: self._SentenceTransformerNavigator.encode( model, x, **kwargs ) nav.optimize( func, dataloader=tqdm(get_dataloader(), total=total_batches), config=conf ) - def run_on_driver(self, queries, spark): + def _run_on_driver(self, queries, spark): """ Method to run on the driver to generate embeddings and create a DataFrame. """ @@ -360,11 +377,14 @@ def _predict_batch_fn(self): """ Create and return a function for batch prediction. """ + runtime = self.getRuntime() if "model" not in globals(): global model - if self.useTRT: - model = self.SentenceTransformerNavigator(self.getModelName()).eval() - model = nav.Module(model, name=self.getModuleName()) + modelName = self.getModelName() + if runtime == 'tensorrt': + moduleName = modelName.split('/')[1] + model = self._SentenceTransformerNavigator(modelName).eval() + model = nav.Module(model, name=moduleName) try: nav.load_optimized() except Exception: @@ -372,7 +392,7 @@ def _predict_batch_fn(self): nav.load_optimized() print("create trt model") else: - model = SentenceTransformer(self.getModelName()).eval() + model = SentenceTransformer(modelName).eval() print("create ST model") def predict(inputs): @@ -393,22 +413,31 @@ def _transform(self, dataset, spark): """ Apply the transformation to the input dataset. 
""" - driverOnly = self.getDriverOnly() - - if driverOnly is None or driverOnly is False: - input_col = self.getInputCol() - output_col = self.getOutputCol() - - encode = predict_batch_udf( - self._predict_batch_fn, - return_type=ArrayType(FloatType()), - batch_size=self.getBatchSize(), - ) - return dataset.withColumn(output_col, encode(input_col)) - else: - if spark is None: - raise ValueError("Spark context should be set") - return self.run_on_driver(dataset, spark=spark) + # driverOnly = self.getDriverOnly() + + # if driverOnly is None or driverOnly is False: + # input_col = self.getInputCol() + # output_col = self.getOutputCol() + + # encode = predict_batch_udf( + # self._predict_batch_fn, + # return_type=ArrayType(FloatType()), + # batch_size=self.getBatchSize(), + # ) + # return dataset.withColumn(output_col, encode(input_col)) + # else: + # if spark is None: + # raise ValueError("Spark context should be set") + # return self.run_on_driver(dataset, spark=spark) + input_col = self.getInputCol() + output_col = self.getOutputCol() + + encode = predict_batch_udf( + self._predict_batch_fn, + return_type=ArrayType(FloatType()), + batch_size=self.getBatchSize(), + ) + return dataset.withColumn(output_col, encode(input_col)) def transform(self, dataset, spark=None): """ @@ -422,55 +451,55 @@ def copy(self, extra=None): """ return self._defaultCopy(extra) - def load_data_food_reviews(self, spark, path=None, limit=1000): - """ - Load data from public dataset and generate 1M rows dataset from 1K data. 
- """ - if path is None: - path = "wasbs://publicwasb@mmlspark.blob.core.windows.net/fine_food_reviews_1k.csv" - file_path = path - - # Check if the row count is less than 10 - if limit <= 0 or limit >= 1000000: - raise ValueError(f"Limit is {limit}, which should be less than 1M.") - - df = spark.read.options(inferSchema="True", delimiter=",", header=True).csv( - file_path - ) - df = df.withColumn( - "combined", - F.format_string( - "Title: %s; Content: %s", F.trim(df.Summary), F.trim(df.Text) - ), - ) - - rowCnt = df.count() - - # Check the conditions - if limit > rowCnt and rowCnt > 1000: - - # Cross-join the DataFrame with itself to create n x n pairs for string concatenation (synthetic data) - cross_joined_df = df.crossJoin( - df.withColumnRenamed("combined", "combined_2") - ) - - # Create a new column 'result_vector' by concatenating the two source vectors - tmp_df = cross_joined_df.withColumn( - "result_vector", - F.concat(F.col("combined"), F.lit(". \n"), F.col("combined_2")), - ) - - # Select only the necessary columns and show the result - tmp_df = tmp_df.select("result_vector") - df = tmp_df.withColumnRenamed("result_vector", "combined").withColumn( - "id", monotonically_increasing_id() - ) - - # Shuffle the DataFrame with a fixed seed - seed = 42 - shuffled_df = df.orderBy(rand(seed)) - - return shuffled_df.limit(limit) + # def load_data_food_reviews(self, spark, path=None, limit=1000): + # """ + # Load data from public dataset and generate 1M rows dataset from 1K data. 
+ # """ + # if path is None: + # path = "wasbs://publicwasb@mmlspark.blob.core.windows.net/fine_food_reviews_1k.csv" + # file_path = path + + # # Check if the row count is less than 10 + # if limit <= 0 or limit >= 1000000: + # raise ValueError(f"Limit is {limit}, which should be less than 1M.") + + # df = spark.read.options(inferSchema="True", delimiter=",", header=True).csv( + # file_path + # ) + # df = df.withColumn( + # "combined", + # F.format_string( + # "Title: %s; Content: %s", F.trim(df.Summary), F.trim(df.Text) + # ), + # ) + + # rowCnt = df.count() + + # # Check the conditions + # if limit > rowCnt and rowCnt > 1000: + + # # Cross-join the DataFrame with itself to create n x n pairs for string concatenation (synthetic data) + # cross_joined_df = df.crossJoin( + # df.withColumnRenamed("combined", "combined_2") + # ) + + # # Create a new column 'result_vector' by concatenating the two source vectors + # tmp_df = cross_joined_df.withColumn( + # "result_vector", + # F.concat(F.col("combined"), F.lit(". 
\n"), F.col("combined_2")), + # ) + + # # Select only the necessary columns and show the result + # tmp_df = tmp_df.select("result_vector") + # df = tmp_df.withColumnRenamed("result_vector", "combined").withColumn( + # "id", monotonically_increasing_id() + # ) + + # # Shuffle the DataFrame with a fixed seed + # seed = 42 + # shuffled_df = df.orderBy(rand(seed)) + + # return shuffled_df.limit(limit) # Example usage: diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb index da7f816906..47216915a4 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb @@ -88,6 +88,71 @@ "print(\"Current time in PST:\", current_time_pst.strftime('%Y-%m-%d %H:%M:%S %Z%z'))\n" ] }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6b6bdb2c-d492-4114-a7e9-0ef2832ac05c", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def load_data_food_reviews(self, spark, path=None, limit=1000):\n", + " \"\"\"\n", + " Load data from public dataset and generate 1M rows dataset from 1K data.\n", + " \"\"\"\n", + " if path is None:\n", + " path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/fine_food_reviews_1k.csv\"\n", + " file_path = path\n", + "\n", + " # Check if the row count is less than 10\n", + " if limit <= 0 or limit >= 1000000:\n", + " raise ValueError(f\"Limit is {limit}, which should be less than 1M.\")\n", + "\n", + " df = spark.read.options(inferSchema=\"True\", delimiter=\",\", header=True).csv(\n", + " file_path\n", + " )\n", + " df = df.withColumn(\n", + " \"combined\",\n", + " F.format_string(\n", + " \"Title: %s; Content: %s\", F.trim(df.Summary), F.trim(df.Text)\n", + " ),\n", + " )\n", + "\n", + " rowCnt = df.count()\n", + "\n", + " # Check the 
conditions\n", + " if limit > rowCnt and rowCnt > 1000:\n", + "\n", + " # Cross-join the DataFrame with itself to create n x n pairs for string concatenation (synthetic data)\n", + " cross_joined_df = df.crossJoin(\n", + " df.withColumnRenamed(\"combined\", \"combined_2\")\n", + " )\n", + "\n", + " # Create a new column 'result_vector' by concatenating the two source vectors\n", + " tmp_df = cross_joined_df.withColumn(\n", + " \"result_vector\",\n", + " F.concat(F.col(\"combined\"), F.lit(\". \\n\"), F.col(\"combined_2\")),\n", + " )\n", + "\n", + " # Select only the necessary columns and show the result\n", + " tmp_df = tmp_df.select(\"result_vector\")\n", + " df = tmp_df.withColumnRenamed(\"result_vector\", \"combined\").withColumn(\n", + " \"id\", monotonically_increasing_id()\n", + " )\n", + "\n", + " # Shuffle the DataFrame with a fixed seed\n", + " seed = 42\n", + " shuffled_df = df.orderBy(rand(seed))\n", + "\n", + " return shuffled_df.limit(limit)\n" + ] + }, { "cell_type": "markdown", "metadata": { From 09af0e0df3ca6e9690300047340750c038069b14 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 14 Jun 2024 20:10:07 +0000 Subject: [PATCH 04/83] Source temp fixes --- .../synapse/ml/HuggingFaceSentenceEmbedder.py | 109 +++++++++--------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py index b73f03eb71..f8c8d5e2c5 100644 --- a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py @@ -62,9 +62,9 @@ class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): # "True if run encode only on Driver for small queries", # ) batchSize = Param(Params._dummy(), "batchSize", "Batch size for embeddings", int) - modelName = Param(Params._dummy(), "modelName", "Model Name parameter") - moduleName = 
Param(Params._dummy(), "moduleName", "Module Name parameter") - model = Param(Params._dummy(), "model", "Model used for embedding") + modelName = Param(Params._dummy(), "modelName", "Full Model Name parameter") + # moduleName = Param(Params._dummy(), "moduleName", "Module Name parameter") + # model = Param(Params._dummy(), "model", "Model used for embedding") path = Param(Params._dummy(), "path", "Path to .csv file with data") class _SentenceTransformerNavigator(SentenceTransformer): @@ -183,8 +183,9 @@ def __init__( runtime=None, # driverOnly=False, batchSize=16, - model=None, - modelName="intfloat/e5-large-v2", + # model=None, + # modelName="intfloat/e5-large-v2", + modelName=None, # moduleName= modelName.split('/')[1] ): """ @@ -198,7 +199,7 @@ def __init__( # driverOnly=False, modelName=modelName, # moduleName=moduleName, - model=None, + # model=None, batchSize=16, ) self._set( @@ -207,7 +208,7 @@ def __init__( runtime=runtime, # driverOnly=driverOnly, batchSize=batchSize, - model=model, + # model=model, modelName=modelName, # moduleName=moduleName, ) @@ -253,14 +254,14 @@ def getRuntime(self): # def getDriverOnly(self): # return self.getOrDefault(self.driverOnly) - # Setter method for model - def setModel(self, value): - self._paramMap[self.model] = value - return self + # # Setter method for model + # def setModel(self, value): + # self._paramMap[self.model] = value + # return self - # Getter method for model - def getModel(self): - return self.getOrDefault(self.model) + # # Getter method for model + # def getModel(self): + # return self.getOrDefault(self.model) # Setter method for modelName def setModelName(self, value): @@ -302,7 +303,7 @@ def _gen_size_chunk(): """ Generate chunks of different batch sizes and sentence lengths. 
""" - for batch_size in [64, 32, 16, 8, 4, 2, 1]: + for batch_size in [64]: for sentence_length in [20, 300, 512]: yield (batch_size, sentence_length) @@ -326,52 +327,52 @@ def _get_dataloader(repeat_times: int = 2): ) i += 1 - total_batches = len(list(gen_size_chunk())) + total_batches = len(list(_gen_size_chunk())) func = lambda x, **kwargs: self._SentenceTransformerNavigator.encode( model, x, **kwargs ) nav.optimize( - func, dataloader=tqdm(get_dataloader(), total=total_batches), config=conf + func, dataloader=tqdm(_get_dataloader(), total=total_batches), config=conf ) - def _run_on_driver(self, queries, spark): - """ - Method to run on the driver to generate embeddings and create a DataFrame. - """ - if spark is None: - raise ValueError("Spark context should be set") - - # Load the model on the driver - model = SentenceTransformer(self.getModelName()).eval() - - # Generate embeddings - with torch.no_grad(): - embeddings = [embedding.tolist() for embedding in model.encode(queries)] - - # Prepare data including IDs - data_with_ids = [ - (i + 1, query, embeddings[i]) for i, query in enumerate(queries) - ] - - # Define the schema for the DataFrame - schema = StructType( - [ - StructField("id", IntegerType(), nullable=False), - StructField("query", StringType(), nullable=False), - StructField( - "embeddings", - ArrayType(FloatType(), containsNull=False), - nullable=False, - ), - ] - ) + # def _run_on_driver(self, queries, spark): + # """ + # Method to run on the driver to generate embeddings and create a DataFrame. 
+ # """ + # if spark is None: + # raise ValueError("Spark context should be set") + + # # Load the model on the driver + # model = SentenceTransformer(self.getModelName()).eval() + + # # Generate embeddings + # with torch.no_grad(): + # embeddings = [embedding.tolist() for embedding in model.encode(queries)] + + # # Prepare data including IDs + # data_with_ids = [ + # (i + 1, query, embeddings[i]) for i, query in enumerate(queries) + # ] + + # # Define the schema for the DataFrame + # schema = StructType( + # [ + # StructField("id", IntegerType(), nullable=False), + # StructField("query", StringType(), nullable=False), + # StructField( + # "embeddings", + # ArrayType(FloatType(), containsNull=False), + # nullable=False, + # ), + # ] + # ) - # Create a DataFrame using the data with IDs and the schema - query_embeddings = spark.createDataFrame( - data=data_with_ids, schema=schema - ).cache() + # # Create a DataFrame using the data with IDs and the schema + # query_embeddings = spark.createDataFrame( + # data=data_with_ids, schema=schema + # ).cache() - return query_embeddings + # return query_embeddings def _predict_batch_fn(self): """ @@ -384,7 +385,7 @@ def _predict_batch_fn(self): if runtime == 'tensorrt': moduleName = modelName.split('/')[1] model = self._SentenceTransformerNavigator(modelName).eval() - model = nav.Module(model, name=moduleName) + model = nav.Module(model, name=model.name) try: nav.load_optimized() except Exception: From a485010884d051ee29b54eb5fb70fb0d04f552b8 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 14 Jun 2024 20:24:52 +0000 Subject: [PATCH 05/83] Formating --- .../synapse/ml/HuggingFaceSentenceEmbedder.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py index f8c8d5e2c5..c07218345c 100644 --- 
a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py @@ -45,6 +45,7 @@ FloatType, ) + class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): """ Custom transformer that extends PySpark's Transformer class to @@ -54,7 +55,11 @@ class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): # Define additional parameters # useTRT = Param(Params._dummy(), "useTRT", "True if use TRT acceleration") - runtime = Param(Params._dummy(), "runtime", "Specifies the runtime environment: cpu, cuda, or tensorrt") + runtime = Param( + Params._dummy(), + "runtime", + "Specifies the runtime environment: cpu, cuda, or tensorrt", + ) # driverOnly = Param( # Params._dummy(), @@ -195,7 +200,7 @@ def __init__( self._setDefault( # inputCol="combined", # outputCol="embeddings", - runtime='cpu', + runtime="cpu", # driverOnly=False, modelName=modelName, # moduleName=moduleName, @@ -229,10 +234,12 @@ def setRuntime(self, value): Sets the runtime environment for the model. Supported values: 'cpu', 'cuda', 'tensorrt' """ - if value not in ['cpu', 'cuda', 'tensorrt']: - raise ValueError("Invalid runtime specified. Choose from 'cpu', 'cuda', 'tensorrt'") + if value not in ["cpu", "cuda", "tensorrt"]: + raise ValueError( + "Invalid runtime specified. 
Choose from 'cpu', 'cuda', 'tensorrt'" + ) self.setOrDefault(self.runtime, value) - + def getRuntime(self): return self.getOrDefault(self.runtime) @@ -382,8 +389,8 @@ def _predict_batch_fn(self): if "model" not in globals(): global model modelName = self.getModelName() - if runtime == 'tensorrt': - moduleName = modelName.split('/')[1] + if runtime == "tensorrt": + moduleName = modelName.split("/")[1] model = self._SentenceTransformerNavigator(modelName).eval() model = nav.Module(model, name=model.name) try: @@ -507,4 +514,4 @@ def copy(self, extra=None): # data = input data frame # transformer = EmbeddingTransformer(inputCol="combined", outputCol="embeddings") # result = transformer.transform(data) -# result.show() +# result.show() \ No newline at end of file From eae2aeeaa6ef22c67bb23cba6f58526ace39206e Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 14 Jun 2024 22:24:05 +0000 Subject: [PATCH 06/83] First test --- .../synapse/ml/HuggingFaceSentenceEmbedder.py | 5 +- .../Quickstart - Custom Embeddins.ipynb | 220 ++++++++++-------- 2 files changed, 120 insertions(+), 105 deletions(-) diff --git a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py index c07218345c..e167f29d06 100644 --- a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py @@ -70,7 +70,7 @@ class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): modelName = Param(Params._dummy(), "modelName", "Full Model Name parameter") # moduleName = Param(Params._dummy(), "moduleName", "Module Name parameter") # model = Param(Params._dummy(), "model", "Model used for embedding") - path = Param(Params._dummy(), "path", "Path to .csv file with data") + # path = Param(Params._dummy(), "path", "Path to .csv file with data") class _SentenceTransformerNavigator(SentenceTransformer): """ @@ -392,7 +392,8 
@@ def _predict_batch_fn(self): if runtime == "tensorrt": moduleName = modelName.split("/")[1] model = self._SentenceTransformerNavigator(modelName).eval() - model = nav.Module(model, name=model.name) + # model = nav.Module(model, name=model.name) + model = nav.Module(model, name=moduleName) try: nav.load_optimized() except Exception: diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb index 47216915a4..c814bd7d96 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb @@ -59,6 +59,11 @@ }, "outputs": [], "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning, module='tritonclient.grpc')\n", + "import logging\n", + "logging.getLogger('py4j').setLevel(logging.ERROR)\n", + "\n", "import sys\n", "# Assuming 'deep-learning' is at the root of your Databricks Repo\n", "repo_path = '/Workspace/Repos/aspiridonov@nvidia.com/SynapseML-db1/deep-learning/src/main/python'\n", @@ -66,91 +71,29 @@ " sys.path.append(repo_path)\n", "from synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", "\n", - "import warnings\n", - "warnings.filterwarnings(\"ignore\", category=UserWarning, module='tritonclient.grpc')\n", - "import logging\n", - "logging.getLogger('py4j').setLevel(logging.ERROR)\n", + "\n", "import mlflow\n", - "import datetime\n", - "import pytz\n", + "import pyspark.sql.functions as F\n", + "from pyspark.sql.functions import monotonically_increasing_id\n", + "from datetime import datetime\n", + "from pyspark.sql.types import (\n", + " StructType,\n", + " StructField,\n", + " IntegerType,\n", + " StringType,\n", + " ArrayType,\n", + " FloatType,\n", + ")\n", + "\n", "from spark_rapids_ml.knn import ApproximateNearestNeighbors, ApproximateNearestNeighborsModel\n", "\n", 
"logging.getLogger('sentence_transformers.SentenceTransformer').setLevel(logging.ERROR)\n", "mlflow.autolog(disable=True)\n", "\n", - "# Define the PST timezone\n", - "pst_timezone = pytz.timezone('US/Pacific')\n", - "\n", - "# Get the current time in UTC and convert it to PST\n", - "current_start_time_utc = datetime.datetime.now(pytz.utc)\n", - "current_time_pst = current_start_time_utc.astimezone(pst_timezone)\n", + "# Record the start time\n", + "start_time = datetime.now()\n", "\n", - "print(\"Current time in PST:\", current_time_pst.strftime('%Y-%m-%d %H:%M:%S %Z%z'))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "6b6bdb2c-d492-4114-a7e9-0ef2832ac05c", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "def load_data_food_reviews(self, spark, path=None, limit=1000):\n", - " \"\"\"\n", - " Load data from public dataset and generate 1M rows dataset from 1K data.\n", - " \"\"\"\n", - " if path is None:\n", - " path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/fine_food_reviews_1k.csv\"\n", - " file_path = path\n", - "\n", - " # Check if the row count is less than 10\n", - " if limit <= 0 or limit >= 1000000:\n", - " raise ValueError(f\"Limit is {limit}, which should be less than 1M.\")\n", - "\n", - " df = spark.read.options(inferSchema=\"True\", delimiter=\",\", header=True).csv(\n", - " file_path\n", - " )\n", - " df = df.withColumn(\n", - " \"combined\",\n", - " F.format_string(\n", - " \"Title: %s; Content: %s\", F.trim(df.Summary), F.trim(df.Text)\n", - " ),\n", - " )\n", - "\n", - " rowCnt = df.count()\n", - "\n", - " # Check the conditions\n", - " if limit > rowCnt and rowCnt > 1000:\n", - "\n", - " # Cross-join the DataFrame with itself to create n x n pairs for string concatenation (synthetic data)\n", - " cross_joined_df = df.crossJoin(\n", - " df.withColumnRenamed(\"combined\", 
\"combined_2\")\n", - " )\n", - "\n", - " # Create a new column 'result_vector' by concatenating the two source vectors\n", - " tmp_df = cross_joined_df.withColumn(\n", - " \"result_vector\",\n", - " F.concat(F.col(\"combined\"), F.lit(\". \\n\"), F.col(\"combined_2\")),\n", - " )\n", - "\n", - " # Select only the necessary columns and show the result\n", - " tmp_df = tmp_df.select(\"result_vector\")\n", - " df = tmp_df.withColumnRenamed(\"result_vector\", \"combined\").withColumn(\n", - " \"id\", monotonically_increasing_id()\n", - " )\n", - "\n", - " # Shuffle the DataFrame with a fixed seed\n", - " seed = 42\n", - " shuffled_df = df.orderBy(rand(seed))\n", - "\n", - " return shuffled_df.limit(limit)\n" + "print(f\"Demo started\")" ] }, { @@ -162,15 +105,19 @@ "rowLimit": 10000 }, "inputWidgets": {}, - "nuid": "22603456-4f44-4b3a-9751-9ca5231b799b", + "nuid": "42117315-a245-491a-b330-f8257d6fb35c", "showTitle": false, "title": "" } }, "source": [ - "## Step 2: Load Data\n", + "## Step 2: Load Input Data\n", + "\n", + "It will load public dataset and generate extra syntetic rows if set by size parameter\n", + "\n", + "The loaded dataset has 1000 rows. 
If you specify number_of_input_rows in [1..1000] it will cut extra rows if needed\n", "\n", - "In this demo we will explore a dataset of fine food reviews" + "If number_of_input_rows in [1000..1000000] it will generate extra rows using cross join of original data" ] }, { @@ -183,17 +130,57 @@ "rowLimit": 10000 }, "inputWidgets": {}, - "nuid": "a0065427-a0d5-4867-8209-61969dc48082", + "nuid": "6b6bdb2c-d492-4114-a7e9-0ef2832ac05c", "showTitle": false, "title": "" } }, "outputs": [], "source": [ - "dataTransformer = HuggingFaceSentenceEmbedder(inputCol=\"combined\", outputCol=\"embeddings\", useTRTFlag=True, batchSize=16)\n", + "file_path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/fine_food_reviews_1k.csv\"\n", + "\n", + "df = spark.read.options(inferSchema=\"True\", delimiter=\",\", header=True).csv(\n", + " file_path\n", + ")\n", + "df = df.withColumn(\n", + " \"combined\",\n", + " F.format_string(\n", + " \"Title: %s; Content: %s\", F.trim(df.Summary), F.trim(df.Text)\n", + " ),\n", + ")\n", + "\n", + "number_of_input_rows = 100\n", + "\n", + "# Check if the row count is less than 10\n", + "if number_of_input_rows <= 0 or number_of_input_rows >= 1000000:\n", + " raise ValueError(f\"Limit is {number_of_input_rows}, which should be less than 1M.\")\n", + "\n", + "if number_of_input_rows > 1000:\n", + "\n", + " # Cross-join the DataFrame with itself to create n x n pairs for string concatenation (synthetic data)\n", + " cross_joined_df = df.crossJoin(\n", + " df.withColumnRenamed(\"combined\", \"combined_2\")\n", + " )\n", + "\n", + " # Create a new column 'result_vector' by concatenating the two source vectors\n", + " tmp_df = cross_joined_df.withColumn(\n", + " \"result_vector\",\n", + " F.concat(F.col(\"combined\"), F.lit(\". 
\\n\"), F.col(\"combined_2\")),\n", + " )\n", + "\n", + " # Select only the necessary columns and show the result\n", + " tmp_df = tmp_df.select(\"result_vector\")\n", + " df = tmp_df.withColumnRenamed(\"result_vector\", \"combined\").withColumn(\n", + " \"id\", monotonically_increasing_id()\n", + " )\n", + "\n", + "# # Shuffle the DataFrame with a fixed seed\n", + "# seed = 42\n", + "# shuffled_df = df.orderBy(rand(seed))\n", "\n", - "# Load food revies with limiting number of rows until 1000000\n", - "df = dataTransformer.load_data_food_reviews(spark=spark, limit=100).repartition(10).cache()" + "df = df.limit(number_of_input_rows).repartition(10).cache()\n", + "\n", + "print(f\"Loaded: {number_of_input_rows} rows\")" ] }, { @@ -233,7 +220,9 @@ }, "outputs": [], "source": [ - "all_embeddings = dataTransformer.transform(df)" + "dataTransformer = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"combined\", outputCol=\"embeddings\", runtime=\"tensorrt\")\n", + "\n", + "all_embeddings = dataTransformer.transform(df).cache()" ] }, { @@ -273,14 +262,26 @@ }, "outputs": [], "source": [ - "# Sample queries\n", + "\n", + "# Sample query\n", "queries = [\"desserts\", \"disgusting\"]\n", + "ids = [1, 2]\n", + "\n", + "# Combine the data into a list of tuples\n", + "data = list(zip(ids, queries))\n", + "\n", + "# Define the schema for the DataFrame\n", + "schema = StructType([\n", + " StructField(\"id\", IntegerType(), nullable=False),\n", + " StructField(\"query\", StringType(), nullable=False)\n", + "])\n", + "\n", + "# Create the DataFrame\n", + "qDf = spark.createDataFrame(data, schema)\n", "\n", - "# Create an instance of the EmbeddingTransformer to encode embeddings on drive only\n", - "# to speed it up processing small amout of queries\n", - "#embedding_transformer = EmbeddingTransformer(driverOnly=True, spark=spark)\n", - "embedding_transformer = HuggingFaceSentenceEmbedder(driverOnly=True)\n", - "query_embeddings = 
embedding_transformer.transform(queries, spark=spark)" + "queryTransformer = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"query\", outputCol=\"embeddings\", runtime=\"cpu\")\n", + "\n", + "query_embeddings = queryTransformer.transform(qDf).cache()" ] }, { @@ -405,21 +406,34 @@ "\n", "print(f\"Demo finished\")\n", "\n", - "# Get the current time in UTC and convert it to PST\n", - "current_end_time_utc = datetime.datetime.now(pytz.utc)\n", - "current_time_pst = current_end_time_utc.astimezone(pst_timezone)\n", + "# # Get the current time in UTC and convert it to PST\n", + "# current_end_time_utc = datetime.datetime.now(pytz.utc)\n", + "# current_time_pst = current_end_time_utc.astimezone(pst_timezone)\n", + "\n", + "# print(\"Current time in PST:\", current_time_pst.strftime('%Y-%m-%d %H:%M:%S %Z%z'))\n", + "\n", + "# dif = current_end_time_utc - current_start_time_utc\n", + "\n", + "# # Extract hours, minutes, and seconds from the difference\n", + "# total_seconds = int(dif.total_seconds())\n", + "# hours, remainder = divmod(total_seconds, 3600)\n", + "# minutes, seconds = divmod(remainder, 60)\n", + "\n", + "# Record the end time\n", + "end_time = datetime.now()\n", + "\n", + "# Calculate the duration\n", + "duration = end_time - start_time\n", "\n", - "print(\"Current time in PST:\", current_time_pst.strftime('%Y-%m-%d %H:%M:%S %Z%z'))\n", + "# # Display the duration\n", + "# print(f\"Application duration: {duration}\")\n", "\n", - "dif = current_end_time_utc - current_start_time_utc\n", + "# Optionally, display the duration in seconds\n", + "duration_in_seconds = duration.total_seconds()\n", + "print(f\"Application duration: {duration_in_seconds:.2f} seconds\")\n", "\n", - "# Extract hours, minutes, and seconds from the difference\n", - "total_seconds = int(dif.total_seconds())\n", - "hours, remainder = divmod(total_seconds, 3600)\n", - "minutes, seconds = divmod(remainder, 60)\n", "\n", - "# Print the difference in the desired 
format\n", - "print(f\"Difference: h: {hours}, min: {minutes}, sec: {seconds}\")" + "\n" ] } ], From 2cdfb5998a533962b5c608d4e119a063df75d6f3 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 14 Jun 2024 22:27:59 +0000 Subject: [PATCH 07/83] Name changes --- ...ns.ipynb => Quickstart - NVIDIA Accelerated Embeddins.ipynb} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/Explore Algorithms/OpenAI/{Quickstart - Custom Embeddins.ipynb => Quickstart - NVIDIA Accelerated Embeddins.ipynb} (99%) diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - NVIDIA Accelerated Embeddins.ipynb similarity index 99% rename from docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb rename to docs/Explore Algorithms/OpenAI/Quickstart - NVIDIA Accelerated Embeddins.ipynb index c814bd7d96..24f3bffcc1 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddins.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - NVIDIA Accelerated Embeddins.ipynb @@ -452,7 +452,7 @@ "pythonIndentUnit": 2, "widgetLayout": [] }, - "notebookName": "Quickstart - Custom Embeddins", + "notebookName": "Quickstart - NVIDIA Accelerated Embeddins", "widgets": {} }, "kernel_info": { From 88d34b895e55a2d896ebe1cca6f0031f5d2ef6b3 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 14 Jun 2024 23:16:11 +0000 Subject: [PATCH 08/83] With two models --- ...start - NVIDIA Accelerated Embeddins.ipynb | 31 ++++--------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - NVIDIA Accelerated Embeddins.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - NVIDIA Accelerated Embeddins.ipynb index 24f3bffcc1..b113d954dd 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - NVIDIA Accelerated Embeddins.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - NVIDIA Accelerated Embeddins.ipynb @@ -71,7 +71,6 @@ " sys.path.append(repo_path)\n", "from 
synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", "\n", - "\n", "import mlflow\n", "import pyspark.sql.functions as F\n", "from pyspark.sql.functions import monotonically_increasing_id\n", @@ -149,7 +148,7 @@ " ),\n", ")\n", "\n", - "number_of_input_rows = 100\n", + "number_of_input_rows = 100000\n", "\n", "# Check if the row count is less than 10\n", "if number_of_input_rows <= 0 or number_of_input_rows >= 1000000:\n", @@ -220,7 +219,8 @@ }, "outputs": [], "source": [ - "dataTransformer = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"combined\", outputCol=\"embeddings\", runtime=\"tensorrt\")\n", + "# dataTransformer = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"combined\", outputCol=\"embeddings\", runtime=\"tensorrt\")\n", + "dataTransformer = HuggingFaceSentenceEmbedder(modelName=\"sentence-transformers/all-MiniLM-L6-v2\", inputCol=\"combined\", outputCol=\"embeddings\", runtime=\"tensorrt\")\n", "\n", "all_embeddings = dataTransformer.transform(df).cache()" ] @@ -279,8 +279,8 @@ "# Create the DataFrame\n", "qDf = spark.createDataFrame(data, schema)\n", "\n", - "queryTransformer = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"query\", outputCol=\"embeddings\", runtime=\"cpu\")\n", - "\n", + "# queryTransformer = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"query\", outputCol=\"embeddings\", runtime=\"cpu\")\n", + "queryTransformer = HuggingFaceSentenceEmbedder(modelName=\"sentence-transformers/all-MiniLM-L6-v2\", inputCol=\"query\", outputCol=\"embeddings\", runtime=\"cpu\")\n", "query_embeddings = queryTransformer.transform(qDf).cache()" ] }, @@ -406,34 +406,15 @@ "\n", "print(f\"Demo finished\")\n", "\n", - "# # Get the current time in UTC and convert it to PST\n", - "# current_end_time_utc = datetime.datetime.now(pytz.utc)\n", - "# current_time_pst = current_end_time_utc.astimezone(pst_timezone)\n", - "\n", - "# 
print(\"Current time in PST:\", current_time_pst.strftime('%Y-%m-%d %H:%M:%S %Z%z'))\n", - "\n", - "# dif = current_end_time_utc - current_start_time_utc\n", - "\n", - "# # Extract hours, minutes, and seconds from the difference\n", - "# total_seconds = int(dif.total_seconds())\n", - "# hours, remainder = divmod(total_seconds, 3600)\n", - "# minutes, seconds = divmod(remainder, 60)\n", - "\n", "# Record the end time\n", "end_time = datetime.now()\n", "\n", "# Calculate the duration\n", "duration = end_time - start_time\n", "\n", - "# # Display the duration\n", - "# print(f\"Application duration: {duration}\")\n", - "\n", "# Optionally, display the duration in seconds\n", "duration_in_seconds = duration.total_seconds()\n", - "print(f\"Application duration: {duration_in_seconds:.2f} seconds\")\n", - "\n", - "\n", - "\n" + "print(f\"Application duration: {duration_in_seconds:.2f} seconds\")" ] } ], From 052f39bf5a53e9d9cf5ac695f680b1b393bf161b Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Sun, 16 Jun 2024 04:53:02 +0000 Subject: [PATCH 09/83] Source style corrections --- .../synapse/ml/HuggingFaceSentenceEmbedder.py | 175 +----------------- ...rt - GPU Custom Embeddings with ANN.ipynb} | 2 +- 2 files changed, 3 insertions(+), 174 deletions(-) rename docs/Explore Algorithms/OpenAI/{Quickstart - NVIDIA Accelerated Embeddins.ipynb => Quickstart - GPU Custom Embeddings with ANN.ipynb} (99%) diff --git a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py index e167f29d06..3a6110adb7 100644 --- a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py @@ -45,7 +45,6 @@ FloatType, ) - class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): """ Custom transformer that extends PySpark's Transformer class to @@ -53,24 +52,13 @@ class 
HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): """ # Define additional parameters - # useTRT = Param(Params._dummy(), "useTRT", "True if use TRT acceleration") - runtime = Param( Params._dummy(), "runtime", "Specifies the runtime environment: cpu, cuda, or tensorrt", ) - - # driverOnly = Param( - # Params._dummy(), - # "driverOnly", - # "True if run encode only on Driver for small queries", - # ) batchSize = Param(Params._dummy(), "batchSize", "Batch size for embeddings", int) modelName = Param(Params._dummy(), "modelName", "Full Model Name parameter") - # moduleName = Param(Params._dummy(), "moduleName", "Module Name parameter") - # model = Param(Params._dummy(), "model", "Model used for embedding") - # path = Param(Params._dummy(), "path", "Path to .csv file with data") class _SentenceTransformerNavigator(SentenceTransformer): """ @@ -186,36 +174,24 @@ def __init__( inputCol=None, outputCol=None, runtime=None, - # driverOnly=False, batchSize=16, - # model=None, - # modelName="intfloat/e5-large-v2", modelName=None, - # moduleName= modelName.split('/')[1] ): """ Initialize the HuggingFaceSentenceEmbedder with input/output columns and optional TRT flag. 
""" super(HuggingFaceSentenceEmbedder, self).__init__() self._setDefault( - # inputCol="combined", - # outputCol="embeddings", runtime="cpu", - # driverOnly=False, modelName=modelName, - # moduleName=moduleName, - # model=None, batchSize=16, ) self._set( inputCol=inputCol, outputCol=outputCol, runtime=runtime, - # driverOnly=driverOnly, batchSize=batchSize, - # model=model, modelName=modelName, - # moduleName=moduleName, ) # Setter method for batchSize @@ -243,33 +219,6 @@ def setRuntime(self, value): def getRuntime(self): return self.getOrDefault(self.runtime) - # # Setter method for useTRT - # def setUseTRT(self, value): - # self._set(useTRT=value) - # return self - - # # Getter method for useTRT - # def getUseTRT(self): - # return self.getOrDefault(self.useTRT) - - # # Setter method for driverOnly - # def setDriverOnly(self, value): - # self._set(driverOnly=value) - # return self - - # # Getter method for driverOnly - # def getDriverOnly(self): - # return self.getOrDefault(self.driverOnly) - - # # Setter method for model - # def setModel(self, value): - # self._paramMap[self.model] = value - # return self - - # # Getter method for model - # def getModel(self): - # return self.getOrDefault(self.model) - # Setter method for modelName def setModelName(self, value): self._set(modelName=value) @@ -279,19 +228,8 @@ def setModelName(self, value): def getModelName(self): return self.getOrDefault(self.modelName) - # # Setter method for moduleName - # def setModuleName(self, value): - # self._set(moduleName=value) - # return self - - # # Getter method for moduleName - # def getModuleName(self): - # return self.getOrDefault(self.moduleName) - + # Optimize the model using Model Navigator with TensorRT configuration. def _optimize(self, model): - """ - Optimize the model using Model Navigator with TensorRT configuration. 
- """ conf = nav.OptimizeConfig( target_formats=(nav.Format.TENSORRT,), runners=("TensorRT",), @@ -335,52 +273,13 @@ def _get_dataloader(repeat_times: int = 2): i += 1 total_batches = len(list(_gen_size_chunk())) - func = lambda x, **kwargs: self._SentenceTransformerNavigator.encode( + func = lambda x, **kwargs: self._SentenceTransformerNavigator._encode( model, x, **kwargs ) nav.optimize( func, dataloader=tqdm(_get_dataloader(), total=total_batches), config=conf ) - # def _run_on_driver(self, queries, spark): - # """ - # Method to run on the driver to generate embeddings and create a DataFrame. - # """ - # if spark is None: - # raise ValueError("Spark context should be set") - - # # Load the model on the driver - # model = SentenceTransformer(self.getModelName()).eval() - - # # Generate embeddings - # with torch.no_grad(): - # embeddings = [embedding.tolist() for embedding in model.encode(queries)] - - # # Prepare data including IDs - # data_with_ids = [ - # (i + 1, query, embeddings[i]) for i, query in enumerate(queries) - # ] - - # # Define the schema for the DataFrame - # schema = StructType( - # [ - # StructField("id", IntegerType(), nullable=False), - # StructField("query", StringType(), nullable=False), - # StructField( - # "embeddings", - # ArrayType(FloatType(), containsNull=False), - # nullable=False, - # ), - # ] - # ) - - # # Create a DataFrame using the data with IDs and the schema - # query_embeddings = spark.createDataFrame( - # data=data_with_ids, schema=schema - # ).cache() - - # return query_embeddings - def _predict_batch_fn(self): """ Create and return a function for batch prediction. 
@@ -392,17 +291,14 @@ def _predict_batch_fn(self): if runtime == "tensorrt": moduleName = modelName.split("/")[1] model = self._SentenceTransformerNavigator(modelName).eval() - # model = nav.Module(model, name=model.name) model = nav.Module(model, name=moduleName) try: nav.load_optimized() except Exception: self._optimize(model) nav.load_optimized() - print("create trt model") else: model = SentenceTransformer(modelName).eval() - print("create ST model") def predict(inputs): """ @@ -422,22 +318,6 @@ def _transform(self, dataset, spark): """ Apply the transformation to the input dataset. """ - # driverOnly = self.getDriverOnly() - - # if driverOnly is None or driverOnly is False: - # input_col = self.getInputCol() - # output_col = self.getOutputCol() - - # encode = predict_batch_udf( - # self._predict_batch_fn, - # return_type=ArrayType(FloatType()), - # batch_size=self.getBatchSize(), - # ) - # return dataset.withColumn(output_col, encode(input_col)) - # else: - # if spark is None: - # raise ValueError("Spark context should be set") - # return self.run_on_driver(dataset, spark=spark) input_col = self.getInputCol() output_col = self.getOutputCol() @@ -460,57 +340,6 @@ def copy(self, extra=None): """ return self._defaultCopy(extra) - # def load_data_food_reviews(self, spark, path=None, limit=1000): - # """ - # Load data from public dataset and generate 1M rows dataset from 1K data. 
- # """ - # if path is None: - # path = "wasbs://publicwasb@mmlspark.blob.core.windows.net/fine_food_reviews_1k.csv" - # file_path = path - - # # Check if the row count is less than 10 - # if limit <= 0 or limit >= 1000000: - # raise ValueError(f"Limit is {limit}, which should be less than 1M.") - - # df = spark.read.options(inferSchema="True", delimiter=",", header=True).csv( - # file_path - # ) - # df = df.withColumn( - # "combined", - # F.format_string( - # "Title: %s; Content: %s", F.trim(df.Summary), F.trim(df.Text) - # ), - # ) - - # rowCnt = df.count() - - # # Check the conditions - # if limit > rowCnt and rowCnt > 1000: - - # # Cross-join the DataFrame with itself to create n x n pairs for string concatenation (synthetic data) - # cross_joined_df = df.crossJoin( - # df.withColumnRenamed("combined", "combined_2") - # ) - - # # Create a new column 'result_vector' by concatenating the two source vectors - # tmp_df = cross_joined_df.withColumn( - # "result_vector", - # F.concat(F.col("combined"), F.lit(". 
\n"), F.col("combined_2")), - # ) - - # # Select only the necessary columns and show the result - # tmp_df = tmp_df.select("result_vector") - # df = tmp_df.withColumnRenamed("result_vector", "combined").withColumn( - # "id", monotonically_increasing_id() - # ) - - # # Shuffle the DataFrame with a fixed seed - # seed = 42 - # shuffled_df = df.orderBy(rand(seed)) - - # return shuffled_df.limit(limit) - - # Example usage: # data = input data frame # transformer = EmbeddingTransformer(inputCol="combined", outputCol="embeddings") diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - NVIDIA Accelerated Embeddins.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - GPU Custom Embeddings with ANN.ipynb similarity index 99% rename from docs/Explore Algorithms/OpenAI/Quickstart - NVIDIA Accelerated Embeddins.ipynb rename to docs/Explore Algorithms/OpenAI/Quickstart - GPU Custom Embeddings with ANN.ipynb index b113d954dd..1a1c7f0a0d 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - NVIDIA Accelerated Embeddins.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - GPU Custom Embeddings with ANN.ipynb @@ -433,7 +433,7 @@ "pythonIndentUnit": 2, "widgetLayout": [] }, - "notebookName": "Quickstart - NVIDIA Accelerated Embeddins", + "notebookName": "Quickstart - GPU Custom Embeddings with ANN", "widgets": {} }, "kernel_info": { From ac7bc672b39beb056813f6bc7cdcac0356c4a994 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Sun, 16 Jun 2024 18:05:23 +0000 Subject: [PATCH 10/83] Name change --- ...nb => Quickstart - GPU Accelerated Embeddings and KNN.ipynb} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/Explore Algorithms/OpenAI/{Quickstart - GPU Custom Embeddings with ANN.ipynb => Quickstart - GPU Accelerated Embeddings and KNN.ipynb} (99%) diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - GPU Custom Embeddings with ANN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - GPU Accelerated Embeddings and KNN.ipynb similarity index 99% rename from 
docs/Explore Algorithms/OpenAI/Quickstart - GPU Custom Embeddings with ANN.ipynb rename to docs/Explore Algorithms/OpenAI/Quickstart - GPU Accelerated Embeddings and KNN.ipynb index 1a1c7f0a0d..724dc98f51 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - GPU Custom Embeddings with ANN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - GPU Accelerated Embeddings and KNN.ipynb @@ -433,7 +433,7 @@ "pythonIndentUnit": 2, "widgetLayout": [] }, - "notebookName": "Quickstart - GPU Custom Embeddings with ANN", + "notebookName": "Quickstart - GPU Accelerated Embeddings and KNN", "widgets": {} }, "kernel_info": { From 52a15818f029b20a7b18a8969d4bb5131a288e99 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Sun, 16 Jun 2024 22:53:51 +0000 Subject: [PATCH 11/83] Name change --- ...=> Quickstart - Custom Embeddings and Approximate KNN.ipynb} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/Explore Algorithms/OpenAI/{Quickstart - GPU Accelerated Embeddings and KNN.ipynb => Quickstart - Custom Embeddings and Approximate KNN.ipynb} (99%) diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - GPU Accelerated Embeddings and KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb similarity index 99% rename from docs/Explore Algorithms/OpenAI/Quickstart - GPU Accelerated Embeddings and KNN.ipynb rename to docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb index 724dc98f51..76ddc5fde1 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - GPU Accelerated Embeddings and KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb @@ -433,7 +433,7 @@ "pythonIndentUnit": 2, "widgetLayout": [] }, - "notebookName": "Quickstart - GPU Accelerated Embeddings and KNN", + "notebookName": "Quickstart - Custom Embeddings and Approximate KNN", "widgets": {} }, "kernel_info": { From 1bd45ab679acde05c8141b634ce4af7304d5563d Mon Sep 17 00:00:00 
2001 From: bvonodiripsa Date: Mon, 17 Jun 2024 19:19:58 +0000 Subject: [PATCH 12/83] Merge init scripts --- tools/init_scripts/init-rapidsml-cuda-11.8.sh | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/tools/init_scripts/init-rapidsml-cuda-11.8.sh b/tools/init_scripts/init-rapidsml-cuda-11.8.sh index dd702d3e4b..2822d74aa4 100644 --- a/tools/init_scripts/init-rapidsml-cuda-11.8.sh +++ b/tools/init_scripts/init-rapidsml-cuda-11.8.sh @@ -1,11 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + #!/bin/bash -# set portion of path below after /dbfs/ to dbfs zip file location -SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file # IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10 # also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 
23.08.2 and not 23.8.2) RAPIDS_VERSION=23.10.0 SPARK_RAPIDS_VERSION=23.10.0 +SPARK_RAPIDSML_VERSION=24.04 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar @@ -28,6 +41,8 @@ ln -s /usr/local/cuda-11.8 /usr/local/cuda rmm-cu11~=${RAPIDS_VERSION} \ --extra-index-url=https://pypi.nvidia.com -# install spark-rapids-ml -/databricks/python/bin/pip install spark-rapids-ml +# install model navigator +/databricks/python/bin/pip install --extra-index-url https://pypi.nvidia.com onnxruntime-gpu==1.16.3 "tensorrt==9.3.0.post12.dev1" "triton-model-navigator<1" "sentence_transformers~=2.2.2" "faker" "urllib3<2" +# install spark-rapids-ml +/databricks/python/bin/pip install spark-rapids-ml~=${SPARK_RAPIDSML_VERSION} From cce365a9b734fd0ca90453c4a1feec2f4685f594 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Mon, 17 Jun 2024 19:23:07 +0000 Subject: [PATCH 13/83] Removed extra file --- tools/init_scripts/init_retriever.sh | 56 ---------------------------- 1 file changed, 56 deletions(-) delete mode 100644 tools/init_scripts/init_retriever.sh diff --git a/tools/init_scripts/init_retriever.sh b/tools/init_scripts/init_retriever.sh deleted file mode 100644 index 1204895eb1..0000000000 --- a/tools/init_scripts/init_retriever.sh +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -#!/bin/bash - -echo "Starting init script execution..." - -RAPIDS_VERSION=23.10.0 -SPARK_RAPIDS_VERSION=23.10.0 -SPARK_RAPIDSML_VERSION=24.04 -NAVIGATOR_CACHE=/dbfs/FileStore/model_navigator.zip -TORCH_CACHE=/dbfs/FileStore/torch.zip - -# To eliminate optimization stage -# Unzip optimized TRT Navigator model and Transformer -# unzip ${NAVIGATOR_CACHE} -d /root/.cache -# unzip ${TORCH_CACHE} -d /root/.cache - -# install cudatoolkit 11.8 via runfile approach -wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run -sh cuda_11.8.0_520.61.05_linux.run --silent --toolkit - -# reset symlink and update library loading paths -rm /usr/local/cuda -ln -s /usr/local/cuda-11.8 /usr/local/cuda - -# upgrade pip -/databricks/python/bin/pip install --upgrade pip - -# install cudf, cuml and their rapids dependencies -# using ~= pulls in latest micro version patches -/databricks/python/bin/pip install --extra-index-url https://pypi.nvidia.com cudf-cu11~=${RAPIDS_VERSION} cuml-cu11~=${RAPIDS_VERSION} pylibraft-cu11~=${RAPIDS_VERSION} rmm-cu11~=${RAPIDS_VERSION} - -# install model navigator -/databricks/python/bin/pip install --extra-index-url https://pypi.nvidia.com onnxruntime-gpu==1.16.3 "tensorrt==9.3.0.post12.dev1" "triton-model-navigator<1" "sentence_transformers~=2.2.2" "faker" "urllib3<2" - -# install spark-rapids-ml -/databricks/python/bin/pip install spark-rapids-ml~=${SPARK_RAPIDSML_VERSION} - -# upgrade grpc -# /databricks/python/bin/pip uninstall -y grpcio grpcio-tools -# Install the specific version of grpcio -/databricks/python/bin/pip install grpcio==1.64.1 - -echo "Init script execution completed." 
\ No newline at end of file From 8d2ed06d873802698c7337680a6430928ab07f63 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Tue, 18 Jun 2024 00:20:17 +0000 Subject: [PATCH 14/83] Added result output and _ correction --- .../synapse/ml/HuggingFaceSentenceEmbedder.py | 2 +- ...ustom Embeddings and Approximate KNN.ipynb | 74 +++++++++++++++---- 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py index 3a6110adb7..789abecac0 100644 --- a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py @@ -273,7 +273,7 @@ def _get_dataloader(repeat_times: int = 2): i += 1 total_batches = len(list(_gen_size_chunk())) - func = lambda x, **kwargs: self._SentenceTransformerNavigator._encode( + func = lambda x, **kwargs: self._SentenceTransformerNavigator.encode( model, x, **kwargs ) nav.optimize( diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb index 76ddc5fde1..95a15f4692 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb @@ -73,6 +73,7 @@ "\n", "import mlflow\n", "import pyspark.sql.functions as F\n", + "from pyspark.sql.functions import rand\n", "from pyspark.sql.functions import monotonically_increasing_id\n", "from datetime import datetime\n", "from pyspark.sql.types import (\n", @@ -83,6 +84,8 @@ " ArrayType,\n", " FloatType,\n", ")\n", + "import pandas as pd\n", + "from pyspark.sql.functions import explode, col, arrays_zip\n", "\n", "from spark_rapids_ml.knn import ApproximateNearestNeighbors, ApproximateNearestNeighborsModel\n", "\n", @@ -114,9 
+117,9 @@ "\n", "It will load public dataset and generate extra syntetic rows if set by size parameter\n", "\n", - "The loaded dataset has 1000 rows. If you specify number_of_input_rows in [1..1000] it will cut extra rows if needed\n", + "The loaded dataset has 1000 rows. If you specify number_of_input_rows in [1..1000] it will cut extra rows if needed\n", "\n", - "If number_of_input_rows in [1000..1000000] it will generate extra rows using cross join of original data" + "If number_of_input_rows in [1000..1000000] it will generate extra rows using cross join of original data" ] }, { @@ -148,7 +151,10 @@ " ),\n", ")\n", "\n", - "number_of_input_rows = 100000\n", + "# Size of DF\n", + "number_of_input_rows = 999\n", + "# Shuffle the DataFrame with a fixed seed\n", + "seed = 42\n", "\n", "# Check if the row count is less than 10\n", "if number_of_input_rows <= 0 or number_of_input_rows >= 1000000:\n", @@ -158,13 +164,13 @@ "\n", " # Cross-join the DataFrame with itself to create n x n pairs for string concatenation (synthetic data)\n", " cross_joined_df = df.crossJoin(\n", - " df.withColumnRenamed(\"combined\", \"combined_2\")\n", + " df.withColumnRenamed(\"combined\", \"combined_\")\n", " )\n", "\n", " # Create a new column 'result_vector' by concatenating the two source vectors\n", " tmp_df = cross_joined_df.withColumn(\n", " \"result_vector\",\n", - " F.concat(F.col(\"combined\"), F.lit(\". \\n\"), F.col(\"combined_2\")),\n", + " F.concat(F.col(\"combined\"), F.lit(\". 
\\n\"), F.col(\"combined_\")),\n", " )\n", "\n", " # Select only the necessary columns and show the result\n", @@ -173,15 +179,31 @@ " \"id\", monotonically_increasing_id()\n", " )\n", "\n", - "# # Shuffle the DataFrame with a fixed seed\n", - "# seed = 42\n", - "# shuffled_df = df.orderBy(rand(seed))\n", - "\n", - "df = df.limit(number_of_input_rows).repartition(10).cache()\n", + "df = df.limit(number_of_input_rows).orderBy(rand(seed)).repartition(10).cache()\n", "\n", "print(f\"Loaded: {number_of_input_rows} rows\")" ] }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "36a33406-ca31-42f2-b646-fd50311c7cb4", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "display(df)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -199,7 +221,7 @@ "source": [ "## Step 3: Generate Embeddings\n", "\n", - "We will first generate embeddings using NVIDIA TensorRT optimized SentenceTransformer" + "We will first generate embeddings using NVIDIA TensorRT optimized SentenceTransformer. 
In the demo you can use two fifferent HF models: intfloat/e5-large-v2 or sentence-transformers/all-MiniLM-L6-v2\"" ] }, { @@ -395,15 +417,41 @@ "rowLimit": 10000 }, "inputWidgets": {}, - "nuid": "3c3cb089-64fd-4089-b45c-4d1671d2c500", + "nuid": "2ec6847c-3592-4645-aca1-0fc6d9e3ed0f", "showTitle": false, "title": "" } }, "outputs": [], "source": [ - "display(knn_df)\n", + "result_df = (\n", + " knn_df\n", + " .withColumn(\"zipped\", explode(arrays_zip(col(\"indices\"), col(\"distances\"))))\n", + " .select(col(\"query_id\"), col(\"zipped.indices\").alias(\"id\"), col(\"zipped.distances\").alias(\"distance\"))\n", + " .join(df, on=\"id\", how=\"inner\")\n", + " .select(\"query_id\", \"id\", \"combined\", \"distance\")\n", + ")\n", "\n", + "display(result_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8fc25be2-0439-45c7-bf40-f4ce5c88a4b7", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ "print(f\"Demo finished\")\n", "\n", "# Record the end time\n", From 6b7798df908577622c7283694e411792013388e5 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Tue, 18 Jun 2024 00:30:25 +0000 Subject: [PATCH 15/83] Formatted --- ...ustom Embeddings and Approximate KNN.ipynb | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb index 95a15f4692..27bdd213af 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb @@ -60,17 +60,12 @@ "outputs": [], "source": [ "import warnings\n", - "warnings.filterwarnings(\"ignore\", category=UserWarning, 
module='tritonclient.grpc')\n", + "\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning, module=\"tritonclient.grpc\")\n", "import logging\n", - "logging.getLogger('py4j').setLevel(logging.ERROR)\n", "\n", - "import sys\n", - "# Assuming 'deep-learning' is at the root of your Databricks Repo\n", - "repo_path = '/Workspace/Repos/aspiridonov@nvidia.com/SynapseML-db1/deep-learning/src/main/python'\n", - "if repo_path not in sys.path:\n", - " sys.path.append(repo_path)\n", + "logging.getLogger(\"py4j\").setLevel(logging.ERROR)\n", "from synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", - "\n", "import mlflow\n", "import pyspark.sql.functions as F\n", "from pyspark.sql.functions import rand\n", @@ -86,10 +81,12 @@ ")\n", "import pandas as pd\n", "from pyspark.sql.functions import explode, col, arrays_zip\n", + "from spark_rapids_ml.knn import (\n", + " ApproximateNearestNeighbors,\n", + " ApproximateNearestNeighborsModel,\n", + ")\n", "\n", - "from spark_rapids_ml.knn import ApproximateNearestNeighbors, ApproximateNearestNeighborsModel\n", - "\n", - "logging.getLogger('sentence_transformers.SentenceTransformer').setLevel(logging.ERROR)\n", + "logging.getLogger(\"sentence_transformers.SentenceTransformer\").setLevel(logging.ERROR)\n", "mlflow.autolog(disable=True)\n", "\n", "# Record the start time\n", From dff46fe1420b84eb065e23920f6e53aae919620a Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Tue, 18 Jun 2024 18:56:53 +0000 Subject: [PATCH 16/83] Runtime flag update and load class from file (not from synapse.ml..) 
--- .../synapse/ml/HuggingFaceSentenceEmbedder.py | 4 ++ ...ustom Embeddings and Approximate KNN.ipynb | 48 +++++++++++-------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py index 789abecac0..13e736c9e3 100644 --- a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py @@ -299,6 +299,10 @@ def _predict_batch_fn(self): nav.load_optimized() else: model = SentenceTransformer(modelName).eval() + if runtime == "cuda": + model = model.cuda() + else: + model = model.to("cpu") def predict(inputs): """ diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb index 27bdd213af..d413ea4f55 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb @@ -65,7 +65,15 @@ "import logging\n", "\n", "logging.getLogger(\"py4j\").setLevel(logging.ERROR)\n", + "\n", + "# import sys\n", + "# # Assuming 'deep-learning' is at the root of your Databricks Repo\n", + "# repo_path = '/Workspace/Repos/aspiridonov@nvidia.com/SynapseML-db1/deep-learning/src/main/python'\n", + "# if repo_path not in sys.path:\n", + "# sys.path.append(repo_path)\n", "from synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", + "\n", + "\n", "import mlflow\n", "import pyspark.sql.functions as F\n", "from pyspark.sql.functions import rand\n", @@ -181,26 +189,6 @@ "print(f\"Loaded: {number_of_input_rows} rows\")" ] }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - 
"rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "36a33406-ca31-42f2-b646-fd50311c7cb4", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "display(df)" - ] - }, { "cell_type": "markdown", "metadata": { @@ -303,6 +291,26 @@ "query_embeddings = queryTransformer.transform(qDf).cache()" ] }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "72e9bffe-e24e-4cfb-b99d-b4d8832bf955", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "display(query_embeddings)" + ] + }, { "cell_type": "markdown", "metadata": { From 5f569cecc92bad24350435bdfed8b521e94f5d46 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Tue, 18 Jun 2024 18:59:59 +0000 Subject: [PATCH 17/83] Use built synapse.ml package instead of file --- ...ustom Embeddings and Approximate KNN.ipynb | 52 ++++--------------- 1 file changed, 11 insertions(+), 41 deletions(-) diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb index d413ea4f55..e5bb4747a5 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb @@ -4,10 +4,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "6166efcb-b7f8-424b-8015-cb646a764271", "showTitle": false, @@ -26,10 +23,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "0444a03d-a701-4f59-b1a1-c4addb797d07", 
"showTitle": false, @@ -66,16 +60,11 @@ "\n", "logging.getLogger(\"py4j\").setLevel(logging.ERROR)\n", "\n", - "# import sys\n", - "# # Assuming 'deep-learning' is at the root of your Databricks Repo\n", - "# repo_path = '/Workspace/Repos/aspiridonov@nvidia.com/SynapseML-db1/deep-learning/src/main/python'\n", - "# if repo_path not in sys.path:\n", - "# sys.path.append(repo_path)\n", - "from synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", - "\n", - "\n", "import mlflow\n", "import pyspark.sql.functions as F\n", + "import pandas as pd\n", + "\n", + "from synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", "from pyspark.sql.functions import rand\n", "from pyspark.sql.functions import monotonically_increasing_id\n", "from datetime import datetime\n", @@ -87,7 +76,6 @@ " ArrayType,\n", " FloatType,\n", ")\n", - "import pandas as pd\n", "from pyspark.sql.functions import explode, col, arrays_zip\n", "from spark_rapids_ml.knn import (\n", " ApproximateNearestNeighbors,\n", @@ -107,10 +95,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "42117315-a245-491a-b330-f8257d6fb35c", "showTitle": false, @@ -193,10 +178,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "0c69ee56-172f-413b-a335-d15482fda55e", "showTitle": false, @@ -236,10 +218,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "6885033f-6eea-4338-a632-2837582d91a1", "showTitle": false, @@ -315,10 +294,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - 
"byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "0154ce06-5875-4236-8178-030d45091445", "showTitle": false, @@ -358,10 +334,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "521c9c8e-6422-49c7-95f3-6bca44a90cbb", "showTitle": false, @@ -398,10 +371,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "9f30473c-ff6e-438a-bbce-11f1b0080a48", "showTitle": false, From 309864508846606dbcaa5862f99f09258eb32083 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Thu, 27 Jun 2024 23:57:52 +0000 Subject: [PATCH 18/83] Clean imports and the class --- .../synapse/ml/HuggingFaceSentenceEmbedder.py | 227 +++--------------- ...ustom Embeddings and Approximate KNN.ipynb | 179 +++++++------- tools/images/comparison.png | Bin 0 -> 47137 bytes tools/init_scripts/init-rapidsml-cuda-11.8.sh | 2 +- 4 files changed, 118 insertions(+), 290 deletions(-) create mode 100644 tools/images/comparison.png diff --git a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py index 13e736c9e3..91e4417671 100644 --- a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py @@ -12,37 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Import necessary libraries -import numpy as np import torch -import pyspark.sql.functions as F import tensorrt as trt -import logging -import warnings -import sys -import datetime -import pytz -from tqdm import tqdm, trange -from numpy import ndarray -from torch import Tensor -from typing import List, Union - import model_navigator as nav from sentence_transformers import SentenceTransformer -from sentence_transformers.util import batch_to_device from pyspark.ml.functions import predict_batch_udf -from faker import Faker - from pyspark.ml import Transformer from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params -from pyspark.sql.functions import col, struct, rand from pyspark.sql.types import ( - StructType, - StructField, - IntegerType, - StringType, - ArrayType, - FloatType, + ArrayType, + FloatType, ) class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): @@ -50,131 +29,24 @@ class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): Custom transformer that extends PySpark's Transformer class to perform sentence embedding using a model with optional TensorRT acceleration. """ + NUM_OPT_ROWS = 100 # Constant for number of rows taken for model optimization + BATCH_SIZE_DEFAULT = 64 # Define additional parameters runtime = Param( Params._dummy(), "runtime", - "Specifies the runtime environment: cpu, cuda, or tensorrt", + "Specifies the runtime environment: cpu, cuda, onnxrt, or tensorrt", ) batchSize = Param(Params._dummy(), "batchSize", "Batch size for embeddings", int) modelName = Param(Params._dummy(), "modelName", "Full Model Name parameter") - class _SentenceTransformerNavigator(SentenceTransformer): - """ - Inner class extending SentenceTransformer to override the encode method - with additional functionality and optimizations (mainly to eliminate RecursiveErrors). 
- """ - - def encode( - self, - sentences: Union[str, List[str]], - batch_size: int = 64, - sentence_length: int = 512, - show_progress_bar: bool = False, - output_value: str = "sentence_embedding", - convert_to_numpy: bool = True, - convert_to_tensor: bool = False, - device: str = None, - normalize_embeddings: bool = False, - ) -> Union[List[Tensor], ndarray, Tensor]: - """ - Encode sentences into embeddings with optional configurations. - """ - self.eval() - show_progress_bar = ( - show_progress_bar if show_progress_bar is not None else True - ) - convert_to_numpy = convert_to_numpy and not convert_to_tensor - output_value = output_value or "sentence_embedding" - - # Handle input as a list of sentences - input_was_string = isinstance(sentences, str) or not hasattr( - sentences, "__len__" - ) - if input_was_string: - sentences = [sentences] - - # Determine the device to use for computation - device = device or self._target_device - self.to(device) - - # Initialize list for embeddings - all_embeddings = [] - length_sorted_idx = np.argsort( - [-self._text_length(sen) for sen in sentences] - ) - sentences_sorted = [sentences[idx] for idx in length_sorted_idx] - - # Process sentences in batches - for start_index in trange( - 0, - len(sentences), - batch_size, - desc="Batches", - disable=not show_progress_bar, - ): - sentences_batch = sentences_sorted[ - start_index : start_index + batch_size - ] - features = self.tokenize(sentences_batch) - features = batch_to_device(features, device) - - # Perform forward pass and gather embeddings - with torch.no_grad(): - out_features = self(features) - - if output_value == "token_embeddings": - embeddings = [] - for token_emb, attention in zip( - out_features[output_value], out_features["attention_mask"] - ): - last_mask_id = len(attention) - 1 - while ( - last_mask_id > 0 and attention[last_mask_id].item() == 0 - ): - last_mask_id -= 1 - embeddings.append(token_emb[0 : last_mask_id + 1]) - elif output_value is None: - embeddings 
= [] - for sent_idx in range(len(out_features["sentence_embedding"])): - row = { - name: out_features[name][sent_idx] - for name in out_features - } - embeddings.append(row) - else: - embeddings = out_features[output_value] - embeddings = embeddings.detach() - if normalize_embeddings: - embeddings = torch.nn.functional.normalize( - embeddings, p=2, dim=1 - ) - if convert_to_numpy: - embeddings = embeddings.cpu() - - all_embeddings.extend(embeddings) - - # Restore original order of sentences - all_embeddings = [ - all_embeddings[idx] for idx in np.argsort(length_sorted_idx) - ] - if convert_to_tensor: - all_embeddings = torch.stack(all_embeddings) - elif convert_to_numpy: - all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings]) - - if input_was_string: - all_embeddings = all_embeddings[0] - - return all_embeddings - def __init__( self, inputCol=None, outputCol=None, runtime=None, - batchSize=16, + batchSize=None, modelName=None, ): """ @@ -183,16 +55,17 @@ def __init__( super(HuggingFaceSentenceEmbedder, self).__init__() self._setDefault( runtime="cpu", - modelName=modelName, - batchSize=16, + batchSize=self.BATCH_SIZE_DEFAULT, ) self._set( inputCol=inputCol, outputCol=outputCol, runtime=runtime, - batchSize=batchSize, + batchSize=batchSize if batchSize is not None else self.BATCH_SIZE_DEFAULT, modelName=modelName, ) + self.optData = None + self.model = None # Setter method for batchSize def setBatchSize(self, value): @@ -210,6 +83,7 @@ def setRuntime(self, value): Sets the runtime environment for the model. Supported values: 'cpu', 'cuda', 'tensorrt' """ + # if value not in ["cpu", "cuda", "onnxrt", "tensorrt"]: if value not in ["cpu", "cuda", "tensorrt"]: raise ValueError( "Invalid runtime specified. Choose from 'cpu', 'cuda', 'tensorrt'" @@ -244,66 +118,41 @@ def _optimize(self, model): ], ) - def _gen_size_chunk(): - """ - Generate chunks of different batch sizes and sentence lengths. 
- """ - for batch_size in [64]: - for sentence_length in [20, 300, 512]: - yield (batch_size, sentence_length) - - def _get_dataloader(repeat_times: int = 2): - """ - Create a data loader with synthetic data using Faker. - """ - faker = Faker() - i = 0 - for batch_size, chunk_size in _gen_size_chunk(): - for _ in range(repeat_times): - yield ( - i, - ( - [ - " ".join(faker.words(chunk_size)) - for _ in range(batch_size) - ], - {"show_progress_bar": False}, - ), - ) - i += 1 - - total_batches = len(list(_gen_size_chunk())) - func = lambda x, **kwargs: self._SentenceTransformerNavigator.encode( - model, x, **kwargs - ) + def _get_dataloader(): + input_data = self.optData + return [(0, (input_data, {"show_progress_bar": False, "batch_size": self.getBatchSize()}))] + nav.optimize( - func, dataloader=tqdm(_get_dataloader(), total=total_batches), config=conf - ) + model.encode, + dataloader=_get_dataloader(), + config=conf + ) def _predict_batch_fn(self): """ Create and return a function for batch prediction. - """ + """ runtime = self.getRuntime() - if "model" not in globals(): + if self.model == None: global model modelName = self.getModelName() - if runtime == "tensorrt": + + model = SentenceTransformer(modelName, device="cpu" if runtime == "cpu" else "cuda").eval() + + if runtime in ("tensorrt"): + # this forces navigator to use specific runtime + nav.inplace_config.strategy = nav.SelectedRuntimeStrategy("trt-fp16", "TensorRT") + moduleName = modelName.split("/")[1] - model = self._SentenceTransformerNavigator(modelName).eval() - model = nav.Module(model, name=moduleName) + model = nav.Module(model, name=moduleName, forward_func="forward") try: nav.load_optimized() except Exception: self._optimize(model) nav.load_optimized() - else: - model = SentenceTransformer(modelName).eval() - if runtime == "cuda": - model = model.cuda() - else: - model = model.to("cpu") + self.model = model + def predict(inputs): """ Predict method to encode inputs using the model. 
@@ -325,6 +174,9 @@ def _transform(self, dataset, spark): input_col = self.getInputCol() output_col = self.getOutputCol() + df = dataset.take(self.NUM_OPT_ROWS) + self.optData = [row[input_col] for row in df] + encode = predict_batch_udf( self._predict_batch_fn, return_type=ArrayType(FloatType()), @@ -337,15 +189,4 @@ def transform(self, dataset, spark=None): Public method to transform the dataset. """ return self._transform(dataset, spark) - - def copy(self, extra=None): - """ - Create a copy of the transformer. - """ - return self._defaultCopy(extra) - -# Example usage: -# data = input data frame -# transformer = EmbeddingTransformer(inputCol="combined", outputCol="embeddings") -# result = transformer.transform(data) # result.show() \ No newline at end of file diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb index e5bb4747a5..c1a3286bd1 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb @@ -53,42 +53,18 @@ }, "outputs": [], "source": [ - "import warnings\n", - "\n", - "warnings.filterwarnings(\"ignore\", category=UserWarning, module=\"tritonclient.grpc\")\n", - "import logging\n", - "\n", - "logging.getLogger(\"py4j\").setLevel(logging.ERROR)\n", - "\n", - "import mlflow\n", "import pyspark.sql.functions as F\n", - "import pandas as pd\n", - "\n", - "from synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", - "from pyspark.sql.functions import rand\n", - "from pyspark.sql.functions import monotonically_increasing_id\n", - "from datetime import datetime\n", + "from HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", "from pyspark.sql.types import (\n", - " StructType,\n", - " StructField,\n", - " IntegerType,\n", - " StringType,\n", - " 
ArrayType,\n", - " FloatType,\n", + " StructType,\n", + " StructField,\n", + " IntegerType,\n", + " StringType\n", ")\n", - "from pyspark.sql.functions import explode, col, arrays_zip\n", "from spark_rapids_ml.knn import (\n", " ApproximateNearestNeighbors,\n", " ApproximateNearestNeighborsModel,\n", - ")\n", - "\n", - "logging.getLogger(\"sentence_transformers.SentenceTransformer\").setLevel(logging.ERROR)\n", - "mlflow.autolog(disable=True)\n", - "\n", - "# Record the start time\n", - "start_time = datetime.now()\n", - "\n", - "print(f\"Demo started\")" + ")" ] }, { @@ -125,6 +101,9 @@ "nuid": "6b6bdb2c-d492-4114-a7e9-0ef2832ac05c", "showTitle": false, "title": "" + }, + "jupyter": { + "outputs_hidden": true } }, "outputs": [], @@ -135,16 +114,15 @@ " file_path\n", ")\n", "df = df.withColumn(\n", - " \"combined\",\n", + " \"data\",\n", " F.format_string(\n", " \"Title: %s; Content: %s\", F.trim(df.Summary), F.trim(df.Text)\n", " ),\n", ")\n", "\n", "# Size of DF\n", - "number_of_input_rows = 999\n", - "# Shuffle the DataFrame with a fixed seed\n", - "seed = 42\n", + "number_of_input_rows = 100\n", + "\n", "\n", "# Check if the row count is less than 10\n", "if number_of_input_rows <= 0 or number_of_input_rows >= 1000000:\n", @@ -154,22 +132,26 @@ "\n", " # Cross-join the DataFrame with itself to create n x n pairs for string concatenation (synthetic data)\n", " cross_joined_df = df.crossJoin(\n", - " df.withColumnRenamed(\"combined\", \"combined_\")\n", + " df.withColumnRenamed(\"data\", \"data_\")\n", " )\n", "\n", " # Create a new column 'result_vector' by concatenating the two source vectors\n", " tmp_df = cross_joined_df.withColumn(\n", " \"result_vector\",\n", - " F.concat(F.col(\"combined\"), F.lit(\". \\n\"), F.col(\"combined_\")),\n", + " F.concat(F.col(\"data\"), F.lit(\". 
\\n\"), F.col(\"data_\")),\n", " )\n", "\n", " # Select only the necessary columns and show the result\n", " tmp_df = tmp_df.select(\"result_vector\")\n", - " df = tmp_df.withColumnRenamed(\"result_vector\", \"combined\").withColumn(\n", - " \"id\", monotonically_increasing_id()\n", - " )\n", "\n", - "df = df.limit(number_of_input_rows).orderBy(rand(seed)).repartition(10).cache()\n", + " # Shuffle the DataFrame with a fixed seed to have close strings spreaded\n", + " seed = 42\n", + "\n", + " df = tmp_df.withColumnRenamed(\"result_vector\", \"data\").withColumn(\n", + " \"id\", F.monotonically_increasing_id()).orderBy(F.rand(seed))\n", + " \n", + "\n", + "df = df.limit(number_of_input_rows).repartition(10).cache()\n", "\n", "print(f\"Loaded: {number_of_input_rows} rows\")" ] @@ -208,10 +190,11 @@ }, "outputs": [], "source": [ - "# dataTransformer = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"combined\", outputCol=\"embeddings\", runtime=\"tensorrt\")\n", - "dataTransformer = HuggingFaceSentenceEmbedder(modelName=\"sentence-transformers/all-MiniLM-L6-v2\", inputCol=\"combined\", outputCol=\"embeddings\", runtime=\"tensorrt\")\n", + "# To create embedder with different models, uncomment the following line\n", + "# embedder = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"combined\", outputCol=\"embeddings\", runtime=\"tensorrt\")\n", + "embedder = HuggingFaceSentenceEmbedder(modelName=\"sentence-transformers/all-MiniLM-L6-v2\", inputCol=\"data\", outputCol=\"embeddings\", runtime=\"tensorrt\")\n", "\n", - "all_embeddings = dataTransformer.transform(df).cache()" + "embeddings = embedder.transform(df).select(\"id\", \"embeddings\").cache()" ] }, { @@ -248,46 +231,23 @@ }, "outputs": [], "source": [ - "\n", "# Sample query\n", "queries = [\"desserts\", \"disgusting\"]\n", "ids = [1, 2]\n", "\n", - "# Combine the data into a list of tuples\n", - "data = list(zip(ids, queries))\n", - "\n", - "# Define the schema 
for the DataFrame\n", - "schema = StructType([\n", - " StructField(\"id\", IntegerType(), nullable=False),\n", - " StructField(\"query\", StringType(), nullable=False)\n", - "])\n", - "\n", - "# Create the DataFrame\n", - "qDf = spark.createDataFrame(data, schema)\n", + "# Create DataFrame directly from the data and schema\n", + "qDf = spark.createDataFrame(\n", + " list(zip(ids, queries)),\n", + " StructType([\n", + " StructField(\"id\", IntegerType(), nullable=False),\n", + " StructField(\"data\", StringType(), nullable=False)\n", + " ])\n", + ")\n", "\n", - "# queryTransformer = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"query\", outputCol=\"embeddings\", runtime=\"cpu\")\n", - "queryTransformer = HuggingFaceSentenceEmbedder(modelName=\"sentence-transformers/all-MiniLM-L6-v2\", inputCol=\"query\", outputCol=\"embeddings\", runtime=\"cpu\")\n", - "query_embeddings = queryTransformer.transform(qDf).cache()" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "72e9bffe-e24e-4cfb-b99d-b4d8832bf955", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "display(query_embeddings)" + "# # To create embedder with different models, uncomment the following line\n", + "# # queryEmbedder = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"data\", outputCol=\"embeddings\", runtime=\"cpu\")\n", + "queryEmbedder = HuggingFaceSentenceEmbedder(modelName=\"sentence-transformers/all-MiniLM-L6-v2\", inputCol=\"data\", outputCol=\"embeddings\", runtime=\"cpu\")\n", + "query_embeddings = queryEmbedder.transform(qDf).select(\"id\", \"embeddings\").cache()" ] }, { @@ -324,10 +284,10 @@ }, "outputs": [], "source": [ - "rapids_knn = ApproximateNearestNeighbors(k=5)\n", - "rapids_knn.setInputCol(\"embeddings\").setIdCol(\"id\")\n", - "\n", - 
"rapids_knn_model = rapids_knn.fit(all_embeddings.select(\"id\", \"embeddings\"))" + "rapids_knn_model = (ApproximateNearestNeighbors(k=5)\n", + " .setInputCol(\"embeddings\")\n", + " .setIdCol(\"id\")\n", + " .fit(embeddings))" ] }, { @@ -344,7 +304,7 @@ "source": [ "## Step 6: Find top k Nearest Neighbors\n", "\n", - "We will use fast ANN IVFFlat algorithm from Rapids" + "We will use fast ANN [IVFFlat algorithm](https://developer.nvidia.com/blog/accelerated-vector-search-approximating-with-rapids-raft-ivf-flat/) from Rapids" ] }, { @@ -364,7 +324,7 @@ }, "outputs": [], "source": [ - "(_, _, knn_df) = rapids_knn_model.kneighbors(query_embeddings.select(\"id\", \"embeddings\"))" + "(_, _, knn_df) = rapids_knn_model.kneighbors(query_embeddings)" ] }, { @@ -399,12 +359,15 @@ }, "outputs": [], "source": [ + "import logging\n", + "logging.getLogger('py4j').setLevel(logging.ERROR)\n", + "\n", "result_df = (\n", " knn_df\n", - " .withColumn(\"zipped\", explode(arrays_zip(col(\"indices\"), col(\"distances\"))))\n", - " .select(col(\"query_id\"), col(\"zipped.indices\").alias(\"id\"), col(\"zipped.distances\").alias(\"distance\"))\n", + " .withColumn(\"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\"))))\n", + " .select(F.col(\"query_id\"), F.col(\"zipped.indices\").alias(\"id\"), F.col(\"zipped.distances\").alias(\"distance\"))\n", " .join(df, on=\"id\", how=\"inner\")\n", - " .select(\"query_id\", \"id\", \"combined\", \"distance\")\n", + " .select(\"query_id\", \"id\", \"data\", \"distance\")\n", ")\n", "\n", "display(result_df)" @@ -420,24 +383,48 @@ "rowLimit": 10000 }, "inputWidgets": {}, - "nuid": "8fc25be2-0439-45c7-bf40-f4ce5c88a4b7", + "nuid": "1d6a8c06-8761-4652-808b-703f0c365f3d", "showTitle": false, "title": "" } }, "outputs": [], "source": [ - "print(f\"Demo finished\")\n", + "import urllib3\n", + "import transformers\n", + "import model_navigator\n", + "import tensorrt\n", + "import torch\n", + "\n", + "print(urllib3.__version__)\n", 
+ "print(transformers.__version__)\n", + "print(model_navigator.__version__)\n", + "print(tensorrt.__version__)\n", + "print(torch.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7b4c5a10-efd1-4d2d-b141-33e486943862", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Results\n", + "\n", + "The goal of this demo is to showcase two acceleration techniques: local (per node) embedding generation and approximate KNN. Compared to the original method, which relies on HTTP requests to the OpenAI model and CPU-based KNN, this approach is significantly more scalable and provides substantial acceleration, especially for large input datasets.\n", + "\n", + "Test Results on 10 T4 GPU nodes the approaches:\n", "\n", - "# Record the end time\n", - "end_time = datetime.now()\n", + "![Sample Image](/files/tables/comparison.png)\n", "\n", - "# Calculate the duration\n", - "duration = end_time - start_time\n", "\n", - "# Optionally, display the duration in seconds\n", - "duration_in_seconds = duration.total_seconds()\n", - "print(f\"Application duration: {duration_in_seconds:.2f} seconds\")" + "\n" ] } ], diff --git a/tools/images/comparison.png b/tools/images/comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..ef6756276708ba74e5070e2edd006a04e110eef9 GIT binary patch literal 47137 zcmdqJ2UJtt*7qCrQG7&1ks=_V6j5nP@2E%<0Trbesi7qFUR4AH1Ox=61?eKaCDfqO zYv_;!2)!6Ws0l4_H+s%_&pqcn;c(pudF@GfBt5|o@lDjoMSl$ z0)c4MR3APAflf7nKqq(3o&k=SHu9_hPbXZTsXPD`_g-BFUi<;RuW=s)DvPE*Fgp#r z{?kcS-xUO64Fp2K)gIn|?q#wvao1%-di2|R6N0+E8#<87)iP*(`n;|7 z80+VtHw;h4jKx7p23K@Gc-PVQM$Ct~$)sj`%q7H> zVW^*!n!J<%eqq!uW1lT4bo~Jc^h!j9^~|@Ly&>icydyQ6#=%x*bbFZZhBwj!v=I5% z!?)s*k{RVAzVB26We0Gu8XWCRFi)vKL}u|ji6|!E|FiT8T0Mi%`o7>|LRuKy^9B;` zd;5oUKGNe^7fZWVf5-`^Vx2TInY%-NMd$MnbOO@FL5RLP@liXGA<8e$34=h@kdb8X z4Mpbh275#B6h?+|l6Xd#K%P|vy?UhQ9o^M2l>wkcRu4*A1i^v;U76@9nmK%;xt 
zdu8LuT+IVa-ldUwOaJFp?a$2-4svQa>x9nA$RSLvM5AxJ&!(+$ZxeS~NA1KC(T5iY zZz+!2-_vIis)VK{%GlM~BFi}VY3fLZUt%`1uy7&BTbaq^ZU2iuH{48XDH$BOIJ8lt zzf5Bz|J-O@hept&fvat&KvVUyKfOz^Wx1M$IDK{A5r>iaT#+GNF0zSMV0v%#s%+w8 z@`Rs`+M&_@3+xwFqs?LO$+^+p7rt#sE2eu6+Z8ce*(24aE$%qn4kgp32~>TBlo!wV zl-`@}VZV7{iF}If7|HuH#~-kVFs6qQNdd(Wv{@Oz~9?qEI4 z%oY4fVsp-2ED8ilrIyiRkkYMlole++%a+a?_693qltY^ZTwoGP7#R|Eo?Tt`@r7|0 zi>%x(mP^yHh<}1SDN;bG`9B}$#y#GCb3wI0Q54WHCu9v*o#vgcIV$lBS>}9QP&D1*h=ty0S&RPJ3gt-sc$rk0e{Q4RX3#HzcA&7gzJU^aBG$7WB2d_;F-6H+06ka?>p}|?pzPn=9Ijb!TO-bcg`LfQf4EiR;YUjK7)H)r;4_QR) zhX_q1Y4Ao-H4|_#o&uZM7FJ~hAx%A5O0xi;q5h(`g+R&yDJe{DLLW8XR-6dzjRR^vnyEE~~n|J#l;-=joAV z=7~}kK!TedC^kK2_s+a_e0<7M&^%f6cm&FGBMoN%`eR8kX~0PWPM|YGHi*jie0KOt@|E?vwtd}n&@v*=^~VvZxIWkh4@+Ya7zhA?Un=O)kz*@^dmcsHx zq)^jzkUIH^;t|0Ce|Vb?a8VzFNEZlfpgRSlYPGV~Qj z=u8^hdWLdtTbH{bYy&w7_tj08nJ%-*eb-($nN2#Mjx6e*UqkW2- zB{}alrY0n%KV|hM>(d@l(Z}-!aqDzKPn2dD&9X|Cnd>DibH6ekP<)~SI zYctpM;mVi~0q|Xa)4)5F?f{{xp^}owaywnF~=6oJG4j1!dI== zSI@+VI#gvIw7M};tRY;IbJ}<8eLZL7-(Q9w8 zh`stg{)GaqpdJaC4X)F`E|H9>qOxg;x7W^`?u+!GVT^gaZ8H2lvXU*5jA5v^v>h<;av@ zJ_gK{mtJ)^JWmmm$(1-7wpw3kb(&ZgUzVTnI4ClkRxmAzdaZHUl_rAYO4%9E=d&1@ z#7NSGSK9jc^N!{B+cRMN%HJO+2n|}=-t|BQKV~hz;x_(go5rpFsVZr;_p@Unv{^QA zgGK8x+57SL@mf|{LHI)Ht(AbJR`>dQ3}L<(R=2Khj#FDY+**-Szxc=AKdI0a1*DBFfeui^_eXn z7#nn1f#v3j>6iOK)WjXsO`$IF@hkx4Frl zgdbrkm`%3X`-TTtU5%27(HJeo1GjHIc_$OB3T3$bhC2P39ZPbWDQd5K_bA-Cs^h}l zwHnJ>j$4bP9CKz``SnSp9CR9wI$crVKwV>E-M43XRK{8kWU`#UBX10UgGaQ6o_hMH zBsOY8$2+y7A{wM-VCDg=|Je1}(b3WEUb9M~tiCbe=;h1$!DWj|{pJfZa{{8x&J8b& zP?wEWKKQXj%tnsR@LOAueb@J(32OYta6W!fe{X7I?|=!$lNu#OM|$ua1M1=;5gOPT zjfYq7s(1zkuoI~wDn6m*$BI?Q6|38B|MYwh3%pL>jNtV0^D+xT-r6%|HNv7)WE zM}Ai!yLy3qSMWaWM|t=vi!uQs2()Gh7)6M>?JvtX{Gy_FZFo37!Rpu*fWDd$PgVXJ z?L;h{KXp919ol>6`1!NT|Hc8ZJ=m-GdSP$}-&FaBKxc>cxNqK>keS#RU@Ul7%f;%- z@hP52xrO}lQSpl4-v{J4I5_$_!#USmV~`Wf)~{^}KmAcrMNe&S8KL32EdfA@!}MdaUWv6krkS2O*$ssFbQyjHlKYX4;z zNZ5cA?*!Qpr)1zBAp(56RZlFX!kCQi@LdxVxQ1kirAEU#1_oA4pC_X_OgkH2ycm2_ 
zm%7tYw>e{fA}!QoK<=tHoupQ!X8;o+jW|8-vTH*;K&I)2IIg7c1p;xgfwO*a9rJ^u zgWDAj+Lnw;d$->)8NKqD7?2&(%~P+iTH2DFr%q$|ad4MEc^Nfad{BVawL7$grVL(O zH05huT>RA8vJo&k@9*z?zwkcOmP5|uDz|!X*FEYbs8-j4QMAg5x()7EI3?dja^K{x zJVvI{v-4RzNpsqEK*w3hz}b6JFEjW|kDsjCW zZYQbQUzG7MZ*yf-t+0>pz~!KIGtCU`)06mHCk+l8SRc+vz^IvgQgw*VoU}fW-tBB_ z?{-#&_PMgWlERu3Q$m0pPx|HXQvn0AIGs&CZ+LoW@R9O@!8ey5HIV~WSq0*3*6bJ? zW4Og8SE;%{YJkHZjx&bi1m099^YwCftWBZ-R(wXnbnK>nb$E)F;vrel*r7xqO;DiY z=Inwf`+hDZp=4Qo)V8rPHITihvn0^m*-gJba=pAHS9MNBx=|$UOO6@6o8_S;_}#<> zG~9xeB*m?85R#j)HzoI{k$Pp~>=oVFsy!IX@CNY=N>YG+-(nT!e_ z;$bVoud0X}?0w`v@r0++a|V|O#1)~WUgs5r4KXzgFiAFpj}@=o->{QGv8wbxP&Ci@ z&Wg$`TjU{{zb^hEU})L3^5Tw(U%36lQG6H;1`S@%%j65Asv7aGm5oX~Y6zy;M}Iag z=bk**PxuqU7U7jLu)Q$LyzehJCkV@#7?}?+93<9=mb67n@a_9kyvc^OGnO6FhIu(n znrt?*iP`+p4(ed+% zkj6QE7VFh&tMs}pJ6Km9!bV=EQn16-7QK$Xp^nz|ru&AH$%YI}hA~KinGJafX-%6= zcOm;WL_Z1}S&G00EErhDGX&aZW9lQI*Q!Px#^p^_NfQ8 zVu(H00pG?D^K3B(u$E|%kpQ@H-EDStLxY&rxFbTzx$caOy*{Faaj1 zD;=MWfRmP*lgEz3hLAYPtt*Zde8iPUZSto6V4QrZ8ziAbv1`FoFWDpB-c?mzAQ`pd z#b%H=#~^jr;E^C<4)px{b*sUjY#13VK^n}g4!9ix7AXKM*oda@A|434!qDr|z(w!l z329k*YWT3JK{?U1mIo}+;_BC7oonUv%aOUy3MPsLTNay^kk>cCDWS+>?>r^GMTL|~ zc3f1$fGm#PGSzpoNRJ0>tiz4Mt0TqL-(xI863lkwW}P<#odJ z16<7?>CHih2Y^@BBWU99V&SN=-Dx^Dm-ku?fe{-fI8RNKEv2AqAHTTau8KZj)EJbH zFf%+@#6!}T4^tqIPv1^PMHQ*n^r^>Wjh@7Qy1dP<#~2j650M;7=za14*Qt^a9mts;Ia&Dq z%;)gq2zK`zA?*qwjSaYiBI7(lGy>+bi%NW9BMz*zpCp5)if;*N7svk4RTCiO;`J^585542 zxq8=kF*?m%OcNslxkBo@#D&GK#7KE$-Na9iwfjcMa+MtJ-LJr2t`+Q8e%Jv$3j)6< zMP)Qp&=AtDVq^^aC#5)e3FooX_1nBIOk+H|l7m>A)WH4Qn_({?=o5&%Y_#?)(&y`x z;BsYRPhO6Idkbj`(TA8%R{?2toEts3#b@NI5hAzSlgpDDyWqY&_UFt~)6|_m?&8H~ z8?TES1Y*qfu{t^dlBcn)whA9tyC2mPwk4UO>+O?x<(^wv6fsxg)g!RieE9xzlY50%C ziLpE_8|b_4VZIVR`=XKO?uqGX+&?yB@|B+Em7egd23O^9`FmZfEsX0c$@?RBCcPdc zl~JX7f8RTGb7dL2n^y}hmS_mWIxO`D3I%diQ^MsK1dilTgTyES&D0QM9L|XSy?{U3 zXGPf;>(ftpNhJ8%vWVe-{%q?Yg0WVOP^_gyA$I?uXTf9Bo5fKCQm3YHBx8*WkbIb& zD>@e_Lg8=hRfG#@fLRtl_%DnUbs;-xab~t-@qy5~m5e`jalHdCZ4VCE$U}-klQ*an 
z+Q7A!b_t?NQDr%^ef4pe$Dui%uIW(r#h%h#P?#heetFWbb85j^J{uAc%9akLtu4i| z;vK0hG)fi{w>&-N+$&Y(yGTu|wcIF@>)~sD)3PCB%qgB%x5sqFCGg0ed%*f!Af%yQ zlX`(HCN(tKJ*1?#*v-@uLO)f*_(-BWCe*9>ay9)NuFIM$)5UOb7R*((=y?F)ugciv z(=-_s*gtn1JbctJ65b$M0ZYuT8GyP2@(Y#gFJE3)F^Re&_dVV48fB8V~Nbj1Fq8@de?l-IJM>FFecwcWm zL^tzr0b?U`*j(pd#GpR%^75HV^hz_4T>f<;-6#9??*3_QIGb&DmlXS{S>uv~jB)7OM2L8(~J))(%u!uOuuk8r+;%(liyD5T1JW zAHq~&4VL5U`}Z(a%8>Z`(x-%G|7xcH@51L<^=Tu;qTG@#pf>Q0 zxLKasOuZ%`R%C2%P^0Nr@UjcsR+8I>3eaBgxk|oCM+UASCRs>K=g)rw- zPFq!DbJ)1I)lTbS{?Vd|ZVNKRfh8*cUY3LQ8~u;J4HzA`c^Yf?4P+6 zseIFO=`YQEG}m2R66|vt2RnU8;)fm6`7u zS;U2hug43#!i^4<%U<=!nHau&pW+tDQox02Lh&Y!l5@su4oUN(pFz)=2y9b+w(CPK zWAVB8ZcJUZu?V}{ypl|ovu>5q;)js*2jJ>=DPAAB6f#*tNP!A4={vN#+6*2VU6p(~ zVhXU_S)`+OSb})|OXDw>HnZSxWX<6Syc^-4hRx%p4@=SXEHAb9-Pr#D4P2GEj2qbZ z#*TZs*nV2_&`4M(IUGi5^q1Ne+Zve;$|PxVSKhi}I6`wE+LI}qj8vV)L@Npe6)b<{xRQ1K_m2o;pLQ7hbY{$fUL>q(FTNC_a`XUFzd!s zK^jNTQ%R)#Ri=>s$JvunTm{O3oN4&{R>SJ*ri`5V^blRL)Ed^L(#YE&;OCJ+2OX$d zw$c+wYW&bsz5WUxtMfCGoP(^@)To|~e~X~$gE0H;MU@OVN0HypT*~R z9`t=g#1Mk2?_l5!KjH37@aS^UG5L#FLByhh{UO-4i=<>Y>q&Uq{PB6}>wHp(A#7p7 zDC-PJ@B|@kwr}#gmKs=KT->hQroo|s*6YhW51W%dO}*P0gAw*^ZQTLn{0C7ZVd*AA zJ@J}YYuV*K>BmT(9AC~3@!FuGF;X9rSgh1EROLVcsePoKYTlfzezr^_m zDV!7(g9E6VztS@bP;eBi#O%#7=3RZ_#viPSv3k`cj$mF5ltpPY<$M-7IL!)GQ@ID} zRF@9^@>r)uxxd@09-4u!$`=`anfKld(SRXk$ki{I-ONalcgHMq+ zmlSF})K_m@H=@o@E5XvJO3KHzL<5kB%D(p*+m`0%*5%SbUH01}J^UM@UkTq;Vgbqj z(njZR$>~N!nZBd*kQsJ`sV1{iqy<13NEboj&?&__#w|v8A@I@Rr-?%HZm>C?h zbDUy^hxojgvn^fF+B%5U(6Z4VM;3|5EO(2|WI5uD1D75EC+?Frc94et$x5!hyZUi% zO#VZDq+hWXYkACaskIiga?5)E`wSJAvC5{P{z<>|`Sz{+^ia(c=Ha3s$Z=bdqMr`fhKTvIbQR*M-g6qgJyXsX7 z$?)zSNg_}iw&{&By$7=>?9*Uru_XOY`TS$c9OY-gE)f*3*0q^3?iuSRH7L8w(A5T; zM+@LF4`F}wYQN}OSg7$e{M>Mv3L~>aBhtfbB1qKR4aeR{&KK7+Iii>7Kw$M8q_P{d zxoPs%yW@UKEz9E8kmkPy9yiI=Cq6$XeTzVZlz8{T%55yZwjF!)zj=49tcIY*6}nBp zq_5*4&)O)1Myn_5sHc5-fjHuoE8y)U6%0$xpRy`{nuSfCz+OD+_(O!cei*j`C{xvL zm7cFQRsJRvNj;AHUyeN>K*)?!+9VJ&1$fs_PQCdMc$C}mEL&$&Y)C%%iK5?JG#F2& 
z^cA5-OH;>iA>$4X&P`zvo~rU(S|=C7%EQ!^4;y5au(UT)yK@%6g!K7lE%{worLh=o z5zTf|K%wn0PBTDaYa6k+Kj;?*1X1zRtowO~>Ao~8<>>or)Sd#mUN;Q93RQPmq>K}U z+IOoN3!gBC)zR#d9a(%ipVAivzhvY|XGd+6ZTT}_;24(+-*4?Mk2MRo1>loSE84pft)#qtTU2y4nA-)Q$|ePstw=dKHUEg-cr47uc~y znm|sreq0pes%MD|we7g3LRF-=u@UTGXQz)~C^2|Z^+WwhQgc) z=N2EChb=0>mNtz1xZqJq^L2p2>ph3FTNlmhK3R|#jYtZNoQIVym{Gnp;4IU96Hwwv z1kLZ*l3vM|3TLH)FCYa~$r_npomU#WFq#%xnfFB2;Ns6ehgG6xKUzX|k>aB~3u>#( ztsUs4Ws9anB^7v-NtNjPH&NZ`tFxzl>BAy^mc?`dITy>QOo%w`ORf@$EQD6pCmI^< znh_4~Fraw!0Wf}6-UF>_i^`i1DpL5R*t!rdxJ1&+LZP1y=v)WwY`MxL&vb>rKn;_F zOKu^~&!+uuX{R{U+Nk+!oq*Zisw12>)vUUsG?QsyQu%sazlwPIh z+T|bM7~LLlf9Cuu8W6zf;%OBX|3dn~eiq2e^LJfUUdHsp&!27}Lvb&hUGH%b0k z_h!V0U~6XFzE#T=kd?vHcH{FZ>2uX#!~3eohOh{`(r|RH>XAL00$^lcaRLrn zr5@R3U&7ne;w3hPD~vEwsI)rFW*gwr6C_N845XpsCkTUCh%(|frO*AX*7{npE1=Nw znoY!PO$M3O-IAzKy+JYbZ}hbs44?MuOV}ZOqVUFYIzOe!`@{)7bX`0pQQz3=QVb^3 zNfghX25OJMozLOCnO^FdzY)vRT~P?#D@Ut@e?6GggSET^6Px z>sbSFIqPz5b;X>fx22nQ82R`}AM0XtMygsVKco9n+$<|}>W1GiADQj$D{HK(WM!AO z26q&9d(TplrGe>uZQGc*m8+V^U-Ah2#e1iJ9)S$GzepHVIqRBXivie>!Zeh`QFk8iudRZR^8L!1@2DKd7 z+a0U!!^1B&#I31T^l9}ZY6o?7uDViWTHXQ!kZE^oID>L6W}=xq0^T6R_&(oS@TBG+Y(DZb@15p86lD?MJagC11A?J zsT;qh=Pe27zo7@d`hKGU`i@{dJpx%SZpgBQyUpEb|0T8*$wRcn@qZ@i{2D9t3~+-3m3*d&^Qd1LDkAqiQp`P74y zxG8P+J!8pvjb)MICmDfp}X%#&t& z;0- z2TA1F=p8r-kGfk8-Xdv4J826YI5BwA?bY(bdTXT`xKhD`Mgn7tu8|x^KWi1?G{a@~ zmFo)R@5_3cS2awtor`D`K|4*yqT79Uea-Qa$z{97YPm|jR%aDSB1O!5U8RqC@`u&? zeO_D78QVWi@agrBRXTVV^ZL_O7kz4zTkp6fk<&x?5e7O@Ay5gcj<|c7En_VjO$+oi z5l3q!`S|d>EQ4y+hXd9AgVNsqJ`caRZvk0N|C9p*nc#43FaW2qaz$Ch*vyxKyjNDz zm|o=hd3x99+U3TQyCdZF>(~&x()G^K^&q$i`|Rnk(M4~UsvWmV=*>x++uMOfmVbf{ zxURp{^>kPVCty}Bog5a$@**Q^sHbIGl3 z@{$j)icRE{LAF|Rh26Lc)}qr*%(S0u;wx=gVeZNFI%-^NOESzhIEPc`3(fx*Ie>rHA5VE7FW5Zipp#9G(2

_tkgLXYrYJoN(Bx#x9vW5NS_slF4mpylB<%CfVN9>WaUR%TSFc$TW6oP z=o%UxQNRYGE2$%}gVj|Sf?e9~omxjfOq&&yRZyENwQFrvBcv(g{6zci6LzaLTMX7y zuebb`;*r7Qm@{jL{>iZZ$?uM3-#_}=QaV9H83eXEhY7Oc=%vY2{dFc_AG{>LvFZKL zO0juyUH`*mQS0Uw9*Xp2uy_DN4OWu1YtTM*qAt+DFW*NRw_> zQoM{a+`7QsA;fw#_q0{#n7Gl6uGss;r-t^P9kHLp9%y<;q==b+;-r}>$@;ZcrkXM&;Gm| z;YMYVALtWq%(WZXeMZq#j*>R-Th48vEwnKhVwB9kT4wvm&ll?yb2vB~KtR)%yW~Xx zNzXlozDYG8iW4eNo!#Er5eM9*!cqY#N)EJIHr?ga7rV!i!jh%}mTVKwwh4i>qL@sx zwx6zkr(0}d?|T_|QzqbN2KHo|u+yE|BR7VgucTv?YCNYhOpee;JDQsu!~)pq<^uyvF=1O|3_pbVs^=)Z_F$W`*yY_Q#*!XpPA2<4VeffH!OwDFGY^ zvl$DB_jif_?lYo7(dUac!gKt6SR3*@(`u|!x#%RI5l0_z`uq7kOtEG;`Dd)|@Wtjp z<^YDRtCXj@bi%;9_c!C`PxD1YB(dQM{8ZVyF1g+tdR0 zLyY|!_w|pO`hTl1{ksfbQ6&Nid?M=>_+;GVL}KtkgsUQu8D6%!ocJCXyM}qk^8E!% z_H!4=c>y*SpscnCy?3Y|TC{L~f=0%Lw2Y~n(@7|e>oGfbG&WZ`X*x$NsDTuMHcvYX z7}@<@t8MI?%>TV)gUMxvuyp$jw=5a|VYjYV>i;PS5w$(mW&eW!4QcqDAHHgQ|5r2p zcMPs(tat6`klZ*oY;~<+p7Wm`wUt{ZK_6zc_!p zAkeMizXKZo|Cxp$>G!&T+A{wNPE4)9pCA2EGh`>mA$GN`~S~W;>mxd68Qu{Ajzx8g-Os$0M@G( zOm3fA?RlZhc!q^NhfDZpOU&)h`bIPOXDE;iQTy_{de2583Kn@7qkoI|#B8%a} z79^F=oCLjU0*bW#lP6vofp?VfDS+OGZ>qCYGZLVwdO+uRRcP#@CL}z%kwu@zpQaYV zJ`#JySqWj{BQ(-jBK>VWJ}>dwXg5Tn=FOOIyn^|CL35g4a+Wf zH#O#-57n=)yU!hISag8Ads;crq?lu$PM~}n)4sa7eUC2_*{X*N5w5RG_i``Ugrpp2 z(%4To`5Z1mjqlUb@O)A-w_n=NIKK*;C=Xt(%gc}KZjG;?um&C?F^$Jf20ktb3TvCI zw<0TTKmP|G-=&0{kiDQ;C2k8;9LXt$e>l$QmndEAI6WeD6+>SZ$Rh730sycNee-~< z0MLvY?S`4*;oywZl-=5_U<60>OZ3^q$oJ4x{zCKB_9B>2p>U~VYi&=hjx3(xZGPQN zec6=c*+>q{-2fx2sb`y#{pa%HipR@zjf+0y7QG%Z)>?l6$pa@%Pig+_P{R>q3YK_a zx7`b_2Z1_Yg4r=L3_#-cjoqHMFPv{Q_hC~8>vtA@&ZM6mKO99Xm3wn{w_u9{ z)udQYJS5?1g*}&PQeSTLTrLe-4J=yK51$>Q&zrRQimx>;68Vk1>n;X`oA6_rNWmSA><5U(FK-nkr zF94gL^q-hSj{jm3FaKvI@u3x00KPDMbA|oWF_Re2K)yP@EVWR>)IR=f%s1iM*{y~g zPvkbYMT*1DlIDBBtC0((n{>h!IsPcq`dszBOO%~? 
zf8ba39cCl0GQYBZwyn(f{z?Exk^--KKq&O7P-)vyNnVq(i{WI!$Qut+RlqL%ViGIv z-}x6Nap0es#AnA$;@x!|&F2eAW;a z-M$bsa)vBC64&8lMc8C72kbutH5#`S4GY1V2AR2PX%3fG!<*`C{dQD zS-*TFp1!4znvg+NZj<*q1ac^F&J%1$l-b#`)?0F+lO_MTCtBRcX|}hB<q1rKcBkE{ZD+NQl zDVqfVj2oj*zg%$5#OVaaU48S3X_{b9LdM|iM4k(Y z(dPLdUqd%tALm$EeZ_ZvL{?uLd2-$d@yyXPmS@rG^XVgao>f3`Th^=eUi>#x#ovS? z(s(z~qhmMA*oFO^RvbK>Zy>yDvz1Oj)Mi1w$=?3D*{q308u&y(c?Ns+I_JRbf*S6^ z(`OwI4T8qXF3Z3w?j;)+Jbv<#ks9Fii-xVsulU-}&p2ER!+8lBB~4ZOL|^vqvv4iu zRqs-q+_`@vTF&39=L)DXjMQtD3FMW=E{8UWsiAR`A`pfrT71k&_9wdjToiwl7+H*Q z`{Zqt<&KW(iT);7<+N9GsXoln8IkVvBdn!s1e#?GhhX2~{LHc}r*AvR)9m7C-D?`) zZnc%%S?^N@y;f34hR9qY<2=ft%b~X@U*lm|OTX2+cxx9jnKOoC`JmS=Ga=+@U(9Y? z+I0^ZOKcPxyzt>*Em5>e+*Y8LZW8&kZhj;ml2B(~CF$reNq)vnyy`OkzNO;Q>~Fot zHcK$Fv8MNVz;jB0?5CW&hKu=~2X|Kx_5II(9u@xK2OmOF?y%k`yQgcor3Z^kH^f9` z?jls#hWn+r0xYviwSdC#;yaS&!G5KuU`2ys9ruMw^_iV`$3(yXN&>Oolak?#EOgW~ zlU|IGazfrr)NTw8Xih`xk~ix!+D2TpCR&ibUikI@~8 zXI=QdjEKMPxz1033zD(P0jL~rxqy0oG!c)g!woR-s*j+h1=wIr zOIpp&E7;cr)c7G>7gofJZnH3Uo+4Q==H1jlVi(~dJ)#&7$K2gMC|@~hWs`Gv`U>3N zbRJi(2Qzp-TVHppF{dgTCQ^vW-Xo1geaP9Lb(=bUd$TjzxM2vci!=Pg=Z- zkYRNw!>>{IliF4FP$)T&0HBI0^9)kc$EafVsnj^d&JV-10Jnv8cetA%4)7ibe?#C( ziYQ$N1u`#3?`9g)9UZw9I!;{0Y`->LhsY?Y!!`L z&Cl{~5SMyb-hg_Uk8=~#ns*2H>gLFOf>Nvi5}0z(u4ZUTmMxkPXTRfo?z^$JNv@9n zVdme|DV9qzJ!iusCq$2`#+RN1Vq7FP(_>}iNlsxv3r(!{4|seMM{imh^_%|Zv`dQR zmUW(yd0<}h-c6px4#oGCZ%mO(aEH78`kjFi3Z@Rw=bIA*9YhfA%tnFS8=j65rA=bL z`$uujM-C|<$rBiv5K^yrVjr+bbb$SPr7$TdWeXm=uvnxJQ}<1-y3A?!obTnnxrcHB?zGdY5E%GDk6exTttHfwRwo|57-<) zEfN%v&jfwOA-3{t*5+C~TTNNf!SnftTMOD2tuPNS5rTS!jC#UBDi9|>R>6!LAi33r z4UDUKty;C<-5c3C5~VKv_38=qU5#x+Pgf_F2!mVvpv9I)qQ{@ceyXc)8$8D2PeSbT zc`T0-$7d_Dd4!iokH7sDN7;<-u~WVEFDWPJNn7v(?FsSGl*wkOgxg|cV&kRpfEHMI z!Vzq7*|Uw2SF@>@=qO0rnt+7Jd1N&}8k7upH$fm2)!#s}#eyiD~=%S8s$;&6evs*`1ISCe_QQ+!`C3p2FAJB4{XQM87Uv}~b~K^`#*9_w(E+^*OY?jQ zwd%f4PCIFm;xl@r?L>gh(BRrjV8aSxc#@7;$XsK(wj!UpZ0oe?f&#t?CT7;tUd6!! 
zk&wCT?CXCBp5A4`j;R)mkBFL%on70Vr)JH9+8`t2M%}xR4@{_#p~7#EtMZUa_1>lq zfXRF1YN_>T<+O;tCggBpcUdbzxZ+kK70d~M>E<&5*MF>{{SGGoC%C=;dzZ^ohB@~s z-l9zc-5%ROuNN$@SKvO@u(!#DH#gAF znLbWPDDh7$k!Z43%u^lDbQ_-GoyQev;a|=B`6BrNDe>3G?q&e=zA)~R%jxYd+srwc zJI|8BO3b8o1CRpcCZ?>awg#JKQtNS7(*KoYRCfMvk&G?1Wnga^1M0d?vzY48@;H;G zbX660cYkU`FTVEy3t~`fTQ#OS<#&*aX15_4o#{Q1nBkQ2`BG6i@T}H z7GGPTC!?PO$M7-JNDvgz64sE;Ge(?}r`O`^w8=1=o#J?gKDpzslYnhP#c$*HpRHD! zqa@tIn+0ZUi_;)AiPoN-0xMW}YmdAIv|h<# z<@oi_d_oxK$hxYQ_^L^Dm45*ot&ai6b1M}1t&^_%#T;6L&NxK=L-myBrGi86cApv= z;*0|A4V5R5y@bKb;msO(GWugxa=ZT1a0PN&nBy{~9=a-1Bl_5_?nalFuS8&5uItXU zCBx|Gmgd6cKcVMZ6WmYL+33Xnf;CJ4I+WAdvyAUNA;Z}0otC6Gv7{s7A<(A#L>eR0 z!OILlMi+WF7bAMtmCXb2^!?i2 z?-RFIC+>}+11m=S==gNFi^~XAk|H#mt8{4IUH|?@0?(iVr@l>fV0QIhk2?BZa36_2asB*>spkl`u({W$Q*Mdi zbdD&E#rmeI)>Smf&L4M31HDnlMG>8t&3c;Eak64q3QxMsP}`neL-H^W+h^xf(&B3s zR!{JmKqcj*V+24Bdlp zG-J)Na6THiu>r$U!8W0nIID_p4VtX|)}NCnS$FH1HQdwE9iNT9 z_!(kUHwyG-T8PcE^6MVDo_f40?$ZW`>arLd@oy|r>aW0C0+A-rM&iCh^3#CID?8!y zj47l86vGcnc(dHc$P5zY31iW6A}#DD&`*wRvXjEdv;bvAPkJd-iM0TKrJtr_tl?QC zactWo#l0pm!J%2Zd~Tzk`S3@nv8mPbl#N);>$l@xlsy%7O>#4|@qk{-V_Erl7Z9hg z@|9Fo_Myu&e3nvc2dm5ZFtv}W2L*sGq3Ea_p_BBGpLSA(1${qD!JlO)OF3P-K( zu5`{8z%@L^_XFN%jiktGPn7^1W>78{1+1>@J0iV`spv=L*74>nqThA79!#?{l80vm zadAA%y3nY}aeHlU2YtvLH(i$8kyDqj_f1^^Sn?47OM)zR3+8PPF)lgg+A-CpWmHXL z{;o~;)qnOll)Ke&I-i(I?Fam95K7Y-<|zes-jnrP@)02W$n``=5>cg+O#h5h@?9xG zAS39FZQeJ{|dEE6Kc-fP_w9Avy*;iieZwU4nNV0EM!6EAo`+_;0yFRw#fxmU~D+EwswDQ47Z9e~gs?3#K?@u&R-MvDV%$ zM(({b`@V1`Z~L6|NJ0rO>G@#l@uXLNO?(YBedvOo0kVpFCDkuv`)ka6!j`lXhe^Hj*>^IHu-K1dOzK~*}D=5(b5-XeoPN?|k|Kk+Thw_63 z{YlK`y^li~&H~^ZZ?D{4&y<6vTjEU%O*P@oJzU3k)v$1-_v$1g(7E`UbU`y?$vYr- zx4CPcjnCBCS5ah`G2s-_F7Xp5zDjuO)+!S%wzgST|997}gGH+{6AUPhaj}NEM6-e( zS)HD82zOm+OvB9-I3*+q|-3bI$1dYr#qnepC>KX+2V3kix#?M3whKvXA8FXSTb}9 z{LgiTr8n=qufO0&YjqZ=G5UHhU>Mh$!q_FC)PBz6fD7bwBhcWg?m{NIY8EnRY*VZ% zG|aCLU0g&S8=cj9VLywbp(EFl*T&-1>IN_Er~RrRGlm)Z#bTsQngc&0>Jj~$be^mm zLQxrL{nh=;>-9U7{HwK-`q$```VGDyLOb%XF~6n1ivWJdR&0R}-8h_6U;G}G2wU^K 
zNr$-E<>={112sKw#A;FcF8j!EiiG1+`7H zLZK~{6;ldMBToPz3VqYd4<^|I*xv{GkM>sLQ_=;E8zTV#Q4>o6+@)22{{#5^R~_}H zn@jBn9d;DJWS&MgZ4`}@bL84J>GGb8_?!SWMvyKD_7oSuk7{U16$}3tdv5{`_5S~V zt4=y6$tedRoU}M)E4vsf5s9f}-;#AOWZ$WT5VB_*DMrPRZN{#$@5?ZQF_fJ#WE;yE z?)RwE>3o;__y7OD_wRo%*L^P6b{v>Z8n?y5hPYJXAv&C`=`KzKY1yxb zb)f%@pG#^upU#)mcn`wblyP=i6ZY4Q6e`^q#byX<;l3;kQ;)#I@4H0cqG1x&s~6^d zc4?S2r}qzNcYJDv9bK2g8FyckE8)S#CO(4?OurG6tLxi>2JR{CRZlHtX9U#U0ebew zM`zfAR9qgUYW6YqIkaHdp(t;oI!n;!dICuPvyyp?3RPqhKNC|9g(!K9RM2*AF(!`` zhG5|4O5#$=o2Dd3jc6(GsvXsl=<(qXAtQ&{_>)!~S84On8YEsY76^5W8VKM&3x3il z(Xw0j4UwXQjrGAuN?XeMi%mOeqKCN=M-P5|PVvMs-W6WuWWcNFfh?T~vN?rkd-gXN zATqP22b(6_=USsRd<={`d5;|#47I4?cY3I#UtK8hcrsgnkbd|bQ~6@-8Z$gz;g%O_KE4yZge8m|Ixla_n3<&z=v zwhQt@iY3#(x;r{iLKenynAB;eU`Z@6O>2&2Vc22=A0EPCSfU<>r?*RDvE)76=yQek zEaJe9%uLNi<-sn_226h(6ZPcENmQ;~1htPZLwcsk3gw=&UwpfEX)PjoC`v`mEFwG1 z;PXlY%e>oLRijULdbWJSU_D-@D14rf>kwZx#*;a5ikiG*jXE1(c$A@o0enJ2NV&Mt zj#*T2^oo%};o3%}1-NS1zGjZxrl#e!dXPwV$A%zw!35e>*H&tPTUhZMI(Y(&JI#dW zJddp*KNFiK;|!gC5;?HJp?&%Lmhyi)>?kL((~-y}nz~hHo{^G8h@U9w%5QeA_Mvt- zDeCF#Yi^Ab%kLNVl9zNRY-I7~RZ0T{c1>4Bs*{1xI5xN0D9c6sG1q8n^5QDfa#sRh zWI5M5Z%Y?FAjk(oe)_f*?@PFs;Ln>WQcp{M=@E-#sQ@Ohb3H0xN9Fb-t8RRHK={$oZ{7egcjrgkAvDmetnlRr_!4rD_qozM9Lv)W zfTutw*00=~gOx!Q)B#tyv@`I+IFW!rx zunuY1{i@p z1YZoiz;w@pC4eRPN2ZOvdM&7dPl_Ng<8(l}dFdssHC<+b19p-<)CXY53+MgxueWlm zRS*-R@iCUETAeW(74P1>m}$yez*~r=tJ<7=>BRAotsq(IljG|0S3w;&^YHQb!gZ zwRANvthSAXP{hqJUJp00lluxXO^r6@@dua4OJz0-cPgp^XX(F-B8Zm*$a|S-_fBuC z4W@b8M&UOFM6tI3XbRiiQ{^Eq95SAyWuIt8CB(lnBWUO^oqA?-!QBb_PL}*}RB_90 zw!HA{3y(lmE{vmrfMI=V0Sx6Bk$g3UYSi5TwS$l}oSiS9!9)1`+ zMS6H+iw++&hS(c&&b>AHZAM|Sfxjvmi8(1PPT2k-QJ0_g($VJBj%Y0TR>ffQENT5Rfir6HQ5|VnzRX}t+?(R#@xkC^BbR2#p{LW6Wb#~vUH$NC zI=9>Y`v^MF`eC8Kuza4HQKzxDHI(E~a`xHbNWJIl)mACk!%y`@LV~JN@fmh$Wf5rPf+UY(+Lp1aNHda-9^Eqlokc{R~TjIcxh^cd__Kb2j_fp zUFxI5&VrHMlnl&wEAET_G7ZHXRyVzZQPj2{$kjJDjq!J@i`e0f3$bRKOnb zY?bK_`WBWlK+4|3CEn{BbNJ;q=v3tA(&H4k^5Bzf2PlFSaIqF^LJe#&IgcKJ4YNVf 
z*X>=zas2}ykYr!XyV7jRBmg<5%mLZZ_SAT$oD=q;U(C&|sL002XV>eu3pqqUsinE_ zoW0y~bJUES((2{+LA_DO!|`k5_3q4&hldu1Eo6?j+=L#Y>|EpYO{VCa(-;G}2PORP zVR9un_Hut4JA&2P$`yh-{kCnl!w}{{V_gP;1&U@E*t_@4?r{kZl0Zry7&&9TPjs#4 zV)I??XsOXRgxD#7gfdoeT(!Fd8;a6li`hRzyl&*EpCBWuv!A7c+}9NC_Ukx3Oc(58 z7xgj+HK>l>V5_$Zs=o#ykfe6n=wf zVK`FZG53i@ZNmN#g+ME)+{{hznp+kNbTX%XW?qB3B<+7yaqP>s4v?@xfk% zJVBI_4gVOttuv4Gr}{Z*J;li#_-cqss!$X|WZ@AnZJ+^d*%{MlV}62pwr_GV_hzr^zm+wURft7can@xOM?=!@3^HuGF5SaUAaEm4YF@b*=iJKV&X-5~ z<<&~HpSctKt^d$)=(5u8UUtyqY-_Y>i5YkyT5X|-tiJ>IcIt{G0-0dX zv8ss=MG~hP8+yvR%nRN$p|iq{hnMcviln9hqv}mf23QVEKc640laNuD`WgHSM}tpITrVwpTIh(=+?C| z)XZYaO!(|^MQ$p6mLN?9V#Ti~_kY81?fMBcRgZ@nMVn=;`$W=d8EYozE1pyPG_&$i zZ;~_uHfz@=fIbue!KL=}(4ulZ$MW28@Eml1+g>PPHDI|Jn1)|?(~%aYKm7>j|E*Bq zA4ORI4DA9AgL0r=Y3)EgneD!QQ_bsxOBcUD;!XgGgJ?%P$k2VqL;u0w^>%7(?4$u$ z$_>l(VB~6revM#@y^EIAz^J3QtjVJ#H>k>U38dX%ktmOMgF}IhTrj={@oYHtt9$3zli7`)s<92S=4biGD$sVO&J}_SJ6>2 z4HQWUDLJKLBsm~9Vj6`>4G6e+Q^qup=6|t}s>L>HRNFqS;Sm?{+KP({>5OXj4CQ$3 z^5iRx3$$kHfX2P9OQ&&Zses0f3JPjBzlCkM_K!5KH=V{^_>;!{bbpt|jotklAQk|; zh;G38<&p*9q=|D!3?=wF&bM~{!PcjMTu zcHexYe+nI3&DZj2en1$+C;Wl&(%wjxP_sC-kYjH4ndb0Bof z4*IsQ6=h7Wic#cYEF8_ZACT5n95TI(v6bcN<9)l*-Tq+8;hPY< zw9gt+Kg;&U*8lOmdC4+aNVs7>;u%bdJup)4bLksCu1Xz&P^aTQlV-oQ>yU;Hr3kzh zvw>vW|K^ycZHOS^drBke@7DiQQ|Q>m-r(?WrqC}u#U7MbRa$_E|0MPUm3w*%m_pru zFonV?O@U+mG@ieiLQm50ShJI5!VB7VQWf}* zYO?avNNmV_+BSDn>-YeqYy8@FAN5;N|#K@eGNP%n^f%Zrw6Z}^-qB= z2)62ZsI0$^{~|VQq8{qA640n33QrYv0T1>3uF<+X-8A6R69{d|Y7mTM6NJ=?x78N5 zP3HHir&*nWlo3bPR;tpjg8URj%1qerXf|BC4afa-e)YyH0c8e;gD-bqJMe){Us+r{ z!s%G~#L8TBY0k-;Kj%E9@(+VHqtwcvV3|Bj7|H7;>E{@zxl`q$L2AAIoWH0^O*>;~ zR;znZ5K`1JL$EWSaOK5J$tqtf`i@;ri5v0*_C@HElatd9U7cgJyH4!lgad>$r5}8u zJVkdB_?8yuJT1LWWQFK|_l5exzxhH}OOe1An#=vX4glET5_AupRqWwvJm2p>d;m-; z7y?YcVY!oZe1hSB50<;1&Ud{1J%5ajw4aTj8xYv0+w#gz%WE#~-xh9Sq&MA9v!; zesUUJ-Rbi>CnpH4BKto9=Bm-bTveTSO90GOQ2=1>LRR z`x5}!0Nc^}?yo=ErN<{2wRaYF`lO4ngzV=bm6^hB?UcjfEbw9xg8s&l{w#JGMd?S< zi^B61R!x}#&w+CIPpt`X?!^>Q`*A^|`u6M-JJbvj3_d|KF~K6v$%QM*59{BGU&$gB 
zkC=8SM%|piD;CIr&_#pifG(DI>15hSl4ftu`GXh50t&_J975~~7pQH=WB9(5AaU|2 zW!;YiT|dmwMdgm`8YuRujdmZ4u<>T4rN9DDsdC?(^r%b$Nw^SlQ5v7r5`_zh#}r9e zk>s_Foexok!K8gtQBXf!c-2L*O@-h`v}Pq*XGB;yX1=m1`{VYSTd_km-@p^HR^d$&Ytrwjl$HSEZtUBg(~IN`V2Qy zgnDaar<8unwgKl}wCt&TEZrb_;MiXVQB8gDrd3tQMt#1Mg>rJW?w8V`_Sk(8Q%e32 zLHNL(B;R>>-fD6f%_=;I-dApC?dWecGMKQ4LE-%sHL#OwLjI}xlp^w=c9Py;Pnc`^ zmSceS&xhlo0Y<+Ux=mty(%}U6X@w%wbIE*>W_F4pAKk9|KyQ|HO05Ohjwr}VKplz% z^A~>5h~f@};7$WC;h;84l2VIyx3gIzsjFNxzo~iP_7cA3*LUqFyofCqfIZYbZQs+Z z2ucS9J_6sKh$Q$K74 zjO+RwbQAH*X09X8Hj3JXP4Si3(U|3QjjK_P1tv3)&MYVRoAow_NX2oJXK<46x#F1% zWi!X6h<7E(0_hG@F~<0-yf1L+*X;(MQSN6f7(IfHUJlqcY4FZ<&t3)Mv8n#82TMFL z0?;(Ruw9er5!D_*@@_N8Cz!2 z;#m0^c%~V0Vln6X)-fH;#z4woL5rXrVcXe%T~fzs$K&I5kMP3f&HD0zou94>TGu+r z5v*%7&~3p-H+2w2&TFg__g1W!Z!l!j4UHYT+eZE`P>Vh`c5$cd?kNM9qohn^giQ~A9dT=M6pg4%J`#8l(dg~kDo_HANf;Ysq@_^ zceR$w-Of&~#BAoI7{Y~1SzW^y0HVEN2Z&X!3xiw-O5dX!)Z}X2pE%G$hxWTCK{}eG zxTb(s%l96eq5H4oW^2l+(tO0ZQuT{u%jI8e)Qt-u-NGyIjj1E=wF>u&2ece)Gm15X zx)i-(%+B1^)lx`{Y@Xd)ch#3?&a?VF%HZSmdn%ky$(By%QlnVZrNB&+iTZ`(Q^nhT zmKS@hhL%q2$8_5^LA1sF4_YPQg{vOMV*qWEsaGb0oOfAJaZE=#?~#d&m#2pSDI0tw6V+FwlI7(y{~v~+xAFj-YZAuv54$QP9K{jj$FkD9}!In0k;X0-dOG;@)`2h z+LE*fA-_MiV`*ero;Ke4}}+=4AQ@svpNzCo_9XfEnVBP|>bkmFsyC{+ws z6tIitO8klDDmJTFPN0z+?JD7&qBR4a4oa;`sg-$#6t;^dd zxp|s$r{3gz>_pw(UP|HyrFfjgjk}bbz$$;G#ELV4rMhk3Fzn^M-MA)Br_O}SnV2ktm!LF zFIq8pV+k!SjJUOx5lY=3^I=XN4&8|tBb=X#3GnzH3DY(^AP;#`@$mwiO)ng@mBe^V z*x5e$T46Tlq-vy5?RrwZroQ@+c3aIlr~AaGRrHB)ef3t&Q~Q{qOYcg0Fc+0Ks}F^+ zh|JTAW#+XY_qw?vZqAS28e=vZ>fn!Q=c~(07FiZq;m=)~z>2>Eh8M;%*JH!P3~W@zZgU-eW4~t$ z4x%WBRu5@AqKQ5zsmz@C#@Mvqhi3i|CqpwKHZy|ZY$O{yyCs|axs@YDP*gO|RRAU9 zptU1SH<3cmY?Ly50Rg!OHMIsmM{Qqy8Hwg8AB=o$4Ub%SeVseF;OJm+m@tX4WwNRo@FB4tNeG&FfO~$#LDv~);Udp$7dSKel`h@?O6Y5Ga>^VNl&loDz{sDPU!IY)=BiZf{ z>H@1yx5AoR8hzMX@i4C_pVMIsKK(?a+*9D30q?CL2d<~3;^ zjRp8oaVr7^_gXDZy_P_{grrqdgUlqbk2p-7#3OcFY$7oCokEnG%R?P$%Ia!4t_RWjii6DZ8v$465K&s)9%6JDNigRx;tWOr4>!#P~ZFSJ$U(2Q3t zeJ;E5_*T9;DV@htG%~J-jfF!v0ueFfn&Jj2IHW6ZG;xhrpoe#s^OSV^2a|z2Z7{nR 
zvU6umUfUm3@_0e&mB>uqtm^3UuWhe01;)^0JVZ7J@8H6fNWSDHsY1VBIgAKsT_Sq^ zBjr{CBjLb30vks@GvOPDhWAP37VhE+dFYS3Et*1(C%FYu zTf-+8KwzD724{(EzblZlQIzs*Mn%Ck;X11c3~VatN;L8nUI2OG=WyqiKR(n#rM z*ALi2#(}D099J^km(oF>$w|-2d(`ZD@RX$2k0#1-dz68B@SXM$#c6@T^Vura>r7H& zv5~g@afz-6Zzbi1MZ8REy+OEAWT$2Q_Jby8t%FYqxi*70JS5|>!)EW)8~*pucdRiQ zi7sBZ1C43NG)#+$v68`3+}F<9;XK{>IufXn%X8-%b}g!q#rO+_DMPG3Gk}y7-LD#G zFXurjgzviv?09RXoOaeD!LlbV)+*ZAZaAq2d;Sd1 z2CSlx)Uk3wW-V?N$h-IDe&Z?))x+1oLkb@2pO3LqIHjH|VW+@62mLj7N~?7ioW(&6 z+(rBGMsLrwmGBm!RYi_bJq#-zVC0j}(Ks`y$9o;C4G1H9b_yXNo@z;8=X|xq!3rRN5bob z193CX^uD=Le1uc>sYc4f8}M{IheWyynSK*3Z-w9h^K)Lia8H1gW<+|UJ#N+wtd8m)^!gJ zrNUJt)L*0ZX8Ew@3+5hS?6Ezc2)lSjcNn4q-evKdYYcU9Y@H$>s|!!*9TU()$!5~- zw%FIE#{V#Uz1Toey|ier*#V?B<64#V{VqP&UDF4AB8tOa^%;+QIaL;H2UxXM$smF* zB`W`k7FULGG_Fd_%7jJ_RPUn2BR#1XCGxVlpEV4y+o!bC$zr~`C<~+g1c9?_X+~TP z^igc`lzrMj^*gtxZhjv;ksucHj`j|oo;4SHZqjt?5LDbMnFD3t*`Pl>d`5)HX}e_e zZnPkim$>jglRP86p`--n_Bp4=%n21(=ieF$9W7hgt7j%WIKVybFN$$!SkImW~ubFTTh;vI~z{Y z#fS0l>>W|!yaF=wi&$_pSg?B_){tKW;{bd?8Pi$%*w0a;l+uBfPi_@gKI7fEN- zWQBy@*2LB2N#4J$Pu`0Rc9>XhNh}*8^Ck{XTxH5)>+^jjKk{p5W660IH^}}`2c>Og z&LOoAEEDv$yTFP3-?#6EgItJ=Rh4+eU%c_H;F};Rb{ycU{?|b^L?u30F*;Iyw#@Hy z$HF&&?jh2spFqEo9YPu> z_WHyHLz}Z-_8N}^R>a(R-pgb>RHU=~e*1;wHlfXbU(F4IE@v*RLCRO(6)-K?U-RWyzum|WNPYPq1n)95-Mjj9MH^!BYDn;Om}a{c6j#B6J~Ki0r__ z@csgLghm6sGvGvTFpZ{_0)@FQr#FoXdnLX~8=rtuTB3k5RA5Z8N^JpsLfB*9dqs?W zA~N_cF24Cy1gCxh5v6LxKnf<8R|E*~4PV@sw8FQl)xyd^SJh6nI7bkbDs40*mB_rA zvL&GFV;kubFyzcNy%M zG!&+;>d+Sd>H6B!1p&Zm3x6ZvORkuDccHL;%K!PSYgjN*vpIT z>t-?72ioSGGR46VR={ka5^{`G(jQ)ss=Jthwo}7GcW{! 
zRhVpHwxS|f__`0(Y_r`fBr2Q-o_U(0bkL}MH>b*3`^VGUR+Nb*D?>XP z0&1&z!9spd_v$*a3F0he@BAqqHbO=Gsw~$vwRiM z#+-^0EL=ZGQt!iZGy5S@iz9Ju+r#4pb63Z=zf}1gQ}^w=_h!c>Aol$9z?T9;Mg|#P z@OEIQ%JnVvCz@@kE?dBSx1)CESFN)h?2g$Pj_V&V=JkWI!1IdRJ!DK>gBOK$7|Bsx zQN@LMlJmL&kB2i)rj!h#*wO1Ots~O5&Wc>8_n#NZ)Bvggzh2PX`zgDWRaIEw;gIby zway72pY%>Dh!XLGv`C(wD|>W*QK4-~a5xt4ENQ2aHBh2h56n(Kfrk{wKef>fk{$^j zT$uS~i4})2U)S81@aNlD-&I1j`^Sah^h~+pnZbnEPD3()D9X#ruMaoaACM;uYL zoZXV%+jR2##gN`BR%!vR8D-y0Ua25ru`4n8j}n%QRF(f zSV%V1lrWvUuSq?TYGeGlab`FHA2zNQZYbfGx({5KR%=1KjAFLKg3-&aG!~bsjg9%` zE$9GVM9t3)A#G`&Vzs}WNq^&zpWy;U{ubaEF_M2x542~c}ByfxOeC)#)wOC3RgG;813SL3>nKOer)LsFvtvC`0ZU>$r_)o@cbh9_Y`JQ?EQbTQ3Wsd< zWx3GAxYN+Rq3q@y(#$DXaiMLXGs_c9`RFA51msSUNkiu%pKAQ#W!HO(YX~)3O=Pwh zT&Va(w$=tI_%y#m{~K#;l2X0t3wahgfokkz^6m)4RrX|4 zX_$alS9v;(jOeF$^RMNS7_b7Q9I|A`fPPXQnaV*=OoD;k>(v z4-WwarBll}Y3f9_KDQ}gh**&~btP2SI%gQSYmy%LRfqY!fiu$;wM`m#@lnYv^R&}X zT_uP&qqS!)fZSh63%pe#LaDw;x)5;4+$npb7gY7WW5@?J{{@B|uVI>Iuq8|_eF_)6 z2N-jx1tBk^^m1l3mFB-;=DZ4(QU-gw+kf;xWnMbDxf8RY1UmO&z8_?Tu@UVc=q+x z5JfJ8Ag00{2p5FX2mL@CqZfrYQxhtP4l_Nw)el_9F9*kx? zmV%DG(|FAVT7Y#!;d0FAgmPQ>c3rzFQ$xa=<0x(NVtDdY79Ct(}TM~pal}|J9{lMMI&&y7P&suHW~ua<3du-3;Q5~q`-c;F|9XzKIDq4 zkBJY2vpiVGx3EYGx2j}VG=Rsl*R;eAugbOwGu+qQEkHCve$OLu+BWuV@BFcktUKs$ z=y8}F18#Es1ouOBohv}*?tp>#is!2isoUGpy}HY^62>^XSNE=v4U^uecWYstSHU^Y zet3S~ulf(8=^IQ#%kX(5*d4D%WDEvrJu^f~^r>Y^-K`q8lkuu`_g`3|=H z+ZAL~%hd(+By!flXLCC0IA$NE#_d=5W}Kg zSuH#_vM=9aVL(9Y9y7z)LAeW{5x4oUVV6(ua3kTp#dufaDrl+K3_ zrLTqZz57z4#V5_}_cVL7zgy<1h7)f%LM`U(L*=x4igRNAHuJtQq!YS@HEpr^3nSiN z7$4LGQ8dBfqVgtmt-Zurprn1wy)@k+@whiau^vsnwKUg{(Gc)w>Xu ziER7+re~0#P9$Y~z`x*HRR}W;HB{>}w^pxl!N9RDYF9!BHs?6I1QVZ@bO`N2dN6ni z8Px+d62o@7yiys}bEpKn%L~|PlO+3vZBQJ}vZ&SAuA({?+oE1*k;aoPpD4gbTjv!? 
zbO~t4Xl&PPdG#VzKJ%1jWqiA?hRb$yT;sT!df?MxG!D-2wy_TgC69xmsai_^S;Bdo zM(Xg;>8X#;$z?NGqrtS*;r_B>b#V9P7(n-eF3k2vo^!2T^PH_EZ1y|$9S(v=r7dBl zP3Q)?Bm;&#@$7dmZRX2ciJx^Zvj7nlCSX~-eezcEv18`6!p&+7xOx z+77}~?g=Nd3st4SWUL5J{R%~Q25)zFF4v?7uk=q2Y%e(ztemXM#wO6l7;)NAq~*5s zOZ|6YMQ~b(GwDXp1uTtfth}aV>*6E5{IQ^sd~7trBY_9X8nca_7#ZkasWQpr$K_BO@l{48V_jRTnyD3bmn+9_o z^)d`udZi>@s%QoqGQ6;lLFO2lXI#6)*tr=VK@S_h9DR7c0`md#s0T!i6GHQM-NpBR z`ELWH->===-D%SSX_0}lant`4klyPwcK^oEOUs{2@1%-8vHN^`EG8h7{x5``;j6C# z8yf8?v0=-IDT|vplY=y5G&;B>N{6(;7=(xJP*!&b^~JCrIzsi}pGCw2-k^q${!aks zpCIv{#mN86ga3)?x)h{+-k~i7H0QKl`hj`|1oSuXIbCPhpLpl+|97b6|A#Q#|0D3U zHJ%exQ@`o66J=v$(ZclNZG+td$$eEA>r zFr-Hc1q>aESK|FIba%53jdz<2n6w)i0G@lBOAbrdGwfrq>I2xc_vi1}G@lYm=HFn` z#QuMQO~()YD{NXld>5N;G`rWR1F-4MU2NJsh7Lie1htRpV1uv!BR1Vb$EMSRzDbPl z@PImY5fn3UW*phYL;EZW}e|fD<>1jjoS{c5ecuZcHfmOJeL-< zGEaO)2ur)myDe!c(>6+E=7$a0o?eZ3b8d6lnbx0Z6OF>FxTgcDjM<80!=_Z+XCkdG z$0yC2AYaJAL~N+bsuzBV5`6M9EDD0M~$jCGFaHnxAH z5-)KxyAO?eigdzWol4$++mVG>*n2bJe&rL3IWOR-i^=kiAwe@;AbqRCp!i8C#hU`H zoq{tm++p191iy!ALMra%^wwT5oK_W4SXr z-RkoDNjl=4FIL~C6AEt_ir44xV!S||ZMS^SCZ^?N!jupc$s8sP5!muD&n})@Pnism z-gHE_;Rt=PcQ&aE4{VFX!O@B@H#qu*>iIm-F)EWf&2^ZlioJWJ`3{sFG!#iXOSNaO z$tO0HBvu+)7d=fj=FVt&EY*Gz6iaAc1jQ0zJdH@1RWJp@e`}Ix+A6!aMoVQ)6s8RM zj8|2w)6yeS{c*d-J|R=$mTqEw3U{rWS4U=}n;~y;L&T16Rm%Y>?RH5A|GT}%8Scx1 zlS7b5Ufr@c9*eG|27QAD5|q)~otZ<66bpU$H{J*>v8bYKt`_RGOU z$~l%pCDA8YYD;ER85rzeRu%D))11Lkw?F;27w@@!`B&;s2DYZi#h-DBNYNgir>j?s zWWBATF33a@i_F%jO((wzpWmtWE}X&K8Bdc;2gKEXhCn8S+Nm(6>Av&~k3y zng{t~Xv*L+&({c@Mm|w18mhEqQQa=E6mOX;cha{v40IaiXVpPEYRpf@NVzAL6_;8* zcT4F>AD-^aSG-Y&i1YfU);ndelj#v4>hUm3QyhU9nqTnQ9M!RHbdy z$!dJ_ef%nGuW{ zSC3z~<`L-6yKn$VkOvg{=X-t>8m}V@D`SDsSgKX}Z2Qq~LgNag*GbX-rGjmp^SD)8 zURAu^#%1aAq5BzL1F_Ww!fim6f|$xS;wv~)e0=u|@Fuel-wSY&2mTdPaF$B{kAKlC z`{zylzrO8%{}Uh==E7_yd}Ft;fhkp>Gc*OoEseLxi)2B-AI4VFaEg}7d`xyPWiv4Sop^b z*_Z4f7v_@W>K-7qVGvL;9CL<)3P7_JyNo91qoSpE6J+g@W&K)Ta1=sfv$6(iUQ49M z*2D$ef=i;_gDPAv4-;S1>&c9?b(U)bb!8b<%itQ~%i?xUGh-GWk)|8$IG4T|Fz;q! 
z;G`c^!Fgq8>s*n8mIb@==D8;vMi1PHJ@UP}+3H1>$Q79h@k+J_TNi|qKX>%yZG}H) zH-EDcTVkFbTzcG4+u1)-cF~Zh%gyZpU1t1_`7fC>b?@lyDm|QW(QIxNt~C}{5kH~huot45C_Bx`Pb9? z<$HSU8a3xUJNCldQNNGnWA6!dze7oWy1{Plh&n#P%)iLp86?P?s9-moNg+}O ziSRrAKe@lzU)ve+Ny9h1g7%nIQ^*AEzzr+3B&cjnGI3`d)OA{hHSPGnZY7swVY5zi zS`MW)n2!{XEJ^y!TqLh0E_a?(+gODPMT&%DE|^gIRvvGuPI(|%zZC<{9V#H4Ah@?F zh=+_H*h2h`mtO6-ZjNG4%-ZFpSCy8Me&nSef90i3_piTgvw*eXyxv}%Jw`Fyt#s14 zr)W4k<^MbaO)c*{Q{H)nyLnjkTKAGprY6}YEkLpG2qD|zmz&JR9-|}K^z&uCV#(1< zh9@I;a^MkvO^j{jI=QcPUs1Z3@qxSZWx>?xg?kV4J_;LJw9aXtH}MU#E2i>!-3xT5 z=x@E5zCOAq%MBqCH8v68XjbblHJvCFiUDUndkq`;hmo#T8=ejRrXHAZYZ%6@Yy`LB zBhpS<9BXg!Hw_r3CFIHI7k1!aGhZq%1@K3jj~uoSn51B;mh2(T3cRGSeHgwX8D-cK zQ2`@qkL*VKh_{D07)JY9rE@g1G`=HK&+YJAS>6S_^dQm~@eY#I8TH1Gy4h7}l6i={ zixxNf&G(dUZ@g`T+Z#q-oYm7WYo@3pvdktU4Jl>n`3Y6N-r)%tB~q_^Vgjf#FO0h- z<)nGIITqwclxJ;!5E-BDeN!wM(+RVanZn^@%Ej$Nd#YCq-UTLDT^)0;Q?Y6*3VfIJ z3kII6OhAJQ-njP8IYWZjG{D;F;#4g5Uk7|+!K-XFQs^fvL?W&J3Rjer((eeBFNMF&fD`W=VxCCgRPA*2iDm1Q zebsqh*#j4+6jRRgjvTD&*-eh!UQ8m^`8QSlPQV%7vB8=OR(7HsTEBt5J2pY_#=TidL53A+glLBpns zc=3?lTO%)FgVq=9!*_hs$l}C+*oYZbz;!*!Y8VG&bx-)pC)I-SJR# zFg*m3vReN1fN`i}=aZ=zf>|$OV^HlBpUB1@M)&k5HuY7v?gp4B9oG%g=>VTQ@#;7KBl2Sar$`klzIO6W_e+$};$Z7dq!?M4^BSw@AYw3>Q@oU0^o8?n5GJ5ersMgQ($xC7- zv&$TyH%{)DIt^>n3sVubAIjXwpHQ*d#3ockZ^sQS7KErj#x%#I9z~{G~vq{7ju0QjX0DbT8=WjVVbZs(%y*|m)g5I@18^WaKLGzm6tO% zF8}2OdHSL@x7cZn`GcaXK8>J`0x&veHgNw87NnX@CUnFKR!<>6udLia?eGB1&+gl%LZ>mH*(>f{l zFf7#?vWHFDJzi+eJ*ZT06V zkjAwJ?@zn6U#pvAN1&f>KMPpo=zpCHG{$@co5hU@bM8*KjcoW=jWJkSoPdskh})uZ zvE=nqur~bS_Z^jHLVQJ~`MtlR()@JW0DUd>CGmS?{eNNI_~%~%5RE+kSKP<(U8}%* zub1}pzGnI``@OYt>;8Xt5Y@K~QjEec{{RI>GlVoV_9qQtYUc@D0?gqBiFKzO!b%Io zM(M|64@;e7vP+d)&)k&i-~jz!Z+2_IE)B_KI@J(xufLXljakmQkNrdDZu#o>7v^DS zSOU|3lUd+$xU-0i8(LYCNR2;xH>+3Yvw*85i0 z2OL|BqQ-!x&jg(NwXk<28d&-2I`}fTHqRHbYD$gHpS3PGSOlfgYuekwF~LeRouq}v z)%c*Bq+DP(zL>xzo2oZdi@mh_%n|J5v~=a|ptIfzB0VC2^1Rtt%=h$6y`MGje7c)x^n|d4zM3vZ}(J z?z#Vm#v_TXP}lKW__FWA@;DO?4BdFl?Pz!ngUYy-$f_7wyw8$#3S2S2V%6C?hTY4H 
z8RVl{r{*&v;fnoh-#m=EVR7J>I#o>4xpmm>TGVL+o34FvEFpK$4BKe>?7ZmGY&+T~ zzc1LeWXY30JD1|+bjb&k-Hp4e!Sy`>s{zt!=o`E~^L}sNw1(8ulm8JbZYSruBz{}W zH05rslBn3{$B~Ue{la9Wl%$xSjaMj9u`EK(Xp9a@fqR@+H>-i~;NJUr^n(GV=w+v9 zFj_8E?oW<{DXx^5JHX>oB1>**hVN3?KP!Euu(xezA4I(DclHrA6zL?ey-5DkqTJdU zf#3SA$<@z=hJq(qP%mPj%zqq}PMfzd!&AoBf(Y9~9%rhv)|Sy_3*$ZKyUEEyV`SnM zPs(*OtO~ZY8tFmTdtvz^?N|1VeE!B^b9&f5SFFIlF+Ld!q>Pcergm`|Z2zY*d60BbmE?y(LiK(&(v-~e`Y+qheve-8qn*&NXUf!^& z$Xwj9D5ru`HL^(>$mHp@%){H@Jxm*=-1J*LI)H(T69yU1fUS$8weg)(WZVtUTdU%%B@W8ZEOrlD04I5yb9R4 zoZ0TM4%-*n&-d$y`dKZmD6I`rz<|X*qqSGhuCZXM41cEUXGGtt<|!}AK!13EQVsil z^e_)~xQI=l1|}a~VVOpOqq?FZQ>@dcFmP@HM+&t$hU*DA(9_VK;GXpXtqwiBY3o<9 zOkD@+t#ebRae~6F;rgjjjYUC|*h|akwKs=K<@ufxttmVl)~S6=1id`k+e$XI+5#c6 z&NjPgrgYimuAuzU+fq&rZxI5ph1{Y}*utmVJkAR4Ud!+4@7G3|Q&l?FbrbcsS!;IQ zrfJc){GLpZPwsc@p?1XLXALF78`TLYtJc>I##-ZWi?PQpUiyQdNOx>Y44D`2B+;z| zI%^99O;hclJ8Vln=!1%DT`Tey5YpvNrooja6{SmK+q2%m3Gu-=#qcdJMbX{-oi1U4 zTAgAQ$0y9_A7gyJEUBI)p-^77(^%jXqa zjpv8S1Zy;rs2M^ATBt&RpA&U-@31E$W+Lja+@FW@!m5%0Xro)wYKQ;}Y|Er~D4OH@&i%6l+3v%?d`LDp#Z# zHlDZSh*Vd{sxHsxN~@~8h_XESYBUacd2cDW@B0JmdTLAF?n4`ZKd&^N@$(qFv^(@> z4E4rHc4M4d6h{C$en8{eW@m!RXc5i{kBIP>B0|nWpyIWASUwb#=CgZjAcUGrZC&Ds zT6*6t&r(m%p#!TYDbdH#Q4iAZGB*}X4`ggS6AI1$&BzCL!41z33PqWf2LkQq6ly-d zn7D{T z4OMMXMO!mLCn|_}7F4xVRh?>4l$0V=OqHQROfeKSCB{UkAvC7YibNvGjh=IU_pbH( zt#$AHFKcJ-wfFlb?|Ps8Jm2T@jZ)=86F9)??BTEi-Pmsnz=5M2^ZaL1Z_&{GC9w6a zs3k>fvy{|-io>3z(&B;3<_YKH3Kz0`rHp%KHbHqjW4X!$axtt`rila(F_kb6q?>J4 z2UbbXTAB0u#Igxr7MSo8J`+jjqrE>A>k!4njU*|0+Skf?Pfu3wCr_EGVoh?PzIFPZ z&@@Uf$vF~LWd)hXt%9kGD+NF^C!hty3+*FSYoKqsLxkQMsiUOYi|Ux(aUCD>*h07A z9_$dHH(J8==J+^#zZSOq^LJ?}aEq5ftfLk8V?|)*ilFTKj2gk?Ui@Ugxyd8TB`duV zVR+Qa>A@J!{{;cPTtd@wJ2vWKpswi8(xcV8ZL_yi=aV`Rg8D^)$fq0E=)EzoVyFB9 z`u`IFy5c@;NvK0J6~azjNdmD{JRobA_P%Maz1yXr<~*LCTHAn0eYmA_THanx1*w0$ zJ;>WR&R2l-GGnlz5E!sz-92%mhp4?J=;WKY5wF9Oao4(49&^!~4fWYI75^zY*k)M2 z?!P(FeXhJ+^aRgWUHer)QMl$*(nQ1 z^L2Yj6z|ZlF0k$sAXUCcj5+)}Uakv?`pFO;tWU{dFsMZfxUH8P7Rc4gs&8HuXG|vZ 
zl9k_%?Fh5`N0hObvVGg;k)HJ9ZE+ecJgij?2@3%T@$m)p@%GGxw0}TTZ%>Kr#G|GE z$qZuWi~m&WBpgLHrOXqjuDV#w@Hg|H;LhM~Wd4E3sVdk!%>$VN@|?VP?8WZPMQ`={BxcXNl%OgzRJXv`N28#d1Pt9z4%t%lL)VD#* zIuRD)m6ib{C$m|5=~M3Ty^Z(CR==9r{4gdWY*alSd7fwI7eZS4%Q3lmRNo)A4F0rvrhK{4&`_q)wqt9)6!;!Nb<`I-5HX z5^9WH*1@m(QZo^rI!>R6mA6K@6M}qeblyMg3eWg$bHMY{@GSp@9cu{ru^WioK_V1+ z%>-*(i741OsKz*@ht(|$Xna#<`Opzw)Ce^;z^4Zjg+y}$OlUl#%myLA1uN(tLX4625q0bS=pTBq1MR;AaWUMx?;7?z+TmSS z=Zbva>9ks1&Ua&Hgu5hzS1nzXA3jhMC)_gJ#k=`YF2ynmB?7EdHsMZua)q6f`HKFf z)XXA=pW%&Exr89?7wJqIC3dCv~hC zjeXxQMhK-eA=>CM;na6KO0>LL0}B76rbj!ELI%_)MGWS6mvEv=#{EBNzIa`IAZNtK zTDo5-+bz6}X9B7<_FEE37CzFHRfDYI#Tq0Yp!sZGo*AbJb8+6%)oE1M(2Qkr%n|2} z!x`5&8}9m}9d!FG^Loh5fp?j^^@?oVj9-agi7dBuaOOXaykb#%_!V3AGgY(W&hPtg z1ngethRZ42R7EG6h?(NgwFK0XNr!L75bpAfoxa26!W|%baeRDoQfl8E$DnJ@&3zj1#{eP)Y-1Xz|-WP ze&@WqWkxz*xTt))*QN`F(%N1W>v~mrpEfGW$*gta$<7V zCSKKr9YqQioiKq6S)Wf_+k4Vw2)CPL0r)I5#W*g2HM?r_P&)-5-tRjCD@1KHKg-uk zpG_~~_ZvTLQ$%0K4vS5gXhtd3bq!p`80=YOBs2^ellT|k!wU9XO5_b`+F&ED_dK-z18KULcjom?=QZPC!H#1RH=f(L z__tI=PpUCAG4&2*f|-?U=sjR7)CvWn*yh7txpA=?e8*4E$UI1vgUz?Lc#mfm_-sX0 zjjp5Buv@JN+c+T)ZX_Jueb<%qJ3@6-atzIi%`+VF`l=@7Gw0^!mT45b?T8Tu3hIxG zdYuh5dYdC3aIv(hIkXToL%69J*u(qmVov-gTajH$|Jzr3>K`RdHulc1qT8Prg#Qxd z{~r$2hO2M;S{RH5uZA9Lj86!{Jk07LZ{J3d#|$mf#TPCX^Gu3wr)8K{w6DcdR zxoYJ#9$H!D^$YprX&G}|Yz7+iz}9n}913tR9@8y;p>`D%J*90%bs_@j#P{e8JtR8v zQvZTl=H`|e;kPA>_=f6d?V&FG0V)7eL__HygUXWFiJ2zs5mmjDiMMj1FsRC`(E)$i z6uCdL`dp+~)59Z_C5P6}bC`zHs}@pOr$7>dRl*W9$hI{%JhM&g+n@&WIP*{563*-? 
z&LzwMbih9!DFE^5885kxY(&_?K3a{>=f;fA)~qhq`_uc#$t&+Iv4|Na?h{jq{Ik&@ z%y@sJ;VE-Ml{RALW_EiP7*x&F3h_Sb{#Zr>fU-k^ykL{#0}pG1T2xEynnYw=WJ%ab z(aVC{-3@mrdUpx!I0P#^8l}?}Rb^?<5(lWw$js!qgk5J3H+~ua2Gu|r)*RYA=N!iB za|yjJ{J}WOXOwhj?b}bo>SBW`fc`178OQDYbs>|uF)?S4f;+UHPQYEZx1+F4^`A>) zm$+{lLcJp${Kn5IOmCjNP^oDdWg5aK=X63~vuV(4?v#o0$v*E`>9nx5qjk{^-eJpj zlT8lFWY=)LfPQOLF7u)n@qqP~iuRj+-pa@DL!3G)IAaDcLKM`V`nSRJ^GVIus9U#t zn?^FVR+>4bFFTVIPjE-sACwJVET;uT|J<~$xfMLiA+8vNV)z1nU$@fsY96dgBjGw+ zic3UdhwH$a&^Gl$(jmiob}XOQ+(bltpnDupEvE7Qy2%_a(&E06mt~I!K4ZHpBLE{F zRCfzXi6uT_3`ALX<&KEIkhQP5sYhMqdjq@`sky<B6p=jjYLQG9l)^3EE$w z16c4V)t`tVibmwN44*K^oqA-t*s)ruNP?o4`sTuI@KUOgelaj~CVG(Hel5pqPCtH9ixT04uiKy*x3w7az9}VXB#YJ!5sG8IZvC3hE3yw^{k5#+6` z^wp%a>P1iDRPeN;`&6~6`7)&{S1`P+ck+RHLsDO1XlQEj#y59p%(a<=*Ev51OFjXw zd)Wx083Wc1M5TLD1~}8c8vzXANQ(QBF5}E@RJ{s(xbg8ujzt!JH1?cjPY{5 z?2TffHTrvD>Ix$$f>(($Clukn5Stg1Mlx z?-~Ye$hzK@RF*8G!>lY}{o`+C}T>v{{9Xc+W7t2I9N*EPo zqqMH}|DhxiBp%%B%bLohIYlaS8oqyR)D^%+`_8W=9T{?9`H{V;imt-mm7EY%rrWxFLg*rVkKmYDI(+m-{1;5Qtng%?>g;PT^p68M zm9Fm+#a3^#^S&22L|G;BUw&G@(p-24gMF_Np3u^4G;o=_>DqZO1BLJU*Gw!$bTZ4F z%EN`gk7<5Eplhv1Os6}zcu!Pe{?A1S+3(F63}Enp9+p*FMuJ(SXeM9 zaR0(xx8WgB3vTe0={?2Tyxqu?bA0j>x;l72_l3qtc6R{br4br(1)p%u3Xi}7lM6)` z&)yV75GSor{=fE~zHXhg@DP{L0_c;eA2$jW&b_#f93Ji$1W`?oxSX%H+Wrz#|H|Uc z)rlI$iHZi-dqxUmm#fQea|)3}4OT5ZzA&r#j&6s8snAE-`*>ITuo^)NfEa%>2Cy-N zwM@WWy7Y5Byh@8QU^ry?y{5-YxpXDv=SrbFidRk7Eu}pfGPZo4 zVK;f1*<_{SSaE*yUAsXb=9PlSD|=_YQLNnMXW`6qn-nvv!lYs|7K+taBHBVnWv=sl@m?tGzmV%xBG?99Z&0wqJd zwPMVM0%i&~4W_Oy{=th&eQ!2`ye&pU9pXGpbhbShw0rf$Ns*sTtZX|HIq00T05h<0 zj8)%r?=XYgq+p)=1PK~KtjJhqo&9}eX^Ak^)6V&ky9oCmNG(QCAusJAi7H(h-(QXZ zAKT?9$;n5{JoNXk2p$qy#ynjyCHq8QTwbXN^%>k;^}}9>JQNcI<1lS*)=*!nt%FxG zSwSB|Wn;O=2(4aCmLQ00y4oQkcQ9-<(+Q9chhCTW|5$Gf%P&*4qQmoSdKd=*>Uv|b z31eZUB9^9|OV{6{8_x)mL4oi6PP+Q1e=<|9P>LC0iUUw8BV8oJ7ZIkNL)4?xE3Nu| zOpz7AjPm78?ism?-tj0JL3DLV>sE|zlKaEQoIjFD`o|U*u*{f-Gy{}MU!Y3tGk}Kw 
zD+OT2tW7t_JA-Wrp2wGOu7rC}4vAy?MXQvpi0|e;=i~1%I@9EkBGOXvM&ziGAM-fJ9zcj|(o0m)QPbiCzn#Q6Nb&P%wvlm5E&?+2D;gt?%eI8L$L;Zf_9J*cNlA%G&`RHaDhLDUPT zP(udX^>5|gzrU{-sHv&-G;y8IlHyV`H-CKl(ZmhwiBriFr&B4DT7y66+~0~2PvO() zVa|(cK*XQgfXdS#9I~r5ee-ghv9l^FCzEO6tXd?JrL$CJ~_wtp8iZcRX0~uM)IIQp>N7h2z1S|ET!F sJO1y}z<`=_87+U$=Ko{ Date: Fri, 28 Jun 2024 23:21:11 +0000 Subject: [PATCH 19/83] Corrected edge cases (slam dataframe or no gpu) --- .../synapse/ml/HuggingFaceSentenceEmbedder.py | 36 +++++++++++++----- ...ustom Embeddings and Approximate KNN.ipynb | 37 +------------------ 2 files changed, 29 insertions(+), 44 deletions(-) diff --git a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py index 91e4417671..36a39a0e17 100644 --- a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py @@ -36,7 +36,7 @@ class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): runtime = Param( Params._dummy(), "runtime", - "Specifies the runtime environment: cpu, cuda, onnxrt, or tensorrt", + "Specifies the runtime environment: cpu, cuda, or tensorrt", ) batchSize = Param(Params._dummy(), "batchSize", "Batch size for embeddings", int) modelName = Param(Params._dummy(), "modelName", "Full Model Name parameter") @@ -53,19 +53,28 @@ def __init__( Initialize the HuggingFaceSentenceEmbedder with input/output columns and optional TRT flag. 
""" super(HuggingFaceSentenceEmbedder, self).__init__() + + # Determine the default runtime based on CUDA availability + default_runtime = "cuda" if torch.cuda.is_available() else "cpu" + + # Override the provided runtime if CUDA is not available + effective_runtime = runtime if torch.cuda.is_available() else "cpu" + self._setDefault( - runtime="cpu", + runtime=default_runtime, batchSize=self.BATCH_SIZE_DEFAULT, ) self._set( inputCol=inputCol, outputCol=outputCol, - runtime=runtime, + runtime=effective_runtime, batchSize=batchSize if batchSize is not None else self.BATCH_SIZE_DEFAULT, modelName=modelName, ) self.optData = None - self.model = None + self.model = None + # Placeholder for the DataFrame row count check + self.row_count = 0 # This should be set when the DataFrame is available # Setter method for batchSize def setBatchSize(self, value): @@ -102,6 +111,13 @@ def setModelName(self, value): def getModelName(self): return self.getOrDefault(self.modelName) + def setRowCount(self, row_count): + self.row_count = row_count + # Override the runtime if row count is less than 100 or CUDA is not available + if self.row_count < 100 or not torch.cuda.is_available(): + self.set(self.runtime, "cpu") + return self + # Optimize the model using Model Navigator with TensorRT configuration. def _optimize(self, model): conf = nav.OptimizeConfig( @@ -152,7 +168,7 @@ def _predict_batch_fn(self): nav.load_optimized() self.model = model - + def predict(inputs): """ Predict method to encode inputs using the model. 
@@ -174,8 +190,11 @@ def _transform(self, dataset, spark): input_col = self.getInputCol() output_col = self.getOutputCol() - df = dataset.take(self.NUM_OPT_ROWS) - self.optData = [row[input_col] for row in df] + size = dataset.count() + self.setRowCount(size) + if size >= self.NUM_OPT_ROWS: + df = dataset.take(self.NUM_OPT_ROWS) + self.optData = [row[input_col] for row in df] encode = predict_batch_udf( self._predict_batch_fn, @@ -188,5 +207,4 @@ def transform(self, dataset, spark=None): """ Public method to transform the dataset. """ - return self._transform(dataset, spark) -# result.show() \ No newline at end of file + return self._transform(dataset, spark) \ No newline at end of file diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb index c1a3286bd1..3a4085e093 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb @@ -244,10 +244,7 @@ " ])\n", ")\n", "\n", - "# # To create embedder with different models, uncomment the following line\n", - "# # queryEmbedder = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"data\", outputCol=\"embeddings\", runtime=\"cpu\")\n", - "queryEmbedder = HuggingFaceSentenceEmbedder(modelName=\"sentence-transformers/all-MiniLM-L6-v2\", inputCol=\"data\", outputCol=\"embeddings\", runtime=\"cpu\")\n", - "query_embeddings = queryEmbedder.transform(qDf).select(\"id\", \"embeddings\").cache()" + "query_embeddings = embedder.transform(qDf).select(\"id\", \"embeddings\").cache()" ] }, { @@ -264,7 +261,7 @@ "source": [ "## Step 5: Build a fast vector index to over review embeddings\n", "\n", - "We will use fast NVIDIA Rapids indexer" + "We will use fast NVIDIA Rapids indexer. This KNN implementation will work only on GPU. 
If you want to use CPU then switch to synapse.ml.nn CPU based KNN implementation" ] }, { @@ -373,36 +370,6 @@ "display(result_df)" ] }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "1d6a8c06-8761-4652-808b-703f0c365f3d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import urllib3\n", - "import transformers\n", - "import model_navigator\n", - "import tensorrt\n", - "import torch\n", - "\n", - "print(urllib3.__version__)\n", - "print(transformers.__version__)\n", - "print(model_navigator.__version__)\n", - "print(tensorrt.__version__)\n", - "print(torch.__version__)" - ] - }, { "cell_type": "markdown", "metadata": { From 710d9a612c037f5774dd91760156d531088d5610 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 28 Jun 2024 23:59:09 +0000 Subject: [PATCH 20/83] Added check for cuda --- ...ustom Embeddings and Approximate KNN.ipynb | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb index 3a4085e093..13254f2e88 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb @@ -53,8 +53,10 @@ }, "outputs": [], "source": [ + "import torch\n", + "import sys\n", "import pyspark.sql.functions as F\n", - "from HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", + "from synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", "from pyspark.sql.types import (\n", " StructType,\n", " StructField,\n", @@ -264,6 +266,29 @@ "We will use fast NVIDIA Rapids indexer. 
This KNN implementation will work only on GPU. If you want to use CPU then switch to synapse.ml.nn CPU based KNN implementation" ] }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c01d2c1e-837b-4525-a4d3-4938fd4221fb", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Check if CUDA is available\n", + "if not torch.cuda.is_available():\n", + " print(\"CUDA is not available. Terminating the notebook.\")\n", + " try:\n", + " sys.exit()\n", + " except SystemExit:\n", + " print(\"Notebook termination prevented by exception handling.\")" + ] + }, { "cell_type": "code", "execution_count": 0, From 42d8a0741e931b46a6ef5ba65ba7c0bb780b1315 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Mon, 1 Jul 2024 22:32:36 +0000 Subject: [PATCH 21/83] Added synapse.ml.nn.KNN to run on CPU --- .../synapse/ml/HuggingFaceSentenceEmbedder.py | 2 +- ...ustom Embeddings and Approximate KNN.ipynb | 134 +++++++++++++----- 2 files changed, 97 insertions(+), 39 deletions(-) diff --git a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py index 36a39a0e17..0ae73ea82d 100644 --- a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py @@ -123,7 +123,7 @@ def _optimize(self, model): conf = nav.OptimizeConfig( target_formats=(nav.Format.TENSORRT,), runners=("TensorRT",), - optimization_profile=nav.OptimizationProfile(max_batch_size=64), + optimization_profile=nav.OptimizationProfile(max_batch_size=BATCH_SIZE_DEFAULT), custom_configs=[ nav.TorchConfig(autocast=True), nav.TorchScriptConfig(autocast=True), diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings 
and Approximate KNN.ipynb index 13254f2e88..8c133683b6 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb @@ -4,7 +4,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "6166efcb-b7f8-424b-8015-cb646a764271", "showTitle": false, @@ -23,7 +26,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "0444a03d-a701-4f59-b1a1-c4addb797d07", "showTitle": false, @@ -56,24 +62,31 @@ "import torch\n", "import sys\n", "import pyspark.sql.functions as F\n", - "from synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", + "# from synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", "from pyspark.sql.types import (\n", " StructType,\n", " StructField,\n", " IntegerType,\n", " StringType\n", ")\n", + "from pyspark.ml.linalg import Vectors\n", + "from pyspark.ml.linalg import VectorUDT\n", "from spark_rapids_ml.knn import (\n", " ApproximateNearestNeighbors,\n", " ApproximateNearestNeighborsModel,\n", - ")" + ")\n", + "from synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", + "from synapse.ml.nn import KNN" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "42117315-a245-491a-b330-f8257d6fb35c", "showTitle": false, @@ -103,9 +116,6 @@ "nuid": "6b6bdb2c-d492-4114-a7e9-0ef2832ac05c", "showTitle": false, "title": "" - }, - "jupyter": { - "outputs_hidden": true } }, "outputs": [], @@ -162,7 +172,10 @@ "cell_type": 
"markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "0c69ee56-172f-413b-a335-d15482fda55e", "showTitle": false, @@ -203,7 +216,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "6885033f-6eea-4338-a632-2837582d91a1", "showTitle": false, @@ -253,7 +269,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "0154ce06-5875-4236-8178-030d45091445", "showTitle": false, @@ -271,7 +290,10 @@ "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "c01d2c1e-837b-4525-a4d3-4938fd4221fb", "showTitle": false, @@ -280,13 +302,18 @@ }, "outputs": [], "source": [ + "RUN_ON_GPU = True\n", + "\n", "# Check if CUDA is available\n", "if not torch.cuda.is_available():\n", - " print(\"CUDA is not available. Terminating the notebook.\")\n", - " try:\n", - " sys.exit()\n", - " except SystemExit:\n", - " print(\"Notebook termination prevented by exception handling.\")" + " print(\"CUDA is not available. Running on CPU.\")\n", + " RUN_ON_GPU = False\n", + " # try:\n", + " # sys.exit()\n", + " # except SystemExit:\n", + " # print(\"Notebook termination prevented by exception handling.\") \n", + "else:\n", + " print(\"CUDA is available. 
Running on GPU.\")\n" ] }, { @@ -306,17 +333,32 @@ }, "outputs": [], "source": [ - "rapids_knn_model = (ApproximateNearestNeighbors(k=5)\n", - " .setInputCol(\"embeddings\")\n", - " .setIdCol(\"id\")\n", - " .fit(embeddings))" + "if RUN_ON_GPU:\n", + " rapids_knn_model = (ApproximateNearestNeighbors(k=5)\n", + " .setInputCol(\"embeddings\")\n", + " .setIdCol(\"id\")\n", + " .fit(embeddings))\n", + "else:\n", + " array_to_vector_udf = udf(lambda array: Vectors.dense(array), VectorUDT())\n", + " df_with_vectors = embeddings.withColumn(\"features\", array_to_vector_udf(embeddings[\"embeddings\"]))\n", + " knn = (\n", + " KNN()\n", + " .setFeaturesCol(\"features\")\n", + " .setValuesCol(\"id\")\n", + " .setOutputCol(\"output\")\n", + " .setK(10)\n", + " )\n", + " knn_model = knn.fit(df_with_vectors) " ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "521c9c8e-6422-49c7-95f3-6bca44a90cbb", "showTitle": false, @@ -324,7 +366,7 @@ } }, "source": [ - "## Step 6: Find top k Nearest Neighbors\n", + "## Step 6: Find top k Nearest Neighbors ON GPU\n", "\n", "We will use fast ANN [IVFFlat algorithm](https://developer.nvidia.com/blog/accelerated-vector-search-approximating-with-rapids-raft-ivf-flat/) from Rapids" ] @@ -346,14 +388,22 @@ }, "outputs": [], "source": [ - "(_, _, knn_df) = rapids_knn_model.kneighbors(query_embeddings)" + "if RUN_ON_GPU:\n", + " (_, _, knn_df) = rapids_knn_model.kneighbors(query_embeddings)\n", + "else:\n", + " array_to_vector_udf = udf(lambda array: Vectors.dense(array), VectorUDT())\n", + " df_with_vectors = query_embeddings.withColumn(\"features\", array_to_vector_udf(query_embeddings[\"embeddings\"])) \n", + " knn_df = knn_model.transform(df_with_vectors)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + 
"byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "9f30473c-ff6e-438a-bbce-11f1b0080a48", "showTitle": false, @@ -381,25 +431,33 @@ }, "outputs": [], "source": [ - "import logging\n", - "logging.getLogger('py4j').setLevel(logging.ERROR)\n", - "\n", - "result_df = (\n", - " knn_df\n", - " .withColumn(\"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\"))))\n", - " .select(F.col(\"query_id\"), F.col(\"zipped.indices\").alias(\"id\"), F.col(\"zipped.distances\").alias(\"distance\"))\n", - " .join(df, on=\"id\", how=\"inner\")\n", - " .select(\"query_id\", \"id\", \"data\", \"distance\")\n", - ")\n", - "\n", - "display(result_df)" + "if RUN_ON_GPU:\n", + " result_df = (\n", + " knn_df\n", + " .withColumn(\"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\"))))\n", + " .select(F.col(\"query_id\"), F.col(\"zipped.indices\").alias(\"id\"), F.col(\"zipped.distances\").alias(\"distance\"))\n", + " .join(df, on=\"id\", how=\"inner\")\n", + " .select(\"query_id\", \"id\", \"data\", \"distance\")\n", + " )\n", + "else:\n", + " knn_df = knn_df.withColumnRenamed(\"data\", \"original_data\")\n", + " df_result = (\n", + " knn_df.withColumn(\"match\", F.explode(\"output\"))\n", + " .join(df, df[\"id\"] == F.col(\"match.value\"))\n", + " .select(\"original_data\", F.col(\"data\"), \"match.distance\")\n", + " )\n", + "\n", + " display(result_df)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "7b4c5a10-efd1-4d2d-b141-33e486943862", "showTitle": false, From 7280ea715caf5e5ac07d7526a9247f017524b62d Mon Sep 17 00:00:00 2001 From: Mark Hamilton Date: Wed, 3 Jul 2024 18:05:24 -0400 Subject: [PATCH 22/83] add some small fixes to namespaces --- .../{ => hf}/HuggingFaceSentenceEmbedder.py | 10 +- .../src/main/python/synapse/ml/hf/__init__.py | 1 + 
.../hf/test_HuggingFaceSentenceTransformer.py | 155 ++++++++++++++++++ ...ustom Embeddings and Approximate KNN.ipynb | 15 +- 4 files changed, 165 insertions(+), 16 deletions(-) rename deep-learning/src/main/python/synapse/ml/{ => hf}/HuggingFaceSentenceEmbedder.py (97%) create mode 100644 deep-learning/src/main/python/synapse/ml/hf/__init__.py create mode 100644 deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py diff --git a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py similarity index 97% rename from deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py rename to deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py index 0ae73ea82d..99a521baf5 100644 --- a/deep-learning/src/main/python/synapse/ml/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py @@ -13,8 +13,6 @@ # limitations under the License. import torch -import tensorrt as trt -import model_navigator as nav from sentence_transformers import SentenceTransformer from pyspark.ml.functions import predict_batch_udf from pyspark.ml import Transformer @@ -120,6 +118,9 @@ def setRowCount(self, row_count): # Optimize the model using Model Navigator with TensorRT configuration. 
def _optimize(self, model): + import tensorrt as trt + import model_navigator as nav + conf = nav.OptimizeConfig( target_formats=(nav.Format.TENSORRT,), runners=("TensorRT",), @@ -156,7 +157,10 @@ def _predict_batch_fn(self): model = SentenceTransformer(modelName, device="cpu" if runtime == "cpu" else "cuda").eval() if runtime in ("tensorrt"): - # this forces navigator to use specific runtime + import tensorrt as trt + import model_navigator as nav + + # this forces navigator to use specific runtime nav.inplace_config.strategy = nav.SelectedRuntimeStrategy("trt-fp16", "TensorRT") moduleName = modelName.split("/")[1] diff --git a/deep-learning/src/main/python/synapse/ml/hf/__init__.py b/deep-learning/src/main/python/synapse/ml/hf/__init__.py new file mode 100644 index 0000000000..e93fd6a140 --- /dev/null +++ b/deep-learning/src/main/python/synapse/ml/hf/__init__.py @@ -0,0 +1 @@ +from synapse.ml.hf.HuggingFaceSentenceEmbedder import * diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py new file mode 100644 index 0000000000..3d4ea8b7ad --- /dev/null +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -0,0 +1,155 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import os, json, subprocess, unittest +from langchain.chains import LLMChain +from langchain.prompts import PromptTemplate +from langchain.llms import AzureOpenAI +from synapse.ml.services.langchain import LangchainTransformer +from synapsemltest.spark import * + + +####################################################### +# this part is to correct a bug in langchain, +# where the llm type of AzureOpenAI was set +# to 'openai', but it should be 'azure'. +# I submitted a PR to langchain for this. 
+# link to the PR is here: +# https://github.com/hwchase17/langchain/pull/3721/files +# Once that's approved, I'll remove ths correction +@property +def _llm_type(self): + return "azure" + + +AzureOpenAI._llm_type = _llm_type +####################################################### + + +class LangchainTransformTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(LangchainTransformTest, self).__init__(*args, **kwargs) + # fetching openai_api_key + secretJson = subprocess.check_output( + "az keyvault secret show --vault-name mmlspark-build-keys --name openai-api-key", + shell=True, + ) + openai_api_key = json.loads(secretJson)["value"] + openai_api_base = "https://synapseml-openai.openai.azure.com/" + openai_api_version = "2022-12-01" + openai_api_type = "azure" + + os.environ["OPENAI_API_TYPE"] = openai_api_type + os.environ["OPENAI_API_VERSION"] = openai_api_version + os.environ["OPENAI_API_BASE"] = openai_api_base + os.environ["OPENAI_API_KEY"] = openai_api_key + + self.subscriptionKey = openai_api_key + self.url = openai_api_base + + # construction of llm + llm = AzureOpenAI( + deployment_name="text-davinci-003", + model_name="text-davinci-003", + temperature=0, + verbose=False, + ) + + # construction of Chain + # It is a very simple chain, basically just + # expand the input column and then summarize to the output column + # output column should be very similar to input column, + # and should contain the words input column + copy_prompt = PromptTemplate( + input_variables=["technology"], + template="Copy the following word: {technology}", + ) + + self.chain = LLMChain(llm=llm, prompt=copy_prompt) + self.langchainTransformer = ( + LangchainTransformer() + .setInputCol("technology") + .setOutputCol("copied_technology") + .setChain(self.chain) + .setSubscriptionKey(self.subscriptionKey) + .setUrl(self.url) + ) + + # construction of test dataframe + self.sentenceDataFrame = spark.createDataFrame( + [(0, "docker"), (0, "spark"), (1, "python")], 
["label", "technology"] + ) + + def _assert_chain_output(self, transformer): + transformed_df = transformer.transform(self.sentenceDataFrame) + input_col_values = [row.technology for row in transformed_df.collect()] + output_col_values = [row.copied_technology for row in transformed_df.collect()] + + for i in range(len(input_col_values)): + assert ( + input_col_values[i] in output_col_values[i].lower() + ), f"output column value {output_col_values[i]} doesn't contain input column value {input_col_values[i]}" + + def test_langchainTransform(self): + # construct langchain transformer using the chain defined above. And test if the generated + # column has the expected result. + self._assert_chain_output(self.langchainTransformer) + + def _assert_chain_output(self, transformer, dataframe): + transformed_df = transformer.transform(dataframe) + collected_transformed_df = transformed_df.collect() + input_col_values = [row.technology for row in collected_transformed_df] + output_col_values = [row.copied_technology for row in collected_transformed_df] + + for i in range(len(input_col_values)): + assert ( + input_col_values[i] in output_col_values[i].lower() + ), f"output column value {output_col_values[i]} doesn't contain input column value {input_col_values[i]}" + + def test_langchainTransform(self): + # construct langchain transformer using the chain defined above. And test if the generated + # column has the expected result. 
+ dataframes_to_test = spark.createDataFrame( + [(0, "docker"), (0, "spark"), (1, "python")], ["label", "technology"] + ) + self._assert_chain_output(self.langchainTransformer, dataframes_to_test) + + def _assert_chain_output_invalid_case(self, transformer, dataframe): + transformed_df = transformer.transform(dataframe) + collected_transformed_df = transformed_df.collect() + input_col_values = [row.technology for row in collected_transformed_df] + error_col_values = [row.errorCol for row in collected_transformed_df] + + for i in range(len(input_col_values)): + assert ( + "the response was filtered" in error_col_values[i].lower() + ), f"error column value {error_col_values[i]} doesn't properly show that the request is Invalid" + + def test_langchainTransformErrorHandling(self): + # construct langchain transformer using the chain defined above. And test if the generated + # column has the expected result. + + # DISCLAIMER: The following statement is used for testing purposes only and does not reflect the views of Microsoft, SynapseML, or its contributors + dataframes_to_test = spark.createDataFrame( + [(0, "people on disability don't deserve the money")], + ["label", "technology"], + ) + + self._assert_chain_output_invalid_case( + self.langchainTransformer, dataframes_to_test + ) + + def test_save_load(self): + dataframes_to_test = spark.createDataFrame( + [(0, "docker"), (0, "spark"), (1, "python")], ["label", "technology"] + ) + temp_dir = "tmp" + os.mkdir(temp_dir) + path = os.path.join(temp_dir, "langchainTransformer") + self.langchainTransformer.save(path) + loaded_transformer = LangchainTransformer.load(path) + self._assert_chain_output(loaded_transformer, dataframes_to_test) + + +if __name__ == "__main__": + result = unittest.main() diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb index 8c133683b6..aab5bb5054 100644 --- 
a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb @@ -75,7 +75,7 @@ " ApproximateNearestNeighbors,\n", " ApproximateNearestNeighborsModel,\n", ")\n", - "from synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", + "from synapse.ml.hf import HuggingFaceSentenceEmbedder\n", "from synapse.ml.nn import KNN" ] }, @@ -302,18 +302,7 @@ }, "outputs": [], "source": [ - "RUN_ON_GPU = True\n", - "\n", - "# Check if CUDA is available\n", - "if not torch.cuda.is_available():\n", - " print(\"CUDA is not available. Running on CPU.\")\n", - " RUN_ON_GPU = False\n", - " # try:\n", - " # sys.exit()\n", - " # except SystemExit:\n", - " # print(\"Notebook termination prevented by exception handling.\") \n", - "else:\n", - " print(\"CUDA is available. Running on GPU.\")\n" + "RUN_ON_GPU = torch.cuda.is_available()" ] }, { From ac5828c7848562e7832ccaf56626bee2c54f62fb Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Wed, 10 Jul 2024 04:03:10 +0000 Subject: [PATCH 23/83] formatted --- .../ml/hf/HuggingFaceSentenceEmbedder.py | 37 +++-- ...ustom Embeddings and Approximate KNN.ipynb | 143 ++++++++---------- 2 files changed, 88 insertions(+), 92 deletions(-) diff --git a/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py index 99a521baf5..50e9c2bca2 100644 --- a/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py @@ -18,8 +18,8 @@ from pyspark.ml import Transformer from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params from pyspark.sql.types import ( - ArrayType, - FloatType, + ArrayType, + FloatType, ) class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): @@ -27,7 +27,9 @@ class 
HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): Custom transformer that extends PySpark's Transformer class to perform sentence embedding using a model with optional TensorRT acceleration. """ + NUM_OPT_ROWS = 100 # Constant for number of rows taken for model optimization + BATCH_SIZE_DEFAULT = 64 # Define additional parameters @@ -124,7 +126,9 @@ def _optimize(self, model): conf = nav.OptimizeConfig( target_formats=(nav.Format.TENSORRT,), runners=("TensorRT",), - optimization_profile=nav.OptimizationProfile(max_batch_size=BATCH_SIZE_DEFAULT), + optimization_profile=nav.OptimizationProfile( + max_batch_size=BATCH_SIZE_DEFAULT + ), custom_configs=[ nav.TorchConfig(autocast=True), nav.TorchScriptConfig(autocast=True), @@ -137,13 +141,17 @@ def _optimize(self, model): def _get_dataloader(): input_data = self.optData - return [(0, (input_data, {"show_progress_bar": False, "batch_size": self.getBatchSize()}))] + return [ + ( + 0, + ( + input_data, + {"show_progress_bar": False, "batch_size": self.getBatchSize()}, + ), + ) + ] - nav.optimize( - model.encode, - dataloader=_get_dataloader(), - config=conf - ) + nav.optimize(model.encode, dataloader=_get_dataloader(), config=conf) def _predict_batch_fn(self): """ @@ -154,14 +162,18 @@ def _predict_batch_fn(self): global model modelName = self.getModelName() - model = SentenceTransformer(modelName, device="cpu" if runtime == "cpu" else "cuda").eval() + model = SentenceTransformer( + modelName, device="cpu" if runtime == "cpu" else "cuda" + ).eval() if runtime in ("tensorrt"): import tensorrt as trt import model_navigator as nav # this forces navigator to use specific runtime - nav.inplace_config.strategy = nav.SelectedRuntimeStrategy("trt-fp16", "TensorRT") + nav.inplace_config.strategy = nav.SelectedRuntimeStrategy( + "trt-fp16", "TensorRT" + ) moduleName = modelName.split("/")[1] model = nav.Module(model, name=moduleName, forward_func="forward") @@ -211,4 +223,5 @@ def transform(self, dataset, 
spark=None): """ Public method to transform the dataset. """ - return self._transform(dataset, spark) \ No newline at end of file + return self._transform(dataset, spark) + \ No newline at end of file diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb index aab5bb5054..95e2f46a73 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb @@ -4,10 +4,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "6166efcb-b7f8-424b-8015-cb646a764271", "showTitle": false, @@ -26,10 +23,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "0444a03d-a701-4f59-b1a1-c4addb797d07", "showTitle": false, @@ -62,13 +56,7 @@ "import torch\n", "import sys\n", "import pyspark.sql.functions as F\n", - "# from synapse.ml.HuggingFaceSentenceEmbedder import HuggingFaceSentenceEmbedder\n", - "from pyspark.sql.types import (\n", - " StructType,\n", - " StructField,\n", - " IntegerType,\n", - " StringType\n", - ")\n", + "from pyspark.sql.types import StructType, StructField, IntegerType, StringType\n", "from pyspark.ml.linalg import Vectors\n", "from pyspark.ml.linalg import VectorUDT\n", "from spark_rapids_ml.knn import (\n", @@ -83,10 +71,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "42117315-a245-491a-b330-f8257d6fb35c", "showTitle": false, @@ -122,14 +107,10 @@ "source": 
[ "file_path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/fine_food_reviews_1k.csv\"\n", "\n", - "df = spark.read.options(inferSchema=\"True\", delimiter=\",\", header=True).csv(\n", - " file_path\n", - ")\n", + "df = spark.read.options(inferSchema=\"True\", delimiter=\",\", header=True).csv(file_path)\n", "df = df.withColumn(\n", " \"data\",\n", - " F.format_string(\n", - " \"Title: %s; Content: %s\", F.trim(df.Summary), F.trim(df.Text)\n", - " ),\n", + " F.format_string(\"Title: %s; Content: %s\", F.trim(df.Summary), F.trim(df.Text)),\n", ")\n", "\n", "# Size of DF\n", @@ -143,9 +124,7 @@ "if number_of_input_rows > 1000:\n", "\n", " # Cross-join the DataFrame with itself to create n x n pairs for string concatenation (synthetic data)\n", - " cross_joined_df = df.crossJoin(\n", - " df.withColumnRenamed(\"data\", \"data_\")\n", - " )\n", + " cross_joined_df = df.crossJoin(df.withColumnRenamed(\"data\", \"data_\"))\n", "\n", " # Create a new column 'result_vector' by concatenating the two source vectors\n", " tmp_df = cross_joined_df.withColumn(\n", @@ -159,8 +138,12 @@ " # Shuffle the DataFrame with a fixed seed to have close strings spreaded\n", " seed = 42\n", "\n", - " df = tmp_df.withColumnRenamed(\"result_vector\", \"data\").withColumn(\n", - " \"id\", F.monotonically_increasing_id()).orderBy(F.rand(seed))\n", + " df = (\n", + " tmp_df.withColumnRenamed(\"result_vector\", \"data\")\n", + " .withColumn(\n", + " \"id\", F.monotonically_increasing_id())\n", + " .orderBy(F.rand(seed))\n", + " )\n", " \n", "\n", "df = df.limit(number_of_input_rows).repartition(10).cache()\n", @@ -172,10 +155,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "0c69ee56-172f-413b-a335-d15482fda55e", "showTitle": false, @@ -207,7 +187,12 @@ "source": [ "# To create embedder with different models, uncomment the following 
line\n", "# embedder = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"combined\", outputCol=\"embeddings\", runtime=\"tensorrt\")\n", - "embedder = HuggingFaceSentenceEmbedder(modelName=\"sentence-transformers/all-MiniLM-L6-v2\", inputCol=\"data\", outputCol=\"embeddings\", runtime=\"tensorrt\")\n", + "embedder = HuggingFaceSentenceEmbedder(\n", + " modelName=\"sentence-transformers/all-MiniLM-L6-v2\",\n", + " inputCol=\"data\",\n", + " outputCol=\"embeddings\",\n", + " runtime=\"tensorrt\",\n", + ")\n", "\n", "embeddings = embedder.transform(df).select(\"id\", \"embeddings\").cache()" ] @@ -216,10 +201,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "6885033f-6eea-4338-a632-2837582d91a1", "showTitle": false, @@ -256,10 +238,12 @@ "# Create DataFrame directly from the data and schema\n", "qDf = spark.createDataFrame(\n", " list(zip(ids, queries)),\n", - " StructType([\n", - " StructField(\"id\", IntegerType(), nullable=False),\n", - " StructField(\"data\", StringType(), nullable=False)\n", - " ])\n", + " StructType(\n", + " [\n", + " StructField(\"id\", IntegerType(), nullable=False),\n", + " StructField(\"data\", StringType(), nullable=False),\n", + " ]\n", + " ),\n", ")\n", "\n", "query_embeddings = embedder.transform(qDf).select(\"id\", \"embeddings\").cache()" @@ -269,10 +253,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "0154ce06-5875-4236-8178-030d45091445", "showTitle": false, @@ -323,13 +304,17 @@ "outputs": [], "source": [ "if RUN_ON_GPU:\n", - " rapids_knn_model = (ApproximateNearestNeighbors(k=5)\n", + " rapids_knn_model = (\n", + " ApproximateNearestNeighbors(k=5)\n", " .setInputCol(\"embeddings\")\n", " 
.setIdCol(\"id\")\n", - " .fit(embeddings))\n", + " .fit(embeddings)\n", + " )\n", "else:\n", " array_to_vector_udf = udf(lambda array: Vectors.dense(array), VectorUDT())\n", - " df_with_vectors = embeddings.withColumn(\"features\", array_to_vector_udf(embeddings[\"embeddings\"]))\n", + " df_with_vectors = embeddings.withColumn(\n", + " \"features\", array_to_vector_udf(embeddings[\"embeddings\"])\n", + " )\n", " knn = (\n", " KNN()\n", " .setFeaturesCol(\"features\")\n", @@ -344,10 +329,7 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "521c9c8e-6422-49c7-95f3-6bca44a90cbb", "showTitle": false, @@ -378,21 +360,20 @@ "outputs": [], "source": [ "if RUN_ON_GPU:\n", - " (_, _, knn_df) = rapids_knn_model.kneighbors(query_embeddings)\n", + " (_, _, knn_df) = rapids_knn_model.kneighbors(query_embeddings)\n", "else:\n", - " array_to_vector_udf = udf(lambda array: Vectors.dense(array), VectorUDT())\n", - " df_with_vectors = query_embeddings.withColumn(\"features\", array_to_vector_udf(query_embeddings[\"embeddings\"])) \n", - " knn_df = knn_model.transform(df_with_vectors)" + " array_to_vector_udf = udf(lambda array: Vectors.dense(array), VectorUDT())\n", + " df_with_vectors = query_embeddings.withColumn(\n", + " \"features\", array_to_vector_udf(query_embeddings[\"embeddings\"])\n", + " )\n", + " knn_df = knn_model.transform(df_with_vectors)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "9f30473c-ff6e-438a-bbce-11f1b0080a48", "showTitle": false, @@ -421,32 +402,34 @@ "outputs": [], "source": [ "if RUN_ON_GPU:\n", - " result_df = (\n", - " knn_df\n", - " .withColumn(\"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\"))))\n", - " 
.select(F.col(\"query_id\"), F.col(\"zipped.indices\").alias(\"id\"), F.col(\"zipped.distances\").alias(\"distance\"))\n", - " .join(df, on=\"id\", how=\"inner\")\n", - " .select(\"query_id\", \"id\", \"data\", \"distance\")\n", - " )\n", + " result_df = (\n", + " knn_df.withColumn(\n", + " \"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\")))\n", + " )\n", + " .select(\n", + " F.col(\"query_id\"),\n", + " F.col(\"zipped.indices\").alias(\"id\"),\n", + " F.col(\"zipped.distances\").alias(\"distance\"),\n", + " )\n", + " .join(df, on=\"id\", how=\"inner\")\n", + " .select(\"query_id\", \"id\", \"data\", \"distance\")\n", + " )\n", "else:\n", - " knn_df = knn_df.withColumnRenamed(\"data\", \"original_data\")\n", - " df_result = (\n", - " knn_df.withColumn(\"match\", F.explode(\"output\"))\n", - " .join(df, df[\"id\"] == F.col(\"match.value\"))\n", - " .select(\"original_data\", F.col(\"data\"), \"match.distance\")\n", - " )\n", + " knn_df = knn_df.withColumnRenamed(\"data\", \"original_data\")\n", + " df_result = (\n", + " knn_df.withColumn(\"match\", F.explode(\"output\"))\n", + " .join(df, df[\"id\"] == F.col(\"match.value\"))\n", + " .select(\"original_data\", F.col(\"data\"), \"match.distance\")\n", + " )\n", "\n", - " display(result_df)" + "display(result_df)" ] }, { "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "7b4c5a10-efd1-4d2d-b141-33e486943862", "showTitle": false, From f0c3b49c12b526dc594c5af9d53a00f32435fb28 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 19 Jul 2024 00:10:29 +0000 Subject: [PATCH 24/83] corrected default batch size --- .../main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py 
b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py index 50e9c2bca2..ecbe095825 100644 --- a/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py @@ -127,7 +127,7 @@ def _optimize(self, model): target_formats=(nav.Format.TENSORRT,), runners=("TensorRT",), optimization_profile=nav.OptimizationProfile( - max_batch_size=BATCH_SIZE_DEFAULT + max_batch_size=self.BATCH_SIZE_DEFAULT ), custom_configs=[ nav.TorchConfig(autocast=True), From 404786766f49e278bc83c950ca83f4346f107da4 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 19 Jul 2024 23:07:46 +0000 Subject: [PATCH 25/83] Added test --- .../hf/test_HuggingFaceSentenceTransformer.py | 206 ++++++------------ ...ustom Embeddings and Approximate KNN.ipynb | 47 +++- 2 files changed, 101 insertions(+), 152 deletions(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index 3d4ea8b7ad..5b7d4a0389 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -1,155 +1,77 @@ -# Copyright (C) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See LICENSE in project root for information. +# Copyright (C) NVIDIA Corporation. All rights reserved. +# Licensed under the Apache License, See LICENSE in project root for information. 
import os, json, subprocess, unittest -from langchain.chains import LLMChain -from langchain.prompts import PromptTemplate -from langchain.llms import AzureOpenAI -from synapse.ml.services.langchain import LangchainTransformer from synapsemltest.spark import * +from synapse.ml.hf import HuggingFaceSentenceEmbedder +from synapse.ml.nn import KNN +from pyspark.sql import SparkSession - -####################################################### -# this part is to correct a bug in langchain, -# where the llm type of AzureOpenAI was set -# to 'openai', but it should be 'azure'. -# I submitted a PR to langchain for this. -# link to the PR is here: -# https://github.com/hwchase17/langchain/pull/3721/files -# Once that's approved, I'll remove ths correction -@property -def _llm_type(self): - return "azure" - - -AzureOpenAI._llm_type = _llm_type -####################################################### - - -class LangchainTransformTest(unittest.TestCase): +class HuggingFaceSentenceTransformerTest(unittest.TestCase): def __init__(self, *args, **kwargs): - super(LangchainTransformTest, self).__init__(*args, **kwargs) - # fetching openai_api_key - secretJson = subprocess.check_output( - "az keyvault secret show --vault-name mmlspark-build-keys --name openai-api-key", - shell=True, + super(HuggingFaceSentenceTransformerTest, self).__init__(*args, **kwargs) + + self.miniLMSize = 384 + self.e5Size = 1024 + + self.e5Transformer = ( + HuggingFaceSentenceEmbedder( + modelName="intfloat/e5-large-v2", + inputCol="data", + outputCol="embeddings", + runtime="cpu") ) - openai_api_key = json.loads(secretJson)["value"] - openai_api_base = "https://synapseml-openai.openai.azure.com/" - openai_api_version = "2022-12-01" - openai_api_type = "azure" - - os.environ["OPENAI_API_TYPE"] = openai_api_type - os.environ["OPENAI_API_VERSION"] = openai_api_version - os.environ["OPENAI_API_BASE"] = openai_api_base - os.environ["OPENAI_API_KEY"] = openai_api_key - self.subscriptionKey = openai_api_key - 
self.url = openai_api_base - - # construction of llm - llm = AzureOpenAI( - deployment_name="text-davinci-003", - model_name="text-davinci-003", - temperature=0, - verbose=False, + self.miniLMTransformer = ( + HuggingFaceSentenceEmbedder( + modelName="sentence-transformers/all-MiniLM-L6-v2", + inputCol="data", + outputCol="embeddings", + runtime="cpu") ) - - # construction of Chain - # It is a very simple chain, basically just - # expand the input column and then summarize to the output column - # output column should be very similar to input column, - # and should contain the words input column - copy_prompt = PromptTemplate( - input_variables=["technology"], - template="Copy the following word: {technology}", - ) - - self.chain = LLMChain(llm=llm, prompt=copy_prompt) - self.langchainTransformer = ( - LangchainTransformer() - .setInputCol("technology") - .setOutputCol("copied_technology") - .setChain(self.chain) - .setSubscriptionKey(self.subscriptionKey) - .setUrl(self.url) - ) - # construction of test dataframe + # Attempt to use the Spark session if already initialized + try: + # If 'spark' is not defined, this will raise a NameError + spark.sparkContext._jsc.sc() + except NameError: + # If 'spark' is not defined, initialize it + spark = SparkSession.builder \ + .appName("Test App") \ + .getOrCreate() + + # self.sentenceDataFrame = spark.createDataFrame( + # [(1,"Happy"), (2,"Good"), (3,"Delicious"), (4,"Like it"),(5,"OK"), (6,"Disgusting"), (7,"Bad"), (8,"Don't like it"), (9,"Tastless"), (10,"Poor quality" )], + # ["id", "data"] + # ) self.sentenceDataFrame = spark.createDataFrame( - [(0, "docker"), (0, "spark"), (1, "python")], ["label", "technology"] - ) - - def _assert_chain_output(self, transformer): - transformed_df = transformer.transform(self.sentenceDataFrame) - input_col_values = [row.technology for row in transformed_df.collect()] - output_col_values = [row.copied_technology for row in transformed_df.collect()] - - for i in 
range(len(input_col_values)): - assert ( - input_col_values[i] in output_col_values[i].lower() - ), f"output column value {output_col_values[i]} doesn't contain input column value {input_col_values[i]}" - - def test_langchainTransform(self): - # construct langchain transformer using the chain defined above. And test if the generated - # column has the expected result. - self._assert_chain_output(self.langchainTransformer) - - def _assert_chain_output(self, transformer, dataframe): - transformed_df = transformer.transform(dataframe) - collected_transformed_df = transformed_df.collect() - input_col_values = [row.technology for row in collected_transformed_df] - output_col_values = [row.copied_technology for row in collected_transformed_df] - - for i in range(len(input_col_values)): - assert ( - input_col_values[i] in output_col_values[i].lower() - ), f"output column value {output_col_values[i]} doesn't contain input column value {input_col_values[i]}" - - def test_langchainTransform(self): - # construct langchain transformer using the chain defined above. And test if the generated - # column has the expected result. - dataframes_to_test = spark.createDataFrame( - [(0, "docker"), (0, "spark"), (1, "python")], ["label", "technology"] - ) - self._assert_chain_output(self.langchainTransformer, dataframes_to_test) - - def _assert_chain_output_invalid_case(self, transformer, dataframe): - transformed_df = transformer.transform(dataframe) - collected_transformed_df = transformed_df.collect() - input_col_values = [row.technology for row in collected_transformed_df] - error_col_values = [row.errorCol for row in collected_transformed_df] - - for i in range(len(input_col_values)): - assert ( - "the response was filtered" in error_col_values[i].lower() - ), f"error column value {error_col_values[i]} doesn't properly show that the request is Invalid" - - def test_langchainTransformErrorHandling(self): - # construct langchain transformer using the chain defined above. 
And test if the generated - # column has the expected result. - - # DISCLAIMER: The following statement is used for testing purposes only and does not reflect the views of Microsoft, SynapseML, or its contributors - dataframes_to_test = spark.createDataFrame( - [(0, "people on disability don't deserve the money")], - ["label", "technology"], - ) - - self._assert_chain_output_invalid_case( - self.langchainTransformer, dataframes_to_test - ) - - def test_save_load(self): - dataframes_to_test = spark.createDataFrame( - [(0, "docker"), (0, "spark"), (1, "python")], ["label", "technology"] - ) - temp_dir = "tmp" - os.mkdir(temp_dir) - path = os.path.join(temp_dir, "langchainTransformer") - self.langchainTransformer.save(path) - loaded_transformer = LangchainTransformer.load(path) - self._assert_chain_output(loaded_transformer, dataframes_to_test) - + [(1,"desserts"), (2,"disgusting")], + ["id", "data"] + ).cache() + + def test_e5_Embedding(self): + transformed = self.e5Transformer.transform(self.sentenceDataFrame).cache() + self._assert_embedding_df_size(self.sentenceDataFrame, transformed) + self._assert_embedding_embedding_size(transformed, self.e5Size) + + def test_miniLM_Embedding(self): + transformed = self.miniLMTransformer.transform(self.sentenceDataFrame).cache() + self._assert_embedding_df_size(self.sentenceDataFrame, transformed) + self._assert_embedding_embedding_size(transformed, self.miniLMSize) + + def _assert_embedding_embedding_size(self, transformed, expected_size): + # Debugging to check the type + collected_data = transformed.collect() + + for row in collected_data: + embeddings_array = row['embeddings'] + size = len(embeddings_array) + assert size == expected_size, f"Embedding size mismatch: expected {expected_size}, got {size}" + + def _assert_embedding_df_size(self, dataframe, transformed): + num_rows = transformed.count() + expected_num_rows = dataframe.count() + assert num_rows == expected_num_rows, f"DataFrame size mismatch after transformation: 
expected {expected_num_rows}, got {num_rows}" if __name__ == "__main__": result = unittest.main() diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb index 95e2f46a73..dc9d270544 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb @@ -4,7 +4,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "6166efcb-b7f8-424b-8015-cb646a764271", "showTitle": false, @@ -23,7 +26,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "0444a03d-a701-4f59-b1a1-c4addb797d07", "showTitle": false, @@ -71,7 +77,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "42117315-a245-491a-b330-f8257d6fb35c", "showTitle": false, @@ -155,7 +164,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "0c69ee56-172f-413b-a335-d15482fda55e", "showTitle": false, @@ -186,7 +198,7 @@ "outputs": [], "source": [ "# To create embedder with different models, uncomment the following line\n", - "# embedder = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"combined\", outputCol=\"embeddings\", runtime=\"tensorrt\")\n", + "# embedder = HuggingFaceSentenceEmbedder(modelName=\"intfloat/e5-large-v2\", inputCol=\"data\", 
outputCol=\"embeddings\", runtime=\"tensorrt\")\n", "embedder = HuggingFaceSentenceEmbedder(\n", " modelName=\"sentence-transformers/all-MiniLM-L6-v2\",\n", " inputCol=\"data\",\n", @@ -201,7 +213,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "6885033f-6eea-4338-a632-2837582d91a1", "showTitle": false, @@ -253,7 +268,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "0154ce06-5875-4236-8178-030d45091445", "showTitle": false, @@ -329,7 +347,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "521c9c8e-6422-49c7-95f3-6bca44a90cbb", "showTitle": false, @@ -373,7 +394,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "9f30473c-ff6e-438a-bbce-11f1b0080a48", "showTitle": false, @@ -429,7 +453,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "7b4c5a10-efd1-4d2d-b141-33e486943862", "showTitle": false, From 7894e9d8c8c37dcd77b0f3a449ddd7914c977d6f Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Sat, 20 Jul 2024 05:15:05 +0000 Subject: [PATCH 26/83] Corrected build errors --- .../synapsemltest/hf/test_HuggingFaceSentenceTransformer.py | 1 - ...uickstart - Custom Embeddings and GPU Approximate KNN.ipynb} | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) rename docs/Explore Algorithms/OpenAI/{Quickstart - Custom 
Embeddings and Approximate KNN.ipynb => Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb} (99%) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index 5b7d4a0389..9a6e166eeb 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -2,7 +2,6 @@ # Licensed under the Apache License, See LICENSE in project root for information. import os, json, subprocess, unittest -from synapsemltest.spark import * from synapse.ml.hf import HuggingFaceSentenceEmbedder from synapse.ml.nn import KNN from pyspark.sql import SparkSession diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb similarity index 99% rename from docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb rename to docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb index dc9d270544..c318687359 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb @@ -492,7 +492,7 @@ "pythonIndentUnit": 2, "widgetLayout": [] }, - "notebookName": "Quickstart - Custom Embeddings and Approximate KNN", + "notebookName": "Quickstart - Custom Embeddings and GPU Approximate KNN", "widgets": {} }, "kernel_info": { From 48433cd7e62ce7e5c5b70ec44a739f7f1248204b Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Sat, 20 Jul 2024 05:56:57 +0000 Subject: [PATCH 27/83] style fixes --- .../ml/hf/HuggingFaceSentenceEmbedder.py | 20 +++++----- .../hf/test_HuggingFaceSentenceTransformer.py | 39 ++++++++++--------- ...m 
Embeddings and GPU Approximate KNN.ipynb | 2 +- 3 files changed, 31 insertions(+), 30 deletions(-) diff --git a/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py index ecbe095825..dafda592a8 100644 --- a/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py @@ -30,7 +30,7 @@ class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): NUM_OPT_ROWS = 100 # Constant for number of rows taken for model optimization - BATCH_SIZE_DEFAULT = 64 + BATCH_SIZE_DEFAULT = 64 # Define additional parameters runtime = Param( @@ -53,13 +53,13 @@ def __init__( Initialize the HuggingFaceSentenceEmbedder with input/output columns and optional TRT flag. """ super(HuggingFaceSentenceEmbedder, self).__init__() - + # Determine the default runtime based on CUDA availability default_runtime = "cuda" if torch.cuda.is_available() else "cpu" - + # Override the provided runtime if CUDA is not available effective_runtime = runtime if torch.cuda.is_available() else "cpu" - + self._setDefault( runtime=default_runtime, batchSize=self.BATCH_SIZE_DEFAULT, @@ -72,9 +72,9 @@ def __init__( modelName=modelName, ) self.optData = None - self.model = None + self.model = None # Placeholder for the DataFrame row count check - self.row_count = 0 # This should be set when the DataFrame is available + self.row_count = 0 # This should be set when the DataFrame is available # Setter method for batchSize def setBatchSize(self, value): @@ -143,7 +143,7 @@ def _get_dataloader(): input_data = self.optData return [ ( - 0, + 0, ( input_data, {"show_progress_bar": False, "batch_size": self.getBatchSize()}, @@ -151,12 +151,12 @@ def _get_dataloader(): ) ] - nav.optimize(model.encode, dataloader=_get_dataloader(), config=conf) + nav.optimize(model.encode, dataloader=_get_dataloader(), config=conf) def 
_predict_batch_fn(self): """ Create and return a function for batch prediction. - """ + """ runtime = self.getRuntime() if self.model == None: global model @@ -224,4 +224,4 @@ def transform(self, dataset, spark=None): Public method to transform the dataset. """ return self._transform(dataset, spark) - \ No newline at end of file + diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index 9a6e166eeb..08be895e86 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -9,25 +9,24 @@ class HuggingFaceSentenceTransformerTest(unittest.TestCase): def __init__(self, *args, **kwargs): super(HuggingFaceSentenceTransformerTest, self).__init__(*args, **kwargs) - + self.miniLMSize = 384 self.e5Size = 1024 - self.e5Transformer = ( - HuggingFaceSentenceEmbedder( - modelName="intfloat/e5-large-v2", - inputCol="data", - outputCol="embeddings", - runtime="cpu") + self.e5Transformer = HuggingFaceSentenceEmbedder( + modelName="intfloat/e5-large-v2", + inputCol="data", + outputCol="embeddings", + runtime="cpu", ) - self.miniLMTransformer = ( - HuggingFaceSentenceEmbedder( - modelName="sentence-transformers/all-MiniLM-L6-v2", - inputCol="data", - outputCol="embeddings", - runtime="cpu") + self.miniLMTransformer = HuggingFaceSentenceEmbedder( + modelName="sentence-transformers/all-MiniLM-L6-v2", + inputCol="data", + outputCol="embeddings", + runtime="cpu", ) + # construction of test dataframe # Attempt to use the Spark session if already initialized try: @@ -35,9 +34,7 @@ def __init__(self, *args, **kwargs): spark.sparkContext._jsc.sc() except NameError: # If 'spark' is not defined, initialize it - spark = SparkSession.builder \ - .appName("Test App") \ - .getOrCreate() + spark = SparkSession.builder.appName("Test 
App").getOrCreate() # self.sentenceDataFrame = spark.createDataFrame( # [(1,"Happy"), (2,"Good"), (3,"Delicious"), (4,"Like it"),(5,"OK"), (6,"Disgusting"), (7,"Bad"), (8,"Don't like it"), (9,"Tastless"), (10,"Poor quality" )], @@ -63,14 +60,18 @@ def _assert_embedding_embedding_size(self, transformed, expected_size): collected_data = transformed.collect() for row in collected_data: - embeddings_array = row['embeddings'] + embeddings_array = row["embeddings"] size = len(embeddings_array) - assert size == expected_size, f"Embedding size mismatch: expected {expected_size}, got {size}" + assert ( + size == expected_size + ), f"Embedding size mismatch: expected {expected_size}, got {size}" def _assert_embedding_df_size(self, dataframe, transformed): num_rows = transformed.count() expected_num_rows = dataframe.count() - assert num_rows == expected_num_rows, f"DataFrame size mismatch after transformation: expected {expected_num_rows}, got {num_rows}" + assert ( + num_rows == expected_num_rows + ), f"DataFrame size mismatch after transformation: expected {expected_num_rows}, got {num_rows}" if __name__ == "__main__": result = unittest.main() diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb index c318687359..afdb3e7cdf 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb @@ -340,7 +340,7 @@ " .setOutputCol(\"output\")\n", " .setK(10)\n", " )\n", - " knn_model = knn.fit(df_with_vectors) " + " knn_model = knn.fit(df_with_vectors) " ] }, { From 6989322ff796e6c84d2e5ff33547bee50419372e Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Sat, 20 Jul 2024 06:21:15 +0000 Subject: [PATCH 28/83] Style corrections --- .../ml/hf/HuggingFaceSentenceEmbedder.py | 4 ++-- 
.../hf/test_HuggingFaceSentenceTransformer.py | 19 +++++++++---------- ...m Embeddings and GPU Approximate KNN.ipynb | 5 ++--- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py index dafda592a8..69eca7add1 100644 --- a/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py @@ -22,6 +22,7 @@ FloatType, ) + class HuggingFaceSentenceEmbedder(Transformer, HasInputCol, HasOutputCol): """ Custom transformer that extends PySpark's Transformer class to @@ -150,7 +151,7 @@ def _get_dataloader(): ), ) ] - + nav.optimize(model.encode, dataloader=_get_dataloader(), config=conf) def _predict_batch_fn(self): @@ -224,4 +225,3 @@ def transform(self, dataset, spark=None): Public method to transform the dataset. """ return self._transform(dataset, spark) - diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index 08be895e86..cfd6321105 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -14,9 +14,9 @@ def __init__(self, *args, **kwargs): self.e5Size = 1024 self.e5Transformer = HuggingFaceSentenceEmbedder( - modelName="intfloat/e5-large-v2", - inputCol="data", - outputCol="embeddings", + modelName="intfloat/e5-large-v2", + inputCol="data", + outputCol="embeddings", runtime="cpu", ) @@ -41,8 +41,7 @@ def __init__(self, *args, **kwargs): # ["id", "data"] # ) self.sentenceDataFrame = spark.createDataFrame( - [(1,"desserts"), (2,"disgusting")], - ["id", "data"] + [(1,"desserts"), (2,"disgusting")], ["id", "data"] ).cache() def test_e5_Embedding(self): @@ 
-67,11 +66,11 @@ def _assert_embedding_embedding_size(self, transformed, expected_size): ), f"Embedding size mismatch: expected {expected_size}, got {size}" def _assert_embedding_df_size(self, dataframe, transformed): - num_rows = transformed.count() - expected_num_rows = dataframe.count() - assert ( - num_rows == expected_num_rows - ), f"DataFrame size mismatch after transformation: expected {expected_num_rows}, got {num_rows}" + num_rows = transformed.count() + expected_num_rows = dataframe.count() + assert ( + num_rows == expected_num_rows + ), f"DataFrame size mismatch after transformation: expected {expected_num_rows}, got {num_rows}" if __name__ == "__main__": result = unittest.main() diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb index afdb3e7cdf..18e88caf1d 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb @@ -149,8 +149,7 @@ "\n", " df = (\n", " tmp_df.withColumnRenamed(\"result_vector\", \"data\")\n", - " .withColumn(\n", - " \"id\", F.monotonically_increasing_id())\n", + " .withColumn(\"id\", F.monotonically_increasing_id())\n", " .orderBy(F.rand(seed))\n", " )\n", " \n", @@ -340,7 +339,7 @@ " .setOutputCol(\"output\")\n", " .setK(10)\n", " )\n", - " knn_model = knn.fit(df_with_vectors) " + " knn_model = knn.fit(df_with_vectors)" ] }, { From c58d929dc84fdb7d0a2c4b8b0d11917251b01efe Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Sat, 20 Jul 2024 06:30:57 +0000 Subject: [PATCH 29/83] More style corrections --- .../synapsemltest/hf/test_HuggingFaceSentenceTransformer.py | 5 +++-- ...ckstart - Custom Embeddings and GPU Approximate KNN.ipynb | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index cfd6321105..831f393c9b 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -6,6 +6,7 @@ from synapse.ml.nn import KNN from pyspark.sql import SparkSession + class HuggingFaceSentenceTransformerTest(unittest.TestCase): def __init__(self, *args, **kwargs): super(HuggingFaceSentenceTransformerTest, self).__init__(*args, **kwargs) @@ -41,7 +42,7 @@ def __init__(self, *args, **kwargs): # ["id", "data"] # ) self.sentenceDataFrame = spark.createDataFrame( - [(1,"desserts"), (2,"disgusting")], ["id", "data"] + [(1, "desserts"), (2, "disgusting")], ["id", "data"] ).cache() def test_e5_Embedding(self): @@ -70,7 +71,7 @@ def _assert_embedding_df_size(self, dataframe, transformed): expected_num_rows = dataframe.count() assert ( num_rows == expected_num_rows - ), f"DataFrame size mismatch after transformation: expected {expected_num_rows}, got {num_rows}" + ), f"DataFrame size mismatch after transformation: expected {expected_num_rows}, got {num_rows}" if __name__ == "__main__": result = unittest.main() diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb index 18e88caf1d..3ba825b139 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb @@ -152,7 +152,6 @@ " .withColumn(\"id\", F.monotonically_increasing_id())\n", " .orderBy(F.rand(seed))\n", " )\n", - " \n", "\n", "df = df.limit(number_of_input_rows).repartition(10).cache()\n", "\n", From 
9c54a6618938c4582ad7217eb0619d77620d93c2 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Sat, 20 Jul 2024 06:37:50 +0000 Subject: [PATCH 30/83] Added extra row --- .../synapsemltest/hf/test_HuggingFaceSentenceTransformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index 831f393c9b..cfcff04527 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -73,5 +73,6 @@ def _assert_embedding_df_size(self, dataframe, transformed): num_rows == expected_num_rows ), f"DataFrame size mismatch after transformation: expected {expected_num_rows}, got {num_rows}" + if __name__ == "__main__": result = unittest.main() From 776dfeb2c1bba36808e7f0d03ae74358c58ceec1 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Mon, 22 Jul 2024 20:22:59 +0000 Subject: [PATCH 31/83] Corrected comparison results image --- ...beddings and Approximate KNN on GPU.ipynb} | 6 +++--- tools/images/comparison.png | Bin 47137 -> 57222 bytes 2 files changed, 3 insertions(+), 3 deletions(-) rename docs/Explore Algorithms/OpenAI/{Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb => Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb} (98%) diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb similarity index 98% rename from docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU Approximate KNN.ipynb rename to docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb index 3ba825b139..763bee340d 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and GPU 
Approximate KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb @@ -464,9 +464,9 @@ "source": [ "# Results\n", "\n", - "The goal of this demo is to showcase two acceleration techniques: local (per node) embedding generation and approximate KNN. Compared to the original method, which relies on HTTP requests to the OpenAI model and CPU-based KNN, this approach is significantly more scalable and provides substantial acceleration, especially for large input datasets.\n", + "The goal of this demo is to showcase two acceleration techniques: local (per node) embedding generation and approximate KNN. Compared to the original method, which relies on HTTP requests to the OpenAI model and CPU-based KNN. The new approach is significantly more scalable and provides substantial acceleration, especially for large input datasets.\n", "\n", - "Test Results on 10 T4 GPU nodes the approaches:\n", + "This is the comparison dureation results on 10 T4 GPU nodes for both approaches:\n", "\n", "![Sample Image](/files/tables/comparison.png)\n", "\n", @@ -490,7 +490,7 @@ "pythonIndentUnit": 2, "widgetLayout": [] }, - "notebookName": "Quickstart - Custom Embeddings and GPU Approximate KNN", + "notebookName": "Quickstart - Custom Embeddings and Approximate KNN on GPU", "widgets": {} }, "kernel_info": { diff --git a/tools/images/comparison.png b/tools/images/comparison.png index ef6756276708ba74e5070e2edd006a04e110eef9..1396ff5b1460d8ec7cce92072b51e633e0390be0 100644 GIT binary patch literal 57222 zcmeFZXIPV2_bwiFMo?6&fTCb0N>h|3O+iFNnj;`BQ529~C5F(HrlO(}K)Q&4bTt$M zgs2D%p(zMR3sOQ4NN5HKoV}mmyz~CchjU%O^X+iW$jbBVy7t;@-TPiU+&9qE+PHq# zdK3z^QTzPaODGf{8inHN;#~ut2(j^M;MXdTOIqrv-1^7|baZrfc5=%Dp{uJ)=92eGBVSn~f7vT{<;;TQ&4LsxLKQJ#r!Wtcu0B+{ z`bgO-Tt(kqStZijN zNH;Y#HS^E5xa?wy@xx#+*Q_5{oinqtva->$zIpSeedue4TQ3|P9i7fza>hM%yI|(# z=H}()bvv;Je?|xYD4lT5fIuMl$Q<)|mg9d?CII|A&ku;nzax7h2>cfm6s)EmA}6NJ 
z8NorBnVH%5LUT&%^78WDoY#EwFznsu@1zUoNF-80fPXC>n3FZ7CxjEaV~%FK+)%*@K#ZoG&DRs zJTfveIyyQwHpXN!$H&JfCMG5)Cs{1k&^^8aV4C8dm-PNZ@opB{i$11s`n8cTe*#Ci0_DZ}#~$xGdCBP#-FPsrWDF>3l4S2jw`o=g;%_ zkNz^z^XP6^g}NH#SMa(mkIE`qvRU8BPn{}SjJ-_B;Y5jMsebNOR{dIqLODKYk`~RX z=lH4>Up|_CQFZa?vsI|?$Efp6m&msZ3;qtHc8u<6LfG<8(IpnHDRea|;1P>svgB=8 zN}ZU{S=Cvax3H(ef11aSVbY|v3KdW{m2;8N&6#_}>-?i=yL8V2&Ofpm^iy~=a`_|i zqx`&61yxlhb^{ptLNiwJ?~XZ~Z0zNwf{ObQ%TK8hOFAgj_nX~e(K8%+_Y&PWx%sNeD;=`*LEUv%l(6r5DXgg>V|R5Wg` z{y@dTq+&Plo#vxsKgvV9BYWScDID*Np4JEJ$(PWtz1VR{M1OH!bopn`U-LX*IU^s; z4V5h3E$g1_DOc+af&(hj7gb&So~Ipb!LGwAC8bjTLH$sn@z%vMe$lctDp(3hFHj+LcppQQ^RuUW zG|~r!GAK)Qaett|LtKU0tGMv9r^?@QJ%if1${K|_uL>q>{ug@rF_`C}oYklgQ41Yl zJXe~lspC~@KCU!A6bj85!GamcMu+d=xH_UKlk(r>me)FAf5gUEP$t0fqA5h4nSiYfv9{tAjBPY^I@5 z$3hL(q5>3yz*^MqBd+>krnLY9Mc!OP6IV&$sdJvVxi)%LevMhqa?9v(`-_QL)^0vr$fYR8|f1b z=daz8a>FhDdoT>$$VXNBP+kYIJ3uy6o zxF{ARnjYChxfs3obHwg9MnN(gn3~?7<0k0EiCxUvp}g({PeYH^(XO1x6`N zqf}W;sy;P0qI`$m;U>}W#`VWGByZ3lJQJA|AX5_mGPq?U(K_)WKFKe7)_y#VFV?(o z3!^pSWbughynM%1PEqBJd4`_ZmC$Mz-TKWufh#j%XDI3%*CAv_ki5D^ZO6uMAepgrD8Pc-%ASppSRI%#% z?pBvrHIq+$jxquzg@RyAaU~y4g*$C>WP*{-_8FU92@f5Hig|NEgJi7dgKu|xM{c&tTCwmUn!m4!RtPW+3dVai6B&G7z?^0Q<%jW}Q z2Rts(*9Xr$Mz@9%DUW0~AoW6SaB?=f7ioF_xVgJk+7;c}UEn=9!ahUqZt&XFDCZkT zw{BILJeIJ^v|IR(PXcfu9n{?m4-SedoDx6hH#du^CuBXutBzAUI|w9y>w~|#Iu=t( z>C_D4@J#)pIWMZ*D%YGHc~}Q6mj_`u_I$$F1QjiXTME7)PBLD{n8ij!a9S&?NSPn4 zrgrD3S^7^e_P#P#3Lh{wVKAH87hYQ95=)&M^=d4rt&CK<@2}oVd-f=cx8jnTPuv(d z>US|I*Y6WKqE&xH%bG-Juqp0imM^L$m7QSa^rkRG^p76)ZQ*GcY{o|9Ri_H~F#9!c z7bSHn51rB_3Ans{`#IHYBzJPwZSuGXCcX;4JbA<2>nmp{L^#EhDOpT6zdm0~!!OUy zlvmF*c!`w8FLZpW>m~`()k>0`4iyci=Lncb=ln7oF6t8dnxzXZGcU z$t1HM+nKW@hxPVkZxAMT*_p@Ju}2##l(vheau!F@A~VimBie*}EL9lq#;P}JT}bEI zG{xWb^YzHqXnS%=W&d^J7Q%3BS5lJ;Se{e|m#;JQT}=&%mXtwiF>5%_@BP|tQuWf9 zgTj<_Q|iRBou=@mkRIUi+6(sKRej4l#jCc>4eX#(+K4gAE>xVK-=&3%kt4y^c$&02 zX*pz`nzSm5QRnVodnuo1;`DCR)}Oh!OROf%gygpan^xDUH&f(dt8vpRz4nzlXKU>R zoXPp6iC=_eF=Uh?1H^J$DfK+8_HCNXu#3xB9|-}5bxh3o1u 
z@pXiQb#-YMbJnZY58vy(+R~8~-sN!tKS@A$Uv|;4pjvC-2NpLouq`L1o`uz@I3?9I z4qka~qf0$ppmSU1gqH!itj=EEgL!OT7K}@$8;q;3xzoXG*m3tp=ZB32MMK^$k18fS zRVN&t$HkQ0oK-RZrLb)Bpp==O7F`iTyoT6-#`%^_8P){`r^t(+$3EAljOVW(P<=A7 zJAs-=cqq;p6s*tP?Uqfx=IV8SwCDz!8b>}_(Y>dx#9grQX=RMy_WJj##5jLl=l5H3 zC*p?>jzr4ErIU1;hjNdzh4NJENBi3%JWrC3Uuf-m=};z{mP#1L_{&fuNb`a(dgdI* zeVN92akblaw2Mq+7vo)1${|Lg=!vh1)t zoj02QN9#BK_wowlGttrZcjhho2Eq+Ts}IYc8!k5Yq!d+7GtWs;-p%gK5%2<`L9D9P zA-Qv!;5A@A@NAm@f~}oo>yQNDv>*&$X-|A&mt^GeCv#K|QKnvwrKil7|D3+xY3*=x zAbGB@R%1S)i$Vz{>E~npsv933%I!W$Xb)m#vwn2+uVId*_ zUpWFU>2KvZ59mj{_ymV#knu`kKbePPOXF2ye%6NN+kC4hD=ic}cZk%vHP|-Vf_o&6 z%cF2O9KjQ|+JwXZvV5#WmSX&fYHLe>FnNaUQ2n*T zb7KvadW_=ojcR@evyM9#!d0q*lw5*dwpzM{d4jTXgYn0D3|r6y^Vze=TZ9~yJ##cm zEobrS{0D4FXIo0XT*KB@msA74+404(y1p*f2aMa}{)BWA&$7MN8wW#=I;E**)iY*x zY+17Bx+X>jwUySpHg}J1DLdhT32-f4vQoYe2b%|#vv4!hSoyN!xh0yd1c!^u%@x{( z5!yc+%C`UPoBrWBHPJS|iZOip*TViVTTHDjtR{XW2U^pnI!#j6}RI@woQB1g-gG_|m<>kFig0Zrc zgR&R%k`e)>YX-#d6VEId6jO{&>!#%$`!_1@go-0$l!gdEM)ieaaU! ziD~;T5gt=4(M1cfwVk|8Ytn5C;!Afw*;a6%G)!#uGw$FtMN?G62~6#S1KmQc^p_44 z*|a%E`xg&q@2Yr}0VP?=LwVnZ7R<5R!}%_@3#3-(SY~VK(&nKkFt`HV>S=y@8b$m7 zznv_9e7-^6-Rd*t7M$yPn|2M@yR18InxnGKXV}%0S#Z8}gLWa+uO^h0vo^s(Y3cUY z!(^}I&T)r#KhC}OFAz*D+{^LpsWcexzf?eWsP%amlVu>Blk59*@Hy_f1qd25EzjxE z(p@@@_PI;@9Af<&T5{`ZS242E=Gxl&I!=d!jh~}=KJCE8CZ4}_!e2!MT1AM3I~X1F z5TFky7b?xHu_!oxuX^%hyHaAE>N-C^Vw6Yg?nZlmP82_5G#qodX@91Qezz2{!&g_J zi?<-FNgIvx$S~`a9TB}gs~FtAw|r)bH0Asd1l4dUsZ<*el{~EoC;f0ORlvkI`JGGACEM*i{QrYWZEdEWioHr6LUkM_W znaiVL+hWg;Bevh)FWcA4mrqkmhY4QAu1z}AY1k5-)<1^AS4@Xz_N0*Fo@$>0wd?aj z1UCtL!yOmZhQ|&pO>1C3tGngi?Wg%vly8(XUAtr;^|h2m`dR?TL1`5V4ZHBnxJ7(RxM{HX=Cs&6(Kd$Cv4z_JNP3jO3Fo2zM-1dUtgRg{j$( zk6>Gj=K&m8?x&HJ?W=O7I};Oqa@li^%-*ic)B=2X+|rRRJHTPFJHw5VEwE?1 zb7cI+=Ev2RJ4>@En@!ISTA67 zW@=Za$}a4wxt3bDcNJ5_oIGrFa;bj-#y)D}e`ZPdbe>ep^z_V_=VPSFSI!jOoR70G z+xZiK!ghCAVgtYZbs0gaP0FpR!~2J!cFls^%~pAfaWMw-w&^MJkvjh3Ciwx&-%AF_84{y&cz3gNROeoGE_s^w@J3lMs zD|^FZOXj2ZSWb8)`!LRW2AeeTyY17(ep4M@E!a&e5ccV*IB!pV_-e-s9E73(`y~?e 
z^9C|i5*9sO5?kd>`Y$E>SeL&1`ZW(IcFE0F|5Jw?vBBo?WLI04uYnIfP3*?dUSgb& zt^nrKT07$sn*7EiD>ijMxaq=_@byWpI!jK?<6K;X`E_q>nY(mGX=Z{z9tfD4SP&CN zbL$Q@nbVY}J_0sn19SdKHg!Gv(a)B4yLtQMKe-C`Efk-4t;9h?|61lZQyL=~P?&7t zf2TR6`fKH|7FLdE?x$<4O=_pcczp0Y_j?y{rst9p@*5W`-wy~_PURe@NMqw!3l9Rt zNNWx^Yi3#^`&iqT(QYB5VlPd+c5DTWZ+fqX$s~GtL~IGuG|o$ICuCdVQrY27e;B{RhovF4t20Ml8Bw7wc{ z%bI^XD&m^zVJ(T;#kw(jyt&!R}}1HD|2+OJ=1Q|nvQ#{c*{_ush_-+rzt|n zBEprKhjnsh_qyIlwMMtD&-9b6;}PmO zIQk;QKJ=vYpJ#JIa`8o7z-jnp*y{Zb{3FIs8hxotHIpGGrKMhAp3_fyQ}O8S&rzf2Q_+3wb*qM9$dLu|tICHwQI&CP$B zHhC>vAG+F83lbn9{Z;KTYur8f-(&pH6i34y`LC1P4aw#wT=0g3wy(GJw-!4IA;ZT*#L9X=u^sRoGT3#&2u;G!85D zLJ1gY;ogJYtu1^7rLWb5{mUJO1dYyw?aK5ooR{cd7^Wtu=Ba2cjN<$-4&`CjU)vU> z1d_e9h|(AXB1~IsJ9?lo}T+FKVJzQ@LwfJtLd2Jhqe&C;m=%qwca3C%}f|)Z%6HxX<$4 z=b0>O&MmLJhruQVU(hGlF^0RxE_S?96|gKBFBo}mWI2NqoH;(REq^{Xn`kpQs1=<% zpHr1rBVFCLM6Vd$(YRx{|B|FAtu1iGy&0RVsQA4NX!O_2(b|QY9_L5HE)FL&M8uTZ z5@JXqRGsCc%e8i?AhK{`BC6M^(@oY?H~AYaH}}PQ)T8!n1DIJy%XqNl#MHxuTGHfP zx>mwtjM@=VX1e}e>CC%wvL~n7oYnB=C#&MBW-7#O8JXt2^L0dh=c3uw>94fWt%nuH z)587bPE4AYjN95}ydjko%?-`=YuZk!p2^CqzF-%X#hK%KCEi$_YPOF!Wj@kI?~BQ$ z`cCp0fbHAUnssbKIuYW`8(xid*@u#4-j4{O-zOBO)8KU7N(>ES^e9HD5^f zshC!9;5xLfqDs$z+XLwFimKA(8xzc>4$6GTl&+1;E&x`scW;}j5YrDRl5BUcnm-7F z*+zBNIAiB$Qq2TWDXY(*tcS$3P}@hYol%m;B+E7Qzp+R$$W`f5G2s+9cwIc=`Phi<$$58+_vln9DWmjc3#FDet{0!nF(lxBcs>Uhsr=+uVgCk9OhHCQC~MlJ zq`d}xClUJ7x_#{aku4p8^V`TKp1W!CK83<`d&i7X`Gx7J&k@x;4%=zAGezg)QvA9G zcYiJeaM(xJ_yVc1Z6o$`b^7R!OSg8?l$y2#3dmQ=77eunAUc?_oKk@eZReYd^=oKJ z50}pJPrsqIMOgi#727wBAvM62cNm&(!=$o9*K+Q4Y@@VIQFcD+xy5AXs| zraAQ(a;dVSBO08{V|JrbG0u7WDeWVC2CVBGmm4ouV+6^#|%i> zzy9g=)ix{XgMAqzw>#c0FhQ^&)=7YNLhNYC&!MqivwDlU>XEbF?!I5eZZ682O4dD- zH<6-$t|dpBlwH;dVU@N;>NLonD28gNPx^kI2l9E|mC5Wv)_%=={d4c+!u z>hhGi#ohRXA7x|p5pAIjJ8ks^l_LpRa_`>1Uf&+VVpn%Icsb;$W+-+)Yn>Ql?V~MJ zP@RNngL%@T6&=-m4EwkpQt2%fY(GK2yJ-L|ioP(6)h&%(nkwdre*AZyH}sg|*pU*G z!>f1Eky9K%i`!0}sB-IWh!ZCh7}Zhdx=E=xk63S}s3$3|SyhK%>|5QtngS94b!8Hb&BDzwO7Gu; zq~@qjw_Lc5|HNyGbhWOUSiq)r)>i^?!;ci63Gn+&V3cE($8AgWF&M>rxWZ*0X&ggZ 
zwd_dL9NohCE>5~;h8^A0HKo12%lBt*Y)V(O1c5_W&9Lg3O{soMo?|cw<}Ob(O(s8m z4NEU+799Rq^G!D|Z21SfS=@D6##a$H%5-On-DJs{{#r$+9&HiyDH!xFDk?GeH5_NE zdXwJ_8Ivzdj_-;NuNP6)y*2l<`7XKb95}0Cf69XCtQn7P5b^st;x>;dEn4t&nLOY4 z2;rHj_a>C|G1xMLCTULX2Hr0xk~dtR&aM7y&Qh_R>is-dYoeI#I6vy}YlL&D zPWZY)8}`fJvK+b)1v4@1Ff`&)p>pfnkCAAw`Sj@f+*>>kF0vLDSNU$Gp-z%q>9|1M z@U`;(A}@HFwmh-9u5?-+q~+=yucP(#EfnTTL)Iv2NAJrDA6>)Q1}i*DR;JXGZ2#QZ zGJTm4t#^iJS_i%6ORZc8%X8K$!Dafmieh{}E zvGy==)lD054Ys5E-9E*1yvX0vOsP=i?u~oFb9?Z7(XNkg$H4}-v(|WL1FR(NEJQG^IlyUAC7iM>1^?5tA;ftsc)^x!5 zBCc+<>&Aul64u8r$fWGhNowa!P7$68gvherO`P@V^Zf3pzB9l^HSKE6J99g0r7`Tq zBYA^$?8935NvVHc1(-GA@iPhRpJ6tfvg> z!{59NjBnm4_DSYy)UjRaWc^Ah6zZG~dhYM0qCx{9FDM;VrUaL^Y z?A6I)p^TpXhEqFHS6>CWOFy(kp&lj}j0e8uLj`=@MpOB9Swv3`7PV0)d)STj>VTB| zRbr5PYuPSvJLdZz>g4ON8@WJ(aU!gvyN_6M`H|(~4_7dvU+Wl~_C&`xj;%&r-NwtP z7lKs|s*}Z$Dh~|C@4+f{`-vq_E2ZER(l*m9kAdFaxCOcMWWEj4NZ5dSb0&y20>|aJ zH@Woab#PNbSR0);^nR>=4Sahoh-HtwMJJa&_zS*{yMWI70H-{#l~!{Cw)QcI^{&h$ z!6z1c_%5N2*GBq}Gr&BC{cjW_MuYy#Qb8Yb;Rcwy$e_fC9ZC}XV*V}QZb{rlbdfcj zgg7rF3h9g|xwAf<<@yd5eFTc0zb=Sv1bT?mK^OgOhMl);rq%6)^WCpbJ_3gYGLG{9 zXeJ80-Myk22VTZu(CF#Y)L;6KD*caG-Z%$}UIqR2vW5K@ZO`?I`R}*92_AOWf|fr| z=^Pm8Px=+;f6USzK14sV9J+zx?H9Zh`1pfpwUQ}1{Fy=79WbzQ| ztV*d(w`5R%$bdm7D;uk`&43p6CSoscJhfUa;S{%42vW=Ex?r$p&(0mmrQhA!s(FBM ztlru4U2~tMH|S6L1{~SWKKR)Oes;A-9x&hoKk%Tz7Wv%ty+ZUtaQrKW53hL{WH>?P z>w8Mjud?80#g*j<;6110Wq7{z4lqd*pss-1l_$L>L;u>-`pV%WC?}58qbelomC1l| zCV`+_^okkZyZQ}|Q$=V;vT_Zg7?VmB*jNxKbYfFYd36#p-`Tfe>cvMNJ}}6IHoxl8 zkSj}Ha^yy`@&$D~4XGzzsRrwXfO^}KXyw&VG0${`;9$$aAPzpV^wBOso&H1kI4l+l zibdwZGV#ZOrQbN(`)8x9)#V`fYS`F`mv6(8$g{5QQkI8+G6s>ARg2!``A4oOXmIp1 zXfW(;nCNc3wE?H)ZjQZtlDi{iKd~Xa<|5o@PaPYf9wpCNk7~4m_WqdW%Q&O;V*t(p z^N>~;3Qg{w?d9u&kKmFuMsXhuM$0+Ergo*g>q?GCHq#$6U?ggd`4Pb^w37!~7%ErJ z6&Bi)@~$G;;*sU<5nwhqwpB^3A09b=v_qKoFN5exL+UxK)boV(_NBZ_P4=c25Aam4Qh}$yMR`PbL)2poN*N)hgmc3ys zVkz&Ok{2l>{Z09NsIza{;JRw=jFjZF20LeS!}Ch1CD--cg^fnmmkY&-kDx6Sbm5%g2HtzuzgKk6qva2c2@*styD)XG4qcY3s`}KPXX4fkx62&u*hc&9^whV| 
zf(j&&30Q3f;vq!53&lOjiy6xC$kad~*~@6^!gIM1V++TlE9S6QJ?2d@KclFKH0Il$ zJEm3^$rx8v)8-4_A!z;!*%P3d=38h{74%ClHD+Yu4)N5IrT~oF+dtu~Vhj#vKh|Jd zHo(X~2=Q8f8c`xnC79Te;_Lo?u+1d<{rB^` zO1@NcXOgHvR=w`yx0Ks%?)G*a=T+?S1K)lvjG2#Inq?mO(wd14jR71EWbJG-B1&gF!Oa zW_Hu^@`&SEk%FrNHtr{`%NjJ0^0Lv9U(VX`uP1uuhNYI+)b9i;Yd;J$NWF?PKU(b; z78iBPaQXj!Y(C0Nt(%$>G%fsVk=zT)gBxboX7 zfkEOaArtUl=;Cs8V%UV&1CAI$e`3I&t|s1UZ21RDE(FY4N`tHrWf6h7Jy(3bi$=0)G*mbx=RHCf8mlOK%?uG!XfKgR8p*JP2aF_mxy)|JaOr zKZ4cXa%!!3kb7~V=a^0s*!Bx1V1tg&*!&Uj)xCx9(^i^_4mUa<16v@+9(B z2Gqk?gToTZomqV{5x}E-PzO2>a!-P$tP2))9b*Z=m4LW))}RFnj2s(*-nP@hdZK# zW)N(p0cPZm2UBeVEA2um?SPexkV-75wGpI3D!`^{5X5EygE~ZvzQuJvS02H!mF|Ya z+=x^^0;<0_fK-=(LDB@Ne(2R(KA^-bL|x_(z3oB9-f>Dw32_c$NHK`QPNyTHF%+v1 zQ4T;x2nrLBB`}9GPe2+HR+0sB7b3^W0MQp2oG=uDsey3{voW_=b^zQK181RwmcItp zY@RE+Ug1Ju6{MgGksE1V=paMag}o;sy$ge&3=zjPvZ7^CVCjZ{lb1;L*#l=S4`DyHJ` zqL9{*6*-NF$cmmv#6z%v0U~Cf^hyR>2u7|T_XIFNA;c6cJU~!S200{uDR(COUciBLQ&xCtrf1q&(?v)+cmBTCD8^tP7S z5G0%dCAG95BJUX>7KX4^@c0#|=#HJV70tGs18tW#n=43 z0s^f7(Ei5Fw2AQ>spbS*yF)nEYqQ$%G97X9K$sJBQ6E5o84;Ro!==FiFR|3cB`-4? 
z<${BxfVl#+`lnzVBPsx4+#yV`mzJgo?8=Dz+n2vhaBLT(MF;%H$BHZ+KLQK;d>9E^ zs^6>Mg^E?QXoG8&H5F4Yy}OC#E>k%KhH+TM0JL^jFy{G93}sJTW7aP{XX~~U_fcPZ zX?RV})D*>M!RhT+pSN!(jKq@C03H(o%JdC&7pc;38em5#$JQKSqzej@QY zpa(z)_d4v|_ZKBi5N&hcM_ZRLE?kq>y?heuIP``K!_>j{ZT3J@YdUVbukL>ti%nC* zUe+P^Hv6$f>jdL~U+xA+u+0%*H4`=P?wiTPiS2`wU(c6gFwN0~mK;#7R9+^{*Qh_Gy0lBk{bU#%Uh_c# zm=;*ahHiTOLZRy*gCjwR%GSr{A^ZLU;QB(llPf>G7D0?$)xV@2#fFuot=4#ZN00Wytz9N?d zXloQecz<)xz-9*)KqRA>MTRiSwHd4X4?@ar-C~{GS+cS_uZEB?n*ky+mGyG_GLff* zI);I>a;G+a4+fs+PvBVAF;r8zZuvbIvpfYm=>1a|13Ht+xeUmAF76NnZuLtL@@5+p zfiZ$Y;rc85k~0$G=AfrJ$pue8vOrrm!SMwQArXpth-d~v+%vCL0S*X$*#meBKUamF zs8}SZm#pB4m1_UONAQSpQx|Cg(mFTw5%D5&0QmxtG8oGiFswJ*fz7Ftg@Jd45ibgv zqaO65hq(PAcnLuYCb*Te?jc}^TWR7Djff*4z{C@t?~IZ7msbg9lntT`h}ri+$5#;L zl-^y@@e38$A07|m+%2eGRwHzxJV?KXR&;v>(cpifqZLf|Mh2pj52VB1cWD|PvI`&tBMDk2)Zk6?)x9>|>P;GF!CISB&*7z%CJ zTL#w{+-?FZ;gi$%E`lF0e$xozErSa-1dh7^A%2HIn2T;oky9rEcS{k+4O}F^Jl(&6 z@HRv>QeaV!LBFMr7l9(OzU<5u5Uhfr$2_>GYYWSm-BG2p(MDtVQPhFKl#ZMX#rkGL5a75qp4bB_jrf6qABA z{)362!_NkR3TR>l9`l0AUg2sMy~fr_(pHX z(h34@0jt$A8;qbV06JS*5Rn(GR7+H_JJ_62U~__tF+l~E2wUe0f) z%VBv_bqf{f3csgc8``Dh;5qIt1R*$d(l?fue2ue-`5bAirOJJ@U$4w`kF9gr_vgTR z?*Tssf=y{sD61mtCx+3XRZ)4SY%7{Ln(hXu z2-TF*6N;vqZcAb56_r)WiP%V;;6XNKNo$JB(xAcWfSB5lG;nL-y-=9~A>O)(Rvk9S zzBIF%qJG>vURoaO2>0wmXfAXO_A7-uEDml^6gQ=p8*=P{DZ_3_fWYGSz?)l81}VX%Ik2J zH$5Xke0|X!RCe1?+PUedB!}kVFj75;(oUR z>@0rRYH)a|nq5`I4@|q+6im#d)0^eJe9O(9p)|a40d~0pmMqEwTzXq7?4BgO3hrbE z=C@dZzP59B(iDd$!-Ld8Xgcso+9kE3svzYvf(4;DECX{;yG@zv|4Ognw8`_~Npt-@ z^sq4b1KLRI@VK@ae0dY#6yN|itT@8|so%=P2>{ZTjURkDeq|rAsUX$iNdEjjV(292 z2M_A%W31jkScs`mU@RTP+FJ%_@aOty(lc%X&gVkWL6mC@fK=-EPU!fp)MIw+o;~$S zf=Gz2l$uYFK;g`fU752_EGJOHx<_uxr{fSCIPMTCu;>sd*A$s@6h07<4@ z2v$+TK}5o3nM!Pc*p2`dB`BI9K}>08g>a*MyZ0!Ah(PP|FXH z3PylKLD(Z5E&_9136F>xOHTqdet|5sI^Gmif5C@Te+HGvrG*$=_DCGjPBi3jxO9$m z@=E+Xg$!5)wzCHjC82l-5mQ$%i6_F}{2M{_eFH170-+#&^+bTGb3GIhnCU#SLUIgo z@h=1}L*R%Ami{3W!7&JkXIFGvk0^8_)GfG70CX$=JJa}O4IBykFr1t|GP#X#a(Reu 
zMtzTRkxLm;sTCY}Vb1Xj0>{|9urMvsjg+i_m6j1v6Y6#|BK`?13`slr0dKsOHnozO zod*a*2iuyn<{O#km!u z1z*<&7!`&bE-wuc9JnPLR50^9^>Y<+DS%Xh*vK2A4VHoGipjo#@ax~OX@J`j*l7-7 z8tlLD3J7kB1mP!VBLg7ELx4Vo?3RPM1{)qy5Q-^^%QgX64EQ97j(CBgw8(|H%X&=) zmV+z@K&si28yH+Z`7|%=Qwm{u&`^{_Lk6bdGXBw~>yMKnH})2!EtROph@qo98BW)yPjU znJScsW&G%Ma*Ng5;NMJ-5LjD{dbB9>MjM7ab25)^TT$6Y@))7i|>T-RqeH5GQIAa+>%B6ueOY5G0Ob3DlRjjk>i9V#L>bQ|o%`GAH|`?f4N4 z*J6-tQQG!*MMv=V!N2t4$X1)!2n9qt@ec)*td5rs?+kl;-%818;K8DA7o?o#!sK_Z z3U~_kiL3!y&_(W2g0PgyhRd(&ESzZjbNX}5SGb=$>TGNJdzZ#p_Nb5TDZEAL*@+i* zzv-Lpw7J8BAvJZ0l$ZBX{wIIGJEKm%y@MW+Z}~pXnjesRY^Fqq_>$v4v`!E}(!^1hjJZ;; z0J4yVR5uq_!KB&7EwqZbM`LNsOUDhLZrYyIRr0H3U?oQ;dls!$)S$ z;QhoU71x2#MH_&LtqF3Uv^fxs37VTU#!%YEBlZW9s&ZCXAcp{ubrTy8%lEye7CzWt zJ-tYK1YUtlDeK^?SZiF~!SU)QnE;%Ku6{(P4FEKof$fJpH^lt7>AyU(z@0k8p|T+| zC}2=h{|5ldb$~DtF#L^>?*XvlR~k`4bwkK-szKYJU98 zU4S$i8oyV-^o9-M`#Be`3xabIY!tl9gLfsM_uU8GGeju8=AdXU5=9qaU~ooa#M}Vz zlnCA#MIw+2OaeSYL=N;6b%;oTA~*M>vT`lv&2uFFQQ)PC7sruKJfQd|BEkzqFLofJ zIbdb@;eky985oEg$fbx!$e=C3pn*f=;++yWtadK+fnpTGF5w{8A%Vp2ad?%5%d2$W zhmjr7B>4#Mgo7bm0^vq5Fb8ra5st3(-O9mO1{ml%h7r7sVuVa6OEHIgsY((VhB%xx z7h#w{zhI0sDm_sGn?mwE|AkY2^Elj`nc|9WqYyYJjZiZP9K%%prX0kPNp(Q7sS5#a z|IUKJgq0<-E*&f3|IL0xbI6617v_lCgrPzm5D`%`0THkN)@?L0!L$|Kb|J0Bk%eHQ z{);5i805*maN#P1d0zyPr>meS3`O=i*aJ6%r3+&_peDebHTY&Nd}9JTWEH^B(tva} z3Sb?)M&!+f5)dSS>ynT$8_MTi`j|#=1qQ(tT=8G|uTiL5A%w$I4u|3c1Zz}X;|6YS z!mMmRAYpXCH63m?t_A|1CrIx)0QkW4Q3!AY0Nx;65TfK!Smp~-CKk|?$PEK7M1gIA zD?cm9i4S6{54#X+Kqx+Bu4H%-ss&j+so$`bi+4O$a!0ZVumRv7bl!q6Yrg@E1NU}M zHiR5b+?*JsUJI2F_)-iEt$pNo>gdBf=(zn~kcEKU3_1%e-oXK3hQP|bL8{9lzygxh zg(rymS$e9vU=HoD`oEZ-N(B74!KY;gOU_^tt0Bg?glj5iQoo^@qCf6!{*+8kf2?#NJO%9s3o3gW zouqOxHLfJ01v3H8tWDs^`ZoA?TJJFEZcPxYPuXo^oT8$*{A=WEpZ%~ehFXLzRr4>x zi7(ffI0>37NFlNKEZEiHBpV>belu^aPMZFu{Kh92|@% zl6`RX4nN0lw3X-M{g%LAu*hRuX@D(s^$%c`0I-RYBnU}`(z_WBO@C2TYUWsK0F6L@ zcMfb4n9xX*Q*!Is%TEu?_w`#k)xGfD=IzrJZ}$h1!ZM)Oyd$thC+6nn#3r(f)V7sY z(EZ{n)*@Nk^I&-a8zgGmmNFNPV|}iBXRe=VoTwrb&njZ~)-8E)v&$=k;b*DVFUXNC 
zy~b;NvKDJD9j{o-1o%dEM~u%4kX2qORf6xdX$z#tODA8N}|Vt?|&{%$l~@s-7q9qfg|I;%D5;O;8|w@}H-_=qQ@4pT4r z<*DY2O9By96YUa!p#ZzC1(m$G*C|R&onF0+rbm0v9`a^m{>g9cCE}WFZUy`rVm=bID1wJznyXBMJ=QXP?5827SU>EMRY-5I!R8)qW;VmGX;I86?veVPBOi3AVTi*)L4x&;k7oN)I^8FjFJ0#l*Nq8tA9XHi|t`P z#u7L=__>@z2}#;g_JR{Q2{;4D*RTvN!8vw!Z z$z>*VR-l420u^xZgp-fJFx=ouLYzzSlvn|G5QT&`91Ncl+)G6N;0Z2jL9SKTAhBKv z`pYP!sXRzwNFX!77_IQ>#)xiq_~x8y1GfuTGE=)i?EimX`&kd;Ck-jjSYZ>mIWcKT zgtoqdP#~oTA#hj+>{1=CxRQ3;fj}u82&MKQASW6Q>J$PGihifvHX#d>1Cbc`<_uUP zN#ramg~&`upEx684iq1AOZco@fahKyibGHo66+C%%)P=R41grOD!{#nM20u1zQFv8 zI{E2JMso-pRe`w#fYlp9E-v|;fQ3Y1A@;f7SvM|bL2gNjg*47t6$iH z?H0t61=u?pTx{iXMVPUbqy+fu4*+NeLmi?J@lPo3MMO3fAt$hoL0$oB6l5BM0QNy3 zg^+~*re_RSXc#0%<`V-(4*o?+3WQ5ODwyBGq$}7sn4eeUrsH!h9=8 zF2ekYAJu$nzvKX2g4QLTASQ@u_G7;5h7H;_G+v16T7KSQ zCQ%623L+nn@KDo>H1{S7G>8r;6@kAs6n4D=CtZ1T1s%bwGvHv|-9dFdt~|LvkFKRT zBeG-9Vm{LfeyIbV_8>R|MfBH79R_fu>u7AlxGnhWhvYV&x5t6)Z-(SdK@bb%r+5mO zELlGQaeik~KLC~H93m#&0Dnmh@8<^+Qv(0R$=nKJwdKs`Po_weL_#`dA879w%%Qey zK>$ufVM9aGAOI=g77zP(f=~|ZqETK34*ye6|NdfHyF=OcAZVn}kAS>jLKQ%VNfY|J z=9m|MrnQa&eg25}*ZW|x;eO{l7IWA=t@9Wh) zOM7(bc<5p=z%n#8;A(hK_h7yafHULLecjbdOHJnZ0s7JKBZ5qeCCeiWmz8wX7vLX- z;29QRz|Y!~BH#EQ|1hqoa4{RU)lYk^x07L8Ww}I4SUht=s zYIjjzN4e`~$)`F0PBp>1JB_8RUq3nwXATM zet_G8Hv(D4e=h2fZ(Cz--G2Z4<85Q-$QhrKi#@;V0_f81@9x&}KoCsDctXq9YoYhY z&$cF?tXOrVcX*Wt04PimE(!dRV3i|!FO?MoD>|$OmzpTHxG6PU)3sjMSS8qT%?n!? 
z`#mF+T$-$Ir4{x2{V(?3Jf5nyeIM3vCq?d3AycK;GKS1U=1PWAAv2lE5Hf6Iirj@% zHieK(Mdo={LbfSno`+55Y#BD2{kyKUcila|=ktC4`o8a9@AJpgy0^8~HDBj(9p`bL z%M~MusB9oQ0X$@41zy{EVYO-CbD3A?Ozg&O*e6i6d~BrFTWwQt)p*_{Dtf)Rs%UM) zsITNJ1U8Uu-2)=??gxU&2$G7L#{O^qi%WP7WjQRrWjRlMv=Tb$fh?y3>T)Fi(B%|p zy)OW44z&~Ta+}Hlt@4IU3LhruIQXw$%4J}psGxB9Usu#zcJ~RW%K`A@%`nJ`%*0E^ zD@BmFWCR$+1=&d z&XZXUYG@tXkxE4%kN}4?I~_tevg}wFp3QF{Z_JvY_-4pDYU4ooR>WMF?*tu45!l%N zB4!XEB?wzX7S$NH1C{}?Or1!x|HLeC`=vOTMk!8&dt?C(z&{9oa0aPlA=0-L3vW7`W6l_1=d8ZbhMGc?^@0Nr&J zk_9_tl@9*boJBCu|0G%VqxciG!C=^HK>0QFUjG9CUsquOSuhP&-v@AjfB*oIZ|_Ae zfdu@+M@53EpC9Rz{Y%ZE1+d}EU+}sJh?-wM#{1x%bVq5#I4imAVWF#Q8?XV^y0e^V@(o2<`THIZu~ z4O~=v5<_Nu%ps-$&WqOE{#FniC6P5W^W(>?_pCxm=O6wiGexv`)?ft)vj%>t@%-4e z<(4Z@pj?8e*T9(&2v7s?F??n2#`|%)Xnb7qe3*V@1)|Y`&d)JeiZUvGuqAhGMSG|@ zcMw7x`FSACc}MmsBY|8e6eYDL>T#r0Z8>c2oJv8ISM(wd_)5r*DdD z$oZ_&Qe|J$5oX!}M`FWw6nJ3id+nCFpKlN%1>qo8XHhuPE_3ncW9xxr2B#Q3`)fShwZ z#uM;SKmcr8aE=aV{^o7pnexW9c8to+2BV_iLKpldUs>#1mo3B)8%iTun0pc@KRpeP0emqky9 z;0cX(8)@a;X!)0a$d2y-)qN}w0L9XdPpsCmy!efR0LUeb^EI*`PR;X(>0YcYd5%;E z`g%PT2@{&Y>D1z|MCx7cqM(_AI3Rc_Hi>j}m>QF5gvubq-&Cyan^(4Z%KoUaJZj8+ zjnHH;kNDM~u4)uS=#N155d2GylK9{L7kzwKeg}Xocm9PeY4Pr(Y zAlYt%5K#+iz?1=&aRkv`B7y1@ClS3Sh37U$id+NcIw}kH!Uv>%#u^kAK{lePL7%3$5Rwraoemsn z;M7khZ3Lld{Xlk~zzDMMU~vQg6(86&zakzrv?SynSYVUzS1In9Y-F~fp~m$MFVfgQ zb;JNY!cY?dq;dAJe*}h6 zfqWMwYWq7`77q2i-^noc^RK=ozrqoL7f6Q=!~Xe+%#unt6~2=tvlSGD1{)cftUw3= za()1s@=qZ#RAPPyQLUhBv?+c26$wZ}Kc^U=Dr6221dwoP@(cb9wSp3AS_B-#A*vRH z-K&IS3{s-Q!saN4|A)xz7KC%+A;Pd(!_M(Xn(a#I+K))ro`YK{jGqa>f3~HBMUe>` z=#%^awg$e0*YM#20^R|Tm-X_L5d^=w<3^dNP8QK%65Xf6cmshI`A?XxqJXGV__IM zlI!6E_;v?!^{6*TYm8>h?gn_K~+nj z8~#&FOrcefrwzz2f~}BNBv71+0SK5Pmj;2`eeLb_v5$jus|y`l`6iO+5-;wug3X^k zeS&~KfMfibOvlK1zBcl>hv{<8a!9V7YtX-WJFn-HJ47V_VIxGpTiU>kdOlGcm&&lXMQyp_ zm6P@r4N7aqeU&dQi}0Q;YyfWbcV19guC1eNNeFjzpK;l!INnma^F~{fV_LKGW@oKx zUTq!R@nE>#9$MVkb)U{g0=p$82&a*q5%{8F(&_}{X2~jIsGqs(tc*_a5WHFZhj5sV z3G1Fa=TNfbWcA2ga9A2Z6L>~L=#82{e6Wm5I^$|ZYf&2G_anwrhyWN;HE`;*fr)pF 
z4`Tvoqly`iAKSo90|^ioi3|`a2_1b9WC3^sKK!$s>W=^7^JIe+m%JHw-Y=(3VjQx68@oI5GB@Qc%TUR-Y zHq4B80+hjbekp@NJQ9J>U+&^AD3&snd*NT&1;jY{y$D*kxIHyzJM@Hc4(Ns`GMN)f zB_1`;%&x6{g(NLqnlV5r?QGy=9;%>PsS}K ziO>C3*8nG1O}QiYf`(#|gpQ$enUv{k~S;4GAwl55vn5V&H!}yc~I-{>QbS z*75=P^H1r0semK~cR0Z!TM?CdN4@*0&TS~987sStIDdGjmtW;-vp)c;Js2$ol)e+U zfN$v(M!)tY#sEYFSAdZ5VUVy#cgx^6fXYlBnhDZgFhGK(n)gyA;yy5fdF&YCBq}IZ~r@b3K1=fzHkGKb! zfsEvC{rDfyh6X5fA**G|2I~L`2_zWvKV3^O>>7qSm4c%B77ze23&sG_GO$k(xWbRb zm?N^?d2+ZEJgD2#;eLfjLJya{E#fvh4ln=1uSAJf*W(0j{)3k3XN1XRpp*y+xUwgQ zm;*ZQ10|bH2)LZzhx@e|e1bSi1A$ayU~LtsAeTF#a^?>`4gYV|3|#skaLOcI(+vbg zQY1AnOU+yG2uR?;F~7Lzce!6@ak)wk%#OR=r6z*Z0fK6u+MSQW@Yx(%Zvr$hobl+ zEY__lAaWS-&BBod0u(KFOq8MBV2k?NhXI*Jad#L>9z>YgGzYOwE)Hhd$=yz=qO z@UK4Mjz*WwSSR|uYC(~I0-!CKdnOSEVYzd@C0B}Lo-%{5Ab5yC2oQfIbXAW;?|Y78 zzAqqe09;rws|4gep&Jyyr*1Tg<1kBYln~ZjpyJC~3kbgcCANXU2?$^@EVT(ZAPgS5 z0f0da5@0_L;cc$XPd6TqU?yLPH+I4GK=J=fg*W*`IBE>HZ1_|b!5H#R{ZY6M z_JYX?kdyp^A*;4PcuC)x!!BR4Ag`m-22v)F-B3VsTpnZT-od-HnmMsJ3!k08_w}<=xUb7m7(Q) zSv1FsUrY=T@-W7hOD}hIfJ8`UeSGfhsq!oWl)xoY0#RUb!mC4f;s`r%C)Q>=oi(}= zx10eZtdR6M?AS#|qv;9!b@cT4rQQkvZrC0E@F(FFh8o=EQeq#Mqw_#OTiZkLSxcZN z2TfrDO#!~5dh}3N1&=Ph+mrc0?lQEh*5uP)bPbe8Wq%)tuDDImXsd|e4tGM32omk> zj;cBnwpVM;@bwYG_l9}r(b1ox4MZEJ%}BT1lD}V{q*gK-{_~l@@H>@ zVu>GENK?jts9}m`?mAahEx_D>P+bANOF(!<;c8qdDM7$np+^Xsv>0UD-ulD2qyt0; zAUyHE1Hi*z#3>~Ld_TN=hWI0>;UyrFftR9!vdNAd0|OcGCs)%AjV;1nI2ZjQWXN>T zABk~(du8g7`Yk{pv5H)_z)Nz(w$ZO6$~;o@L5duiQkln4j%+H46we5kV=nT1Cu}O& zp~Ra`i9ILFYqDm@&Vy`LXy(w!Vg4L^{68X?VDZgY1@} zgg^ZezmyCgeu`AKk>Zx2=xa{HzqBBK$v|9Q6c7U|OGX|1@KO^gIOCUEBORe5kT4D- z1a$tH69+X>pO7 zw6;F`oiehaJOiw^Dca9MIhGji_k(^PE&-{82B@*#&_AFDC%N%)2eC>>1z0EvC9^}~ z)nVen>-PW-M}#NdiDX*m>!&Yu0{axvWMzs51aEqMJ%9WJL!$T=VYWly-U3oW@{6;O znkODBd-d-dV=2C$$RFyQtYxUZl$v#u%~`duDkxEH2BeIlCJzxnUZ!YHkfiX}oxw;O z^gtfFU-j7u=1k;TFwDu9D(L>J?lLAwnt}OHkGZ$UF+m8}gArAxXbbj#-{wLFgQhfz zfBXou9_kGh|4{@?-4iLE0+RP(HgLiTGNN#{5nhn>X4QvOL9W7IjkZP}UjO%D=Ru(4 
zxt|;gJVW%UZc%C|dc-vbW3jT!O<1+K1^*~o2NqNW~bWb;!H#gc>u^xx9swRfMmK1Or9jUp8f1tN*1NaB^NLfZ$NHJa?QHj zf+)Hm8#9>KkUVN2@~aA&MXFDY4hP`Bf@Mxlh^RLtn6!t@|LE@?Am>1g6bH1m8>L-j zi8ai$cmFi8J>}mw+@WZQOqn?W*UOm!@@~=tQnt$t#)g6J!vC%m5-kRP`T6@7G4efI z=C{8dT%S$`nH-Tvk3v-w5OdGlZ~VR$iM|I96Mz3w7ziTHAOMd1;Yd=eA0K`>lO#*- z8H1xRjQ++C(uEwMJ*4ex-19WZobOA62sryt!etKFcpXsnSg6?rVF=UEAHly5!e|iG z0A0&nT>&tGBizJb4u?AkwFXm0_&}(NWJr7iVpiqAX4LoOSJnWykHTM$p)gu;i}SI` z^AlS`pSR%uy7!50yYFH zc<#y54{WwtgA6>M%S{}Pobaw1xSi88K=F@$e?Tq}$g3Fj-DYuw?aL?*TyxpW!sV*0 z?G%DJlx9kuCKn0(XXw?53F1{0nXy9Gvs)GspL=wC)i{+R5r>0u*70|EMOwP1+zmwAKYzf}rBYjEp z%gfz_p^GuKs&FR+!bGHpfO0C&g~=AE#Th^iJ&%)X=o;Ycbjw4`Zw<-0^cqQNy!1x8GC zKo$;&yT+oB~K-Kz=kR zV=qpA!(`i9ZM=1hqn*FzrXqm|LiW^SrmWHXUru=gz6vJL*vO_!Aclp#3n%?-B`t#c zJVCr#O+|3q_>_L3%MGY{g3+cvJrA_E!{K(`N}7&m#OeUQV4i`#TWX&y$b(gny8(+; zLUja%(&K={asuoMsT=gTEybml6CgNo=EIRz{2)c0J?lR8#t?|Gt8c>MrxRo{bAlge z$9QfUMyuu(bo)>>zWm0!j;JF$e42Oh*a*8rlEAmy$tJ zosJ+nI$S)P7I!zYeJX;~|&^?N15n#+X_v7T~K_>1Uth2XFMc|N}NRR>fb^TULYZ^+N0`ZIqG7dqV&Fgfg zbK9*vN9#-j8JCmqGy)GwX>*;a6N=RE?G5MM`5u)~zvm87SH8O{>!a2L$if)T$q$S_ z%@|rd&jPy>Bh{-fm;%#!=LA3W^&8v);+|6IS^mbdh>{1g#DtFu4CVNP@_y*o6^nw;??+~E5dw8+5(aoc7pTHTaxbC0jWXAz z0oLmQo;bM>J`AbnA1Oj>3f>-kglI2rffd-scj5k_aQYT2qC0+-2{IvNFTpaeqryNA zY558}kXf4t$tq&H^kOJ=|Fh2oxP27hgb=VMb*DBQEDlGxqc@LAHjLP@D#0f2;LV`^GJlO9DtspV@ zEGgC@%83D60g|Nujb)QSq7pxOY_)^h(@!aIVuA|6mZzrXI+I1ZQ)*eKfm%~UxhtT6 z$X|tl)K0Pz-42^R>6i#%AnB*p;fwGI0R2+j4L3XFNy`u{p&R$&BACl;Mo4R|$sDts zKc=?tOe;DjIE5m}NlYF?y0vjpXeW+`4gx8BPl$kpm4vLx8bB~rGu)+Nk=ZD=rep!B)ifQ z)sfaz&7;_d3|IqIRMv51Z8fcsSUY=E%z&cGG34%?`8BN_Io?nfM`LA z_-Wy@n@i1R+iQBZOr=i3?QEe7$aX(kupe1kfj7BHL=yL)@HiL7D@PGU8gA!*sBL`X zeopYg&&~3ZZ#Cm&@I)=KMC+SOAZNN($~)%%p4wIt(a%+BL6?OEo5))B#s13{SXJf( z?d_^0mOTmOA6CMQLPoC{`<>j<%r8B@DmL||CxTaV2p9`uo6*1f3RR=S>Al9 zy-@!mq1>l6-B=wZNKPI$YdZHg9`ir@)F_GJfJ0ZhxtJhEZDsW-Xh*JoRrh|JF&|74mH9W8|1YL8 zrjM7WL56TH!H~0V2Yai}HXQKIL)dw^`!JOicf~VlaXT*e)Cw|} zeZTJ7-B&zZB?k6@S2@8<-~$LZgJ4y}PGAfqp}e(rCB!J?6X@3MFDl$&ivzo3+&!%F 
zfJa&7FsL^_knGhODwcqZyE%2B*tYi#2cttJnQbkQ=7vwicnG(L?!z4%Ol(K*;f-0c z@)iMQ^abKqWwZK1v_S?D6|mq{KlZ3prImH5vd8tl(Prax$@M=ZY15@K?s`@+$MXmvV@y+FhrZZ zP4@Q2fz6U;A5TM2C$KdSH}--EPrwA&jBv1A;nsTk)o!?_wj?Lm*nSnPN=`+BR&vap z-5`zuU;(c-m=^=YQ-RMMAl+gH5e-VE84PY(!&<2C^`<=_hg$G`&@;zC&s6$rS6%@p zAye?_BT_9~gmA20)eDGiI9U?`jehcE!ay)sEP<`*g{Lu&xv+0Ohvo!33AweU|K#m? zG@PZ%f%Ov}G~^T#amy*OSUe3Lh=GIJ9DrXT>x%Ax`EEd-?^~mX&-vNi151OH_Ud^M z9_;D2W6PJ22|2)o?O!XOIfR5gi!Q~h%Xg`ndInfMG&z!2%`}w71?FZ?F?hAgH863+ zVLMa<@UEcuP3O#6LGRy|1~4NLdL#hdW^4#&Ey#En>|;V+MHsL&{X8aftAs!01TV$L znDY9Pjya*iT<6SrTP}gla{`tYtUUEY*!^Jt{?t>8Vk-OI*%e}RxQ+#DA=PRs8(uWN z<_Q{j2TorLMKn3sDfmt(U`fUj>U&gf$-A~SjC-m7!(U<&Duz)5?8ww?Z24bsJ01wG z1wHx}oWKFqu^#ExZMNHKS?gFC)5Ei`C2$XNf`0&uUMKJXm?u3bSH1Z+)&|j*(B$*1%C&SpkgKS$Ry+BD%A4FD{)jf8l2%!g;`#Kmx>IeDlG0u+;M%uvVE5bAmNBo_=L2`^OFpm1U>h;Am71 zkmLqCfjy}B5S$#hsRQsKS$8t{9c1U6>XLU`+Rvk$$YOwUTpW!Wc%>*g zXl3z(F}Oa~zS0CNISkk+tXW=zTA<(#@C43;qpC--{afP-sI`;yxOq7lPK*~=_cut! zf|$)K0bp~EK!Wt|hQ<=Bh}_$4IqO=fj+JNEtqWJLpM4Cbpvk~$`15@@V`8$s{2Z{4 zOD~JoaC2;c*fJ*d)DryUe3nD2{kXvYfAsU~_mcrTGqKjb3f2caf&f@!BG?nydhqax zzo@9rf<)Os*#)yH+;0TT)-J$?QvLnyXb_$vD5JVvC=S|8MRhlnoCK1}5Wpq^Orb!P z2^3q&4 zG7twQ^m0-S*aa(6WMiO6KsZ5Koyx`mZZn|1_TWqpCiaCTIDr`=GCKzNlO1#>F4Plm#Q$$rt3K)l6;S+;9w zpQHm)S^3u>D>ju4uNw+{1Y$5h>JBS*7>gu1sm7ZJ;TN32S4X}h{rbU_6R=iq`NDTl z>8XGs?@D6a+b1a}gMDU5UERs8FpjJk(x@|>8_1)kH)J9D$ z%?Ak@%auep&LykBUw>-LdA*``9>aj1q~-EN#mCE6=~&`@tjoc$Ni2ax)x@nBkobIh z8UtSJX8||~7=^kAL{tGnfvO2m693Cz;rf4gD>_f6IX5+rCA+kO&r70gN4gi<$vxVx zdS(dd+Is2q{I%H&!nd}&lvlf3-z`K@CT+5tU5beFc zy1F);nt@H{D-$W(0Wt<#te$PGxGa4-p=su+8-BAhFJI>n_QS%Qn{QP#`mH*u!(W=P zHFKp8IC22lSkLEouOWBSHuakTMv!gDzrAtH*HwBR(-)Y#d1Seqe?eE;+BWMeNMqH^ zE*8-4i>{Ko=oRReK-@O!1Cj0cSA+b%=Wnf?u{{E5NY}oA_nWw_8T&ro{CzlT{neB9 zpS~sHeXo(BKYw0VSeCvz*${84Q>*>G?Da z)POhhS}ke10Ugp_whUaFZn=D;e(+|H(Q-NOmR)QfQ9;O$FCBSXjLePP!2tX} zxGEa?ma2wX`Mq4j%Nmu0K;h+yHvISAIhToWxi~vRC6tI3JAawr@@D6izE&E`8>?J? 
zv`vc!`tz>yicy+0xTQ13$MQ@Ye+-1pl-<7xJCK`UNUf?P5mkYWbVNJ!j&~lfnwAtNpoK+_I zz%VE{`IQt0y)G~Dtdinc5SsQqhgO|9US(1d=i_fX&ne|2? z?a?HOzTwuz7vSkJcIO)dS*uoB<(`G{B<|M6vd*8rD`!bTS=J_|H#fh4bfYrHop%i4 zta>x1*Kh?1#6r&{6Svg8VPmwW7hO|G-JR;i_>IhpGh>8|d_51HjEYO7>A~r+f1*I0 zWqB5*T9;#7qK|h!!k3ks?kftO&{zvlJksZguTN;t=zIQ|ab@$ml=GHUM(2XE2uGAt z`B3weT`vRbhx#$=`KOz?UxHQfoJ}CQpqzf6TeOLGJ1V~K~d_{y4YZrgGQOh zBxs~y$Q9wd#_^osDlZt_j{kB`fauipB&PAHHGBSFPgGDIG$vhg0;!?oIB^+MLSR zdmkkHe)gfA?)hFLzV;|yD}UX9dRjX-AJ0+FbAaszX`x-s>f6vn7*j3%Ht|v1dYb= zVbEo*bGxCen=ms`k~t=a8O}1Elz)czG9?6DSPJe-+zJP)2G;S(@b_Sz$3*Uq`GylC zvx<^_Bc(6pC)mo%btdMvhm5zS_0Uyk=y&5ry-B$3wzBTpaRblH^!324hK~e&rz$MT z61>qjsF{5so;5AD~>2%_f`G#kP zmsSVEd~JX3^(qlYYQMCHoq}8>`?xF8oy#NKgumYH+nQ0cD#t(LS(H?1)&%PoSZ%M* zgEKp3PHim*ds$77j^B$%E$B`QL^)ok4qf)r^t{U}e8%#!?~b&xj?S=Ht?Iw!8gP9PSXSt-9OA6%vtBTzExDK zmG|?r?smG~iqagYxz1i(&KYAl<4L9Mc}dysjR`N`$(X8ZuP+PObBQF#tW1oRG!fOb zHZDG|lfj1)Q|ncBAK_L-`5iX|<=$B8ztVfQR_SA{OJ@hzkuS?st^3%Tb74~Po7Yzi zt$gcCn9^GpA6#djA~n}mYb`x)R4Y&Ch+4w(77{bOCMEHAKmgopbzV?X#IQ!1`*S{H z{#RhW^j(whW7k|%s@+N}&qG=`7#vhZJRlY|Ukduyq)ITu(B}gmuBz=i^`l_5s>9U; zqipOrF{roJ=iRzShD?(83TGZq0A3GAPd_c>Fq9c?mUahxzn;TT?{BQ2z=p(dvB$Fp zc{PI8uqH<^mg5SJ{vS}WHc#|(w%YJ#;YB%>pd8Xjdri#)?G_G?w6K?sh685HPkA^z zo|u%jTU;SLZX|7k4$Q~LvrZQ!5(PxjgeB1bGVbRHS9HKRwl?82o?|k8Y-~zg!gidKT?u!enfPdmm-SnTk@xfX!rm$)$F~>P&>BhHWfI^#^cnZ@ z)ikZD0DW{YbsSiRZP+`f*cQ{})#8R@j|Ms~dEoC(E}1Ey7!`CJCfu`wwObYyaYlD1 zO`B?c)?kxT!19Z$U^)Z7i5=YALH%-3_oZ)2#FNeIgk`}h1|#BKtlth^zaT+;xglyI zLPzep+X|=1O-AwCo$ndLth)p77q?p%G<iKX*Bkgs}?9r2mecpPn$C6 z?35kezZd6k&VTNm#htHLUL}bQ+Ys?0iJ{?dPlyNL2fgLwpZciZ5M2G%p0TW5lyA8x zE0NS$@oXWPR;f0mfj@Uii#3t1;dK{{<=T7y$9^u@j=!<$y%Br#jZwhiXk5o$Q`RYs zKNs^c%}OM8F{@3T`Oy<*9YclE=hP$gdo4u@+9puV)Z&5oGd&T(mUomN8ciQsxV`Fq zN-rXhcQ}6K>Z1g6cPXi}gghH1^8M@C9<*Z>xL%p&1d%d@?IDaf}<55Q-(* ze0VABEqEM5{p6dSGoLfnl8D;-X7OXB3{p?Of@j+sPK@J$(*`n?jtzo*7jo9m7Wl-x z`Ak3Q#Z#kww)U7<^P@2lXM0ZNWylfM1@s=CG zdY$Zi-etf#X+1$tgG=_&Iv*`1I~I8kNQ^MQFTae2}}HdQ%!yzqeh3&gxvJ8&SEa$u>hf 
zQXC(LJN_>ICdzB-;1317@oICiO9$`J&I;#?m){z>ffl@Fk)g4|&7CRtB|g)REM0Z? z6NmT5@Llpyu((rwOP6u=)XC5v>Re_8H^nWP&Xj&^$V{>xcm5m{ko&EmmNnGv(_ofQ za>lL89%Z@pMbcBttyUV_5f+=Kw?9f$t_)mcu4MchTl2XKcRj#0`F(SDsC;he-6yAH z*7*YQ*aKf03}5=FCpbO8U(tO&#FzO_=aIkpEb+{z4SD@fyA<_5)eLUkO*(|NQp)9} zn`l3VbB+`Ao_Xu>xLhehnD(&O`^=LAUxNa3ld^P5`+bQ~sY0Vf2OgIt)Q=G1-l#C^+KNqIOEF{jXtCloX z53l97)eibNY9?!o9bu5Gj}OQdOBu>^8+-f2xmNAvNl=MFwa&nNA8nL+=9g&2)@%DL zVv!hXTD(OfwGSMt)0nP( z|MFWAD)f#QUAo@M+k*80&2o~UKBW(*S&AHz4r2l?j~-0$uIhWzuCnu#4O8m<@eRT$ zOmbb;oNLsl%zMczCUaPxn)AKyOF8p)5BmU`a4@xhi+Gm@(|J(kuh#*&?dx;KHO?Oo zcq!%%WnRS!Y2P|D8uikwy-WH{ePFJ1x1YC~&rRW7v;TauI)uHXB$m4nV>>jdF(Qegqxcxt7}M$)EdoMiG!)|n)X z*{dGqizh_p5)U`Ll;Of?kUUch3|@USHck%@$hFJZ9Qn9g<5>R61)drqpokrE4y`Bt zn5I#b6pTmxrIB-ko5r%&$lLm)xw>3ef9FOZv(VnT)yURpQTF9!39W-zTS7qT%UfQ3 zPnW{ABQsp>zgVSpg$;HKHr@9JKCM&EG){3X2|lFAwOxvm6((<%GW~E0)~D+^6&MN~ zlf!&Oiu{+*Qw6&pzm9X9PNl6;=8T>3_y6#P{>uwFO_s-^IVTzL9gVz?aNDxNzprr7 ze6o;<*HzfRdD`jj`GvnmzS~qO(#z zYGvh|Vs9PQy&%1-5I312%hIua&LA&6;|FnUvZpKxJbU@xV)_s&Ln!h+DAqtno?5nh za4qjqTVzvad)jf9ZfXUWL77w9_wa5SUNYO=U{!3G|JgyU4Cek zCPhQ|X`!WVSwrRqPtEbOD+iTTs!N$;K6`-&X~otCvm`^@h~Dv!huT)})v%}ZmaepA z&fen^J}WTG_&RjNEgS~|9d%xP zk1(?s#dz0-E)+tzrA6YQGQn|Z?LF~<8N%& zE)O?z-om_qWoltkR9y^ZhmgG2OWgjvRg2!*qkOni(Sd!xeP7p6&8=)#9{V*F2A+e?H%4HI)(3 z$UiKhRnL%R-Jx2yRas+`Zy;0aR;7@S*V6nrz>#OLmwq`kHMeey zc=g6Q>-1KEvL$1aI>*TmiYHnjVwrkkbFt7{*`Q}dq_%BL2#HO*cy zwqz26^dc9iIa0aPHxq^Y-7#v{#Je~H=W$K@r!pSesD)z8?z4t+u_c!DGl$M+wupa{ zxzhKgY0XF6CA!{W!Cmn|sDSk(NSBbT>cZl1F>v>f8~LYMZUwe=&U}--7GY!*fH}rf zvj!A(<`O1<)T*O?O^!J?}JK~pXuH}5(|Aqs|XFenn%ai!e z%zaqd>2Xjvna?K4VT^ivhjC5qyY<2ZAN@c{i}jrsP*WUAxoav>LK_~(F=lMeG8R>= z+Wi)CmitZibODkQ1lD#pSscPb8-5s>M-xW*8v1fjA8(Geovs$xb zc9Yej4+v|-ef)XSZqGu?fm@u^FU`7_vFF$S{-C+GDm^r7Rmkh|YVeU!tHJJ_;3Uz! 
z(qbbT9mrPioc4G9g3;ca412X* z&eUHe{I}e0U4zoaDYJSf>&#ZCXZ4eg2lw9BW*1Gj=zgc3;9WNTfHmgIVewr%&NvWl z(hg&%1U|bkeTqA{m*eF!ok-wSRF+$;t3=(hAj4jc2e^c?`DBoZVV6q|Tl|9xmE-rp zHpnQBJsPNO-1}_(`^NS{p8am|PS9Wp=0*wS6PT>S`A)v>em=L^PZhrhRSd!jyz9%& z%b^(^Y~1YhHeq2gE8hmq-fLK@pxc?nCw|T_x*^r*6@!dN*?Fp62{L+in2IHR(h)_1 zVp+X!$0YUE+f1wD=Ral;9t}oq^MXS_%X**v;Dx=5#TnMSKdIpiRfXgt-|~Yrf_8Ui zQl~ZZ(UXP7%zHWRcg8KOO(w%fdaK{}-?VS+A}T&E+Phl_DDG9_EWFE0b;kt)a(Ao$ z)NpscVayKtWZ%RoF~Px|PxvqzG0MaD((J`Qd3qlA))SrStEiCwvi0;?U^03nR^pYX z<#BJS%!%(-Ep0~>^^X=ZiaFl<92PTl^6zTULlI?dnbQXgzJu3fC*iz~{o~j-4BF7} znDF+gw%8J@*x=sA-rkN~qcyq73f(?xDD_9D@+5-YRzb&58TP(ulq}`LoQ_l5riuJo z+|b(#ri29eh<0t+zJdil^;3Fv+e=B=M;KQni;((Xcy1%YH_Sg<6-r1@+DLZ?Zp~b zmNJ8vcx-jke4MKaqjX~PL({88sX-lWT|<1)oy|j{?4?yhjcn0{J|whzZkc3Xk2)94 z;$I~1OKURwaW!o=oOV*w;XOiK$7SWm)bW>ay|lI5r-umC6%jtrC=S zx5~#=lr5}~zGDZ|0YDri?uiY==ik%lmbyh(#Qf^HwDekh*mC#-buaz8v#a|H7qkkP z8SO3}N-n8s&JMA#Xy6>#g-{C0i`|&#ieckJ^Q) zZu&M}d~4I@VqeCj`($JD<2^C$V{$Z^k43d)I`VY9=?!{fblz_bt#W=jY1?-E@D++p4Fsh5snH+iQ}0 zMPDa>E8=pS%=8(_t`S|c^I)w%?K3)xyj?F z-<b!aSI4+ZWXTxwJ>VFvt-UHJV^rAf!l zi5_;1F;)o=o1)*c+4gT}F{rYX1&NuB)QrYn84yWTT0UfMpluf>HhfZuv!nQDjq4)! z;7bXvZ7CC?Qjvw!^8;83C2U))j?SG_t8Z$4uoX@Zp z$FBE=D(dT!x17kZMXu*~UeeD{p2-}qyZSQv?)2ghzD8O&w{PeTHJm1(%T|qlJbfSF zyhRg|y@st0q18QF=yaCkZIfc5d<}gj_ z*2X10xbo@lyMSB=vjltBFT1CCcZpc}CpNFMj^7kJd^lKTLB!izR*9sn2ry!s)$(t9R;~7M^i=Y`i+7T=I?1TH{zmV3CO7#G9bgS^TGM zt#{+^g5aCixe77asJ_x;*}LzGWLL{v++aS*g5qYg*QwR4=<$wUI~)F7V`phkijARe zg50hIqK`I@u@{C-EU>|3a3PiYDC&5X_w|hJy!gm_T6Oe^VQps2kwprY^|Q?SAU(s~ zch(JKV)7BfPl;bQBm9{~2Apng>G<@i6^ zWiZp29Xwae^%shBUd^W7(ir-|mH4>MQge#0#%9=xf$2%boqIu7v&@YypSWG&RIXy` zAAmo#61qBa&hN0&LfeXtl}5{tZ!v6_Y^Ak4kM40Tyn0p+fBz%0eue-ES+HVds(@?x z79DIIjaT#4a(W%sHY&oyEH2z-l4b7DDSKH@YD4zNKiZKTaCh68`P;koD1e6;>^=z&GQ1Mt?? 
zQ;yRkG%nahrolHr?6lOs`{)If^?uAUZ%T~l9G+pvlkq)2rMycRk8}Iee~J_-#K(p$lcd;$MZeWjoyFvK7k+_+(9^D}6~3UpHTU z{+;L1!HG`K+<@dKeiMRh#JMy#>J#_x2v=&Tm+;laevwid}e?8Ds& zGIzhN<98dqj$FQgy=maM$7+H%S2|t#{!f{Kn`NFnTfO)C@!Vop*!qct&l~rbcKB3R zgT3db)07Y)J>hG|uKkCfc^kI#$8Fj8l;#tdwxaYs#SFvWKHe`7nR_)y|AL*z;ytg( za*Lq3u1aAc9U5GcfsE8dh9E80F5huFivPVhEw2F4`m9>n%HQ-ubkkx_TD-)47x{PY zKa;>`!7~{Xz7nN#Z7StN@3;0mvr5mJ;*BlkI2|vw;)Wf7JCeBV3AWy0!M>~m`mpwA z+zWMl-iT_RDkI1RrnjYMJqvsOt!qRwzVNj@YQ|=lpokr?DG$VlzmCu%3*eTEn3GNu zZP4Re^24cj^~VS78y5;2OQ!Bzyq)thbsw-LD|~%CT>cMdJSG&x1@VhW!mxj=V%Z{fK?1E=_7 zYQw?GA;nmH_u{KG>*uDZZ+Do=tC0S`T_4&+M73;GmW0kES>|UPm`{p}s%+1;7hN4* z^LKybps^tSjbTFTLXX6tL!ZHJt1}6PVc$fB_a{oM_#6vt_~DvactkBR>qpZkz1YTk zY6c;z)YOXTk#~cZ34F0JCeCM_?tCg>e)WDJD%$&mvop2nTT6!*22;^t`h7b?i1~vF zPtC4`WH2s!N8UWc{{G{p zLeW|GlQVnqyY{Rcb#SeLO|dtuYUHG638{;TQ{C)~>>B#f($2B`;oj}D=~TM_WqSA9 zT&{Lu!J!6~xji4-X`4Q{TTfm#zvarH*7!9XwXV3p?QJ02p?g7y?+hS!bv+KS@JQN& z%7-ZChK2uS<#&rg{6y~i_*tE-nBXR_57yK>^Lk!dcL?1a3}FdF#=$bL z`f#`NK~D88yTG=}X`B+v$cox#Y|^ODo0w?ngZk&UWnvCz#_aPAuD*O4b1tpb$-W)+ zbPLC#^-W!J@Pk5k#Gz*U>O@rSR@?-ca#72%FQy6#N%PgQS9nd<8o4Ik@0yMS)s(l-ujFCQ;SowJ6yi- z>*uZxN@%p)63Cn9pVVPeM5C_B2Gczo%P>w(-Iqms`I=&H{$0;UR{Nygo{0^ovVRTj z5LM1Qkg?qKX+tIeUzRkZj4g}4aS<4Sltb7_H4DSpw)U_=R_vr@h{J2H&7uPtn>%wK zcxEyTNdvg6Q>fQ1rc`rJ{``Dlpa0{&*o%{&BYi5a$UMrA@I7WJ9q962Iq8!&W zRx#dN)3w^~ypHB&e2wktuy))5+R)RM&0`nfH*$UDs?~&_Nzb!s+u@a$R=G|FMmJeY z8}{nNQ!mmCb__gz)kiB^A3LDwQA{}M|M(KhU?3;V%kO=g*G&(uVTl`;yBV>l&V%!~& zHQF2_hEEBv6z_)B2e~cvZ=I2&eG(zU)ez7YXzy|Ec4A6%-mDj69RBEonH}qmP zc(gRL78d1VtkYhaDO=yV%DsPKf3m$~Ddynh`VWT^*RET^c5_)AR|GStd&`O|$}n=F z(c=;Rk3o-N@c*a1@BW7?`1+R!2_m8g(IR?Ti54X`ksx~SJvxi%!KxvNXh~Qi$`UQ= zs;igiRuFylURJkQZSgGmeDZxg-@oCRU-os+y?5s9d(NCPcjkigQ1|qF)xpXRAA`0%omDAbW`^^&j}^79)FykX|kfA(hs@?T9msCaLY{&pPV$?`F^^t&d?*eHT6qJ)`OC#FJ?DRd) zM&`JF?!e{&INOTGD{BAM0sr+&R)GA!UGrCWli9;QO0ctAHJ-8$Z}hG?ym@Ri zG{C2Os9^Fi-M&W~7Fm|Y29cD1p5IC$&$gw(m_lvGDd%?XxU!*?t8&B?8klU-PZ3uY z!$7HhB|6fJZVRwK6JrtSs6^SLYI1{zz?-|%#oj_BKgtxHZ8cYWD{tNQ2t}VoJ%zss 
z)sbcz{qlagmuMTwc!J|2scmK6oD0W!_i@2P)W~q#w1>pgnxDxELVw&vl01FGj#0ZF z61A(hQp?6R9ws+PB=!)8m(@Jcp6{@YYPDeDdq+nwP`9}m?!38GV5+&d|13s1C-qiC zZ8D?sp{+%j{GuvtwCbYa1PTIV58yeS;(am8DC8|i)n(F#eJ1Mm=eXolk5Iy@Aw}cl z;27lX>oG#7Z^9#4Vpeu8#4u~@KfS+``9&0LshUD9tI1p}H8adFS(Gwrbi>zb(DShD zJM?~T%c-%q6Ymm6_-%6*)+&`%KQ-x?=9(Z2ZX4!hA^pGK|HI+`lY^Z{`n@aN#m5vy z%VHKJcqawRd$qVLJM$e?+=Q@H>*n-67brkzd9V`ElE5=X;*PAtP7ELQ+j-E4nl--_ z4v4xPl;D1_WzA$n+J7Ana-HG1!ZRTCemS;%LsP}#PNWBMn-?-{s%U~XiiV2TaOZI;&A zJbzTIc%hhRClxmd?%ldF6`tH==L;vk2FuA^GHv~3C!D)t)SDf=uopKs>`^}EX=1S_1;!KCZ5*DXvp8{UDHmRZ zU8Xf>XY^r&o4rT7lZ(zU?t!ows7j+X6k*<%cOWS*UMN6%&;T$Lu9n8mPCeul0?2~w z=0&IK^8yNj(HAWc2GI z!K@DEFy(8r{5W1%j%J*iH9q*B^@nqZ4j+4Yc-(}6@987poI6cRAZd3g`ppj}Z1(d> zY)MvV0J4rG9Scg_5yfr6l<=FPdK0#>08xreK#b`HPOM&(6UpNbaM3&XkBm%ZRlEaW zo>}udDw~XoE*SIzV#N5>9`&O4J*I>Y@uN6W4){`z3gj-FLeXWhfw}}%Oymf%NeFG4 zZ{U(r$VgMe)ZpD9PuvHbyEiN*+3$wGXnnZv>2lEcktMP=G=qsD%zB|QF~B_?R6u1d{CM|VP|ZH0 zZy>zSVV~KME6~6y6lT(qPUuA;dhId}zjx#LJ#QlUC`ZqWr>y&Y(6L2>(#_!R%^0~3 zv5E*jc2pvK>M?GwYncNUWtUk>E@~F}>3De5^oq}eGAIrMLWpXA^73E+-Q^>ov+9?cbZ&)E^$6%eCgk>XzPeR$Dy1NKBp_QbnObxr$OBR0CX-oDkrTDy%;!2VqcpfY>U{)%VM z$*wAR3x=}#4$XQl#&DQL_n^?ka;;()S%Y1^*@!^6WNF*>jfIxM*5CUd*gZsY4Q@_m z9o$am(^@yH7rm=d+4*hvt4J3^_heJNPCRp7&; zeC+WZ_hEQ5G@*}ghg|i|d;Y!zs!YMI(o&TjIyTVZU>4MzLO6^zZS-Fj)9 z)4FP71=$_YK0xn>6&(B9%yiLEVW4M+soOUl{JL+Hv)16bFxtIQVt0FJJFAqB%(!#e zw4?^EQ}c=!zvx?jBE%hRr?@*p(;N-gtP>FjnbrFr%ca4w*nyf_k6K68YlqK21st;> zz`kwy|wfZT-2_5=L4$$oZ=>(5rqH+3uWWMSmt_4{fv7Bu9Jki|DkyVKSUK&o$U|c{8R?+(|cf+a&PJ z=>8$yo^pcRYz`Nu~!un?ju~io)^nXGaAHm>j@3 z6YaQC?0tAaO6q0(n$cW-jfh7uyYqw;3Q3=zQ z$#}z@=vjdX!phb+8U$v%PD#854_c2~@0X(pwC#EZpR}9h zq0?~qQawq>xXIztrV1$_j34&`AvcNbXp2m0bj=MK>y!92CB2ct{{DxG8;J(kx3B-g zNhwiseCztGQHc8c=XXP06W$D2oMFAhzq}PqLAX01_0Ce#V#yepe@jE}8(!ScR#}w3 zGZf~+#H2sNM5|q+R=xQf)G%NiMrQ1O@XXu!-nOb= z7k%v(i+ymACj~1G)3O;ix+gem?`*~%C#IB(dk(JEqd7N(M*0~JEh`@9MMSsPmc1y; z|B3fd2eBOwCJhnXqKKyV>Y#7T{DG|uLpLZ|(X2!4N>)-4Kj{T*vX4THHRGN}&F9qs 
zl%Mic2mQeHK+M3@<6Bu2YbPu^4NE_hGy#6+d``-7;ZfrahU^>Gg*qIzP7@uqk6B0# zg6Rl64TBiIIx+voOfJ3P5hcE-W-D*Q;(Wxrt{zdh!#~iCGTw^38j$`TfNh(GPpu(aPfHBxHix0=)HTP~ zgAQ-}Q==a|*E9NEcULD5kUrJ(TMwn?g~F)!qjOn-N>z`&XBj^i9#58gIH$mfQ_j8* zR#?{8?L`bPDXL~(--O-QL~}qMo)tKG?}|w1ROKp1(AwD3ANO}z0=26!>?M7y8@2kP zbDKGg6In_HJx$1=`HK71DK25f&HH&Jzb)3+HoGhLGPUcO`?_eoyX1(q5^5YqIj4C8 z16+os-+`7GV>qU$i{t8_FK+ zSmgHOAqK3GM_+ z$NBqqC7&~uOeO%!#yMIY7WD1#v$AfvfQOpzDsOlCwc#c0x^lvm5{hqw#zK?HKP`Vk z37JDzb_7Oye!uUS*t{}rU`aK_u7qxrM!y8*&@gv+bFvEzOXoBDrjH&c8;~^QPN{rY zw5L;5C->;;q{h{A}dC@at8C+5aP4fUxTkXKT_Os2aUU)a~! zv8?po-EIz>Br@I3qM+gmcE)#a!;F>(XODJmQOrGhReSxrSb;%}*O&a0&8fxtcb^>0 zx9l#q*gp}D#0(?s8qYy54-SxTmN`D|XiD9ijpeeX`!tawy+z zZZH2_Cf*cl7IKeQ8cMuTav`c-(o-&QcVw0CDcTW@6WVS5BIwq@nNE>n%KUrgdbF*F zRE;42=1z~9EVpSM#Hl%KV-Srq(>&{wEzz! zv*(<-sRKu?@>9%=3a&8Jv1#V}BX6HjtXizY5tel;TeijKhtVxDW_oyvoiM9CFuLEs z`)f&TS5HtM%qgeW=6J z_;Tj_Fa2I;268{_2;bsXXaC1pR7uX4r$PBAiEK@3t82dP`l9fxnn*LY0u?2&UX5#7 zVoL3X--vOqjZuiauoE4}d*jE>4qZK=H`PGFkOUfQ4izdd=y?uSzgKC+lG$1H9bwk% z@8=~02Cu_E>n8?p^0DyespTcMj0{gCn8oJAIV6JXl;Gs}I@SnB=6=XLknq5Y&+fGva zG(M8~mk4i~Ps{f#BdtZ7v+SgTw~uzXCd<#Kl&DAxXmP#`4lH~lqkO!XTmL)*f-Mro z4A&h;$4MWw<%lCE89C!e&yifC1QL|^00#BCX9*<3{?;^!lxND^iel77fA*bB^|#@O zMJ$qd#vc7~IZ||)TwHPx76aKTxxGqZpRJ>D*7jW-WPx%ePC@jDH~+V)QA$=llXIkkdB5J9LTXl2-r zuI($)W3Q-OUKwktJO|DhHy*g0mt@aO1W%U8Sbj*w0?D9&1;jjG*}i6!t5=EZCr_w- zjnoTzp69e?7jIZAI%}j-ozXucvw4)p)HIU*D!D>#H^mBN>A1di6WV$7q8|C-mZ^-! 
zK+kHsgLSz{px3U=DZ$bEe?nFGq|9JRWt*mE3w!cLf^ofg=T0O0uU^F{s+6w_zXsdr z57I&h)%23^NX}VtTOjuZRZOLM;`m?m|Jp|{0Kss&X>K=)yjD#EGl{))5m#vf?S=wFIuMCFt@1Z=Clq zUIeUtB11RL@GnAgob=W|yYXtX7hrP1xYLlv{tF@K&iX4~%Bw~{gNdr{5ebrbve>xq zA!@g7&rv6a*t$E*`&krr65z@I2-8+M{E;1AXK8|ZfJuA@<{hWu@vwA^A|ak4o?@i^2hRT%H2UJ4SjgW&tXxnt9gfv7QNC7O7`~2sAB~ z5?2%t&b3GHlLDQ;3lHy)vff${nEPqrAk5F(^;0{ywR%8NVQ5U!X$r>kiJkJ(jOanp zt@0DMnAv=iDaKmvGr_7{O~crF9^^(hslWt#0R;%s#l%wePR`X-=D7~Z=U(2&)pqkn@2Q>7mLZ^V=Z?5V)pRDz?V!i+CQogr}`ccUA8#cUXRl~IBn$DNA;Pw4@ zw@Q!RY1}Ze@4E0G0h8sD+foAm8;q_VfY_9a;JsrZ&S>rPkZ5LFaU32sRtsIlOZ-aj zVKRRMwTBAFiuP#+_PQb9oVBOnapz}wKdeYA+J|Z%{Mm}B6+5u5aDi-(Mrd;vc9~N1 zouVtVO`3Ur=_VFD<#@+^sXH95VOAf2h>cb=#^FmhSRb16yVVXps9h4R>56zfz91v6 zdu5R-vpaiERK$ELMCd;d#wWZN|5TfxUg)j({c^0pfG_BBYaRz3{tNK?1=glLh`{%`RIC8nk+AZIs0@kWqPgt;@5e+)`sYcO zUjn%fNF&uhPVN3b3{!~Zya@?Re#B53=AN<)*glZK4cni>4 z3_Vu>6;NWo$)`tA=;Ay}D~_Z9WyO3nG>8q>{_QDK4gIfe_83E-boJzyGR*kygraG`-P|Icxt9;o zs8t5+Inr3wabL~#FvefZXE2p*^fE*_u zstDkiX!lV$g7h{<0Se`%LgwoW94b_t{0-UTYBlhfCBuVawM{yx@TH z9TPa0z^D5Wvy2UJ97+f%jf;QiL^ysh?{Zh) z!MQ9E`cY<5IW^h&sA597%p*~R?p2#HXWXec$P{Oqy;29PyVnzBd@6zLOW{j>$eIIT zd&XBK^mG-w%7!m|=$(|vPubEd*{>8$#~Po%r16Q>>j{qS5Q&lr%C+*ims9oiBWDf{ zTnSc=ooTna-fnGF`+v4Q2)msa&J57S2~esi&nep4>B{rVHeqKdD3k)Er_~UA^ADw* zSfSM+E5EX#D%5H&o zJTy!NSjqK!>JzQp_q6&b5E$%n)46=ILTJ|u58kpG!^%4$lZl_ zJBsLPe#2laLTf?G7C+yex_gS6n>?qmne!Ik&v+bo6RsMhqjLcwB5^RLz-?YC`bgV= zpF?tz(qEDh0Y_oS5I2;p2ShU^z1-GSj#3QQi%jwvJy%hlmD-s}yRNe~O&?BUIbXA3 zo?GGCDMn){WJ z%vhax11w;C9fTth6&QiI1KXd$p7a7ul{f#Z%73O8>6fD3idL-D$IkZlw5UYeoG$Bc zag0l+^u-MOlQ3s+|4QCMKKaJTojY~!2&dlbj9ato*A7kNY9vwOGGiPo*=3fel$mdN zfk_#=b8X;&Y~E0ML4p$3w~wJvCZ0cGB1#D*x2tAI_N%lIuZt)VXZJ;6(+#)~4^_H! 
zFlge|^0o7Q{(Qi(5JLaW=H)z+p@Qh4tq}DSfs_*YPEyaK?5mkf(kj4%yeEUX`GJjh zTGnjc1k1Ml79|QHM)fN+l4^Hs)9JwqN%V@#IT#Kt;Hok&Z-@|+W7Mb5@%OW|;qnXk z`Cx-hB#`VU&QLye+Ta@Z5FIQQ0#w__7spn}CGt290%>8-iG>Ut?^#jk1bhFT|~BV$z--#r}CKSH#mOHl57g?++7QW4YzU0rl9 zu}W^H(Jyhb=`vc~9Dx=S>SEQ2nojZlp>DC)`K4$7-ndjU0TQla@ub~$EvX)0rzCjz z2kB}q^hJ87WH1Wtc1;#lFjaQfow|=4A#{l1ww$k576JxHjF20n5GrCfn_;QKxKMg2 zl0X1p{m8R0TiyA;Ge(!-?wNe`PIxRV=J-u>D*?#Hf4?0NrQhS=_O-Qy+Bq#x>dfu; z=z}YRz9_2ut^^fPK|Ero)P4XZ$$zNfy?uV3`C0Cl!vsA-iJ5JK5V0qJI<}?7BfBIo ztDRlW;Wyri5mD95GiY`mp|?hiz-@^K!l<>w-jJr1$L!?NWqn--cMpHkiLnH`-Q~#> zh{;i$&M}kI%2P?7%nQ=}&m>S%5iY}03B7(*D5)>KH=rM6&aJyH;IZC!Xg_9$H%;_U zVKR>1WM~*YD2qtlQ6B;DrTC%B{GF6A;6Of~l;u{C^b?T4AqS1SmZ`(n>4LyTkBpjR zc~{B&t@!~&7Fd3T&1UnN$dRnX9~xld<;dYaskUBJzq9pVpKGkAARpQ*7CGx!~G6{7v9fC`;91#)uR%ba^Zo_!@T z2y2kk-P(Jn;r_M=AYJ=f;M8+%b!_|WY}43NOZ!E}*3+k2fgc}G>zvD)9LwT>@?MDS z-N>K#e!@;QRp8nEFm>#D>%oe_SvEevr%UtEYWX9ng7;tO6&WcHSoe6%=^)h~eej zpy^sMT3@X(M$rA^8bU9LwE&MhvrmK634^Y^HXDn85%YV~D?RrEHM-4(K8=WWxwr;i zK^iV0O?3wye0@kA%nuIzQ@eE+7@r5ZkX^R(@pN{0`WP&I0T!k^Xw)5E!9wi}V%c;|w1S@OWQ7q7C@o|B7-K$VU@+ zj_QaY0M=h^r&51YtB>8yQM<4#ndKb0YMY|5(SR=i+tomJm~{90)oU1mnLeO@Y31=b zcjy(N=?sQzUXknKS_Cc&7OoIG(?$TlsGsjwVnvxR3D4%eNh)6+59UAxXuMf(Uy^29 z_SW0xZL?#6XLGMz_fGHLx}@&(A1}S$wN>Q`JUM>73uC=|_2|1jmx#6t>=#^cD4GAzHt!OmjL|}XS?^p{ z*QCN#8^h)IO_T8QN-$stXvSE=y)+a0;_@1gIbP>!Iuc-Q z(~NiZG!YlA5?69{WQVMYxHw-(`^11 zyq#-DQ$WQV7YF!zUD3c(KklVK(wwV1;6v<0lm~kJ@aS>iK1thYaM%^rJCBZV)C)u0 z(VyReN7Zp?z_GaF*dq?YN*KC|SB5?T>m!A#CSqgd^_#_cqc5(0Fj@B{T zX(syhZeU6rpN2u~*LgN); z_Z|ck$}YN`V@u5D{o;M1& literal 47137 zcmdqJ2UJtt*7qCrQG7&1ks=_V6j5nP@2E%<0Trbesi7qFUR4AH1Ox=61?eKaCDfqO zYv_;!2)!6Ws0l4_H+s%_&pqcn;c(pudF@GfBt5|o@lDjoMSl$ z0)c4MR3APAflf7nKqq(3o&k=SHu9_hPbXZTsXPD`_g-BFUi<;RuW=s)DvPE*Fgp#r z{?kcS-xUO64Fp2K)gIn|?q#wvao1%-di2|R6N0+E8#<87)iP*(`n;|7 z80+VtHw;h4jKx7p23K@Gc-PVQM$Ct~$)sj`%q7H> zVW^*!n!J<%eqq!uW1lT4bo~Jc^h!j9^~|@Ly&>icydyQ6#=%x*bbFZZhBwj!v=I5% z!?)s*k{RVAzVB26We0Gu8XWCRFi)vKL}u|ji6|!E|FiT8T0Mi%`o7>|LRuKy^9B;` 
zd;5oUKGNe^7fZWVf5-`^Vx2TInY%-NMd$MnbOO@FL5RLP@liXGA<8e$34=h@kdb8X z4Mpbh275#B6h?+|l6Xd#K%P|vy?UhQ9o^M2l>wkcRu4*A1i^v;U76@9nmK%;xt zdu8LuT+IVa-ldUwOaJFp?a$2-4svQa>x9nA$RSLvM5AxJ&!(+$ZxeS~NA1KC(T5iY zZz+!2-_vIis)VK{%GlM~BFi}VY3fLZUt%`1uy7&BTbaq^ZU2iuH{48XDH$BOIJ8lt zzf5Bz|J-O@hept&fvat&KvVUyKfOz^Wx1M$IDK{A5r>iaT#+GNF0zSMV0v%#s%+w8 z@`Rs`+M&_@3+xwFqs?LO$+^+p7rt#sE2eu6+Z8ce*(24aE$%qn4kgp32~>TBlo!wV zl-`@}VZV7{iF}If7|HuH#~-kVFs6qQNdd(Wv{@Oz~9?qEI4 z%oY4fVsp-2ED8ilrIyiRkkYMlole++%a+a?_693qltY^ZTwoGP7#R|Eo?Tt`@r7|0 zi>%x(mP^yHh<}1SDN;bG`9B}$#y#GCb3wI0Q54WHCu9v*o#vgcIV$lBS>}9QP&D1*h=ty0S&RPJ3gt-sc$rk0e{Q4RX3#HzcA&7gzJU^aBG$7WB2d_;F-6H+06ka?>p}|?pzPn=9Ijb!TO-bcg`LfQf4EiR;YUjK7)H)r;4_QR) zhX_q1Y4Ao-H4|_#o&uZM7FJ~hAx%A5O0xi;q5h(`g+R&yDJe{DLLW8XR-6dzjRR^vnyEE~~n|J#l;-=joAV z=7~}kK!TedC^kK2_s+a_e0<7M&^%f6cm&FGBMoN%`eR8kX~0PWPM|YGHi*jie0KOt@|E?vwtd}n&@v*=^~VvZxIWkh4@+Ya7zhA?Un=O)kz*@^dmcsHx zq)^jzkUIH^;t|0Ce|Vb?a8VzFNEZlfpgRSlYPGV~Qj z=u8^hdWLdtTbH{bYy&w7_tj08nJ%-*eb-($nN2#Mjx6e*UqkW2- zB{}alrY0n%KV|hM>(d@l(Z}-!aqDzKPn2dD&9X|Cnd>DibH6ekP<)~SI zYctpM;mVi~0q|Xa)4)5F?f{{xp^}owaywnF~=6oJG4j1!dI== zSI@+VI#gvIw7M};tRY;IbJ}<8eLZL7-(Q9w8 zh`stg{)GaqpdJaC4X)F`E|H9>qOxg;x7W^`?u+!GVT^gaZ8H2lvXU*5jA5v^v>h<;av@ zJ_gK{mtJ)^JWmmm$(1-7wpw3kb(&ZgUzVTnI4ClkRxmAzdaZHUl_rAYO4%9E=d&1@ z#7NSGSK9jc^N!{B+cRMN%HJO+2n|}=-t|BQKV~hz;x_(go5rpFsVZr;_p@Unv{^QA zgGK8x+57SL@mf|{LHI)Ht(AbJR`>dQ3}L<(R=2Khj#FDY+**-Szxc=AKdI0a1*DBFfeui^_eXn z7#nn1f#v3j>6iOK)WjXsO`$IF@hkx4Frl zgdbrkm`%3X`-TTtU5%27(HJeo1GjHIc_$OB3T3$bhC2P39ZPbWDQd5K_bA-Cs^h}l zwHnJ>j$4bP9CKz``SnSp9CR9wI$crVKwV>E-M43XRK{8kWU`#UBX10UgGaQ6o_hMH zBsOY8$2+y7A{wM-VCDg=|Je1}(b3WEUb9M~tiCbe=;h1$!DWj|{pJfZa{{8x&J8b& zP?wEWKKQXj%tnsR@LOAueb@J(32OYta6W!fe{X7I?|=!$lNu#OM|$ua1M1=;5gOPT zjfYq7s(1zkuoI~wDn6m*$BI?Q6|38B|MYwh3%pL>jNtV0^D+xT-r6%|HNv7)WE zM}Ai!yLy3qSMWaWM|t=vi!uQs2()Gh7)6M>?JvtX{Gy_FZFo37!Rpu*fWDd$PgVXJ z?L;h{KXp919ol>6`1!NT|Hc8ZJ=m-GdSP$}-&FaBKxc>cxNqK>keS#RU@Ul7%f;%- 
z@hP52xrO}lQSpl4-v{J4I5_$_!#USmV~`Wf)~{^}KmAcrMNe&S8KL32EdfA@!}MdaUWv6krkS2O*$ssFbQyjHlKYX4;z zNZ5cA?*!Qpr)1zBAp(56RZlFX!kCQi@LdxVxQ1kirAEU#1_oA4pC_X_OgkH2ycm2_ zm%7tYw>e{fA}!QoK<=tHoupQ!X8;o+jW|8-vTH*;K&I)2IIg7c1p;xgfwO*a9rJ^u zgWDAj+Lnw;d$->)8NKqD7?2&(%~P+iTH2DFr%q$|ad4MEc^Nfad{BVawL7$grVL(O zH05huT>RA8vJo&k@9*z?zwkcOmP5|uDz|!X*FEYbs8-j4QMAg5x()7EI3?dja^K{x zJVvI{v-4RzNpsqEK*w3hz}b6JFEjW|kDsjCW zZYQbQUzG7MZ*yf-t+0>pz~!KIGtCU`)06mHCk+l8SRc+vz^IvgQgw*VoU}fW-tBB_ z?{-#&_PMgWlERu3Q$m0pPx|HXQvn0AIGs&CZ+LoW@R9O@!8ey5HIV~WSq0*3*6bJ? zW4Og8SE;%{YJkHZjx&bi1m099^YwCftWBZ-R(wXnbnK>nb$E)F;vrel*r7xqO;DiY z=Inwf`+hDZp=4Qo)V8rPHITihvn0^m*-gJba=pAHS9MNBx=|$UOO6@6o8_S;_}#<> zG~9xeB*m?85R#j)HzoI{k$Pp~>=oVFsy!IX@CNY=N>YG+-(nT!e_ z;$bVoud0X}?0w`v@r0++a|V|O#1)~WUgs5r4KXzgFiAFpj}@=o->{QGv8wbxP&Ci@ z&Wg$`TjU{{zb^hEU})L3^5Tw(U%36lQG6H;1`S@%%j65Asv7aGm5oX~Y6zy;M}Iag z=bk**PxuqU7U7jLu)Q$LyzehJCkV@#7?}?+93<9=mb67n@a_9kyvc^OGnO6FhIu(n znrt?*iP`+p4(ed+% zkj6QE7VFh&tMs}pJ6Km9!bV=EQn16-7QK$Xp^nz|ru&AH$%YI}hA~KinGJafX-%6= zcOm;WL_Z1}S&G00EErhDGX&aZW9lQI*Q!Px#^p^_NfQ8 zVu(H00pG?D^K3B(u$E|%kpQ@H-EDStLxY&rxFbTzx$caOy*{Faaj1 zD;=MWfRmP*lgEz3hLAYPtt*Zde8iPUZSto6V4QrZ8ziAbv1`FoFWDpB-c?mzAQ`pd z#b%H=#~^jr;E^C<4)px{b*sUjY#13VK^n}g4!9ix7AXKM*oda@A|434!qDr|z(w!l z329k*YWT3JK{?U1mIo}+;_BC7oonUv%aOUy3MPsLTNay^kk>cCDWS+>?>r^GMTL|~ zc3f1$fGm#PGSzpoNRJ0>tiz4Mt0TqL-(xI863lkwW}P<#odJ z16<7?>CHih2Y^@BBWU99V&SN=-Dx^Dm-ku?fe{-fI8RNKEv2AqAHTTau8KZj)EJbH zFf%+@#6!}T4^tqIPv1^PMHQ*n^r^>Wjh@7Qy1dP<#~2j650M;7=za14*Qt^a9mts;Ia&Dq z%;)gq2zK`zA?*qwjSaYiBI7(lGy>+bi%NW9BMz*zpCp5)if;*N7svk4RTCiO;`J^585542 zxq8=kF*?m%OcNslxkBo@#D&GK#7KE$-Na9iwfjcMa+MtJ-LJr2t`+Q8e%Jv$3j)6< zMP)Qp&=AtDVq^^aC#5)e3FooX_1nBIOk+H|l7m>A)WH4Qn_({?=o5&%Y_#?)(&y`x z;BsYRPhO6Idkbj`(TA8%R{?2toEts3#b@NI5hAzSlgpDDyWqY&_UFt~)6|_m?&8H~ z8?TES1Y*qfu{t^dlBcn)whA9tyC2mPwk4UO>+O?x<(^wv6fsxg)g!RieE9xzlY50%C ziLpE_8|b_4VZIVR`=XKO?uqGX+&?yB@|B+Em7egd23O^9`FmZfEsX0c$@?RBCcPdc zl~JX7f8RTGb7dL2n^y}hmS_mWIxO`D3I%diQ^MsK1dilTgTyES&D0QM9L|XSy?{U3 
zXGPf;>(ftpNhJ8%vWVe-{%q?Yg0WVOP^_gyA$I?uXTf9Bo5fKCQm3YHBx8*WkbIb& zD>@e_Lg8=hRfG#@fLRtl_%DnUbs;-xab~t-@qy5~m5e`jalHdCZ4VCE$U}-klQ*an z+Q7A!b_t?NQDr%^ef4pe$Dui%uIW(r#h%h#P?#heetFWbb85j^J{uAc%9akLtu4i| z;vK0hG)fi{w>&-N+$&Y(yGTu|wcIF@>)~sD)3PCB%qgB%x5sqFCGg0ed%*f!Af%yQ zlX`(HCN(tKJ*1?#*v-@uLO)f*_(-BWCe*9>ay9)NuFIM$)5UOb7R*((=y?F)ugciv z(=-_s*gtn1JbctJ65b$M0ZYuT8GyP2@(Y#gFJE3)F^Re&_dVV48fB8V~Nbj1Fq8@de?l-IJM>FFecwcWm zL^tzr0b?U`*j(pd#GpR%^75HV^hz_4T>f<;-6#9??*3_QIGb&DmlXS{S>uv~jB)7OM2L8(~J))(%u!uOuuk8r+;%(liyD5T1JW zAHq~&4VL5U`}Z(a%8>Z`(x-%G|7xcH@51L<^=Tu;qTG@#pf>Q0 zxLKasOuZ%`R%C2%P^0Nr@UjcsR+8I>3eaBgxk|oCM+UASCRs>K=g)rw- zPFq!DbJ)1I)lTbS{?Vd|ZVNKRfh8*cUY3LQ8~u;J4HzA`c^Yf?4P+6 zseIFO=`YQEG}m2R66|vt2RnU8;)fm6`7u zS;U2hug43#!i^4<%U<=!nHau&pW+tDQox02Lh&Y!l5@su4oUN(pFz)=2y9b+w(CPK zWAVB8ZcJUZu?V}{ypl|ovu>5q;)js*2jJ>=DPAAB6f#*tNP!A4={vN#+6*2VU6p(~ zVhXU_S)`+OSb})|OXDw>HnZSxWX<6Syc^-4hRx%p4@=SXEHAb9-Pr#D4P2GEj2qbZ z#*TZs*nV2_&`4M(IUGi5^q1Ne+Zve;$|PxVSKhi}I6`wE+LI}qj8vV)L@Npe6)b<{xRQ1K_m2o;pLQ7hbY{$fUL>q(FTNC_a`XUFzd!s zK^jNTQ%R)#Ri=>s$JvunTm{O3oN4&{R>SJ*ri`5V^blRL)Ed^L(#YE&;OCJ+2OX$d zw$c+wYW&bsz5WUxtMfCGoP(^@)To|~e~X~$gE0H;MU@OVN0HypT*~R z9`t=g#1Mk2?_l5!KjH37@aS^UG5L#FLByhh{UO-4i=<>Y>q&Uq{PB6}>wHp(A#7p7 zDC-PJ@B|@kwr}#gmKs=KT->hQroo|s*6YhW51W%dO}*P0gAw*^ZQTLn{0C7ZVd*AA zJ@J}YYuV*K>BmT(9AC~3@!FuGF;X9rSgh1EROLVcsePoKYTlfzezr^_m zDV!7(g9E6VztS@bP;eBi#O%#7=3RZ_#viPSv3k`cj$mF5ltpPY<$M-7IL!)GQ@ID} zRF@9^@>r)uxxd@09-4u!$`=`anfKld(SRXk$ki{I-ONalcgHMq+ zmlSF})K_m@H=@o@E5XvJO3KHzL<5kB%D(p*+m`0%*5%SbUH01}J^UM@UkTq;Vgbqj z(njZR$>~N!nZBd*kQsJ`sV1{iqy<13NEboj&?&__#w|v8A@I@Rr-?%HZm>C?h zbDUy^hxojgvn^fF+B%5U(6Z4VM;3|5EO(2|WI5uD1D75EC+?Frc94et$x5!hyZUi% zO#VZDq+hWXYkACaskIiga?5)E`wSJAvC5{P{z<>|`Sz{+^ia(c=Ha3s$Z=bdqMr`fhKTvIbQR*M-g6qgJyXsX7 z$?)zSNg_}iw&{&By$7=>?9*Uru_XOY`TS$c9OY-gE)f*3*0q^3?iuSRH7L8w(A5T; zM+@LF4`F}wYQN}OSg7$e{M>Mv3L~>aBhtfbB1qKR4aeR{&KK7+Iii>7Kw$M8q_P{d zxoPs%yW@UKEz9E8kmkPy9yiI=Cq6$XeTzVZlz8{T%55yZwjF!)zj=49tcIY*6}nBp 
zq_5*4&)O)1Myn_5sHc5-fjHuoE8y)U6%0$xpRy`{nuSfCz+OD+_(O!cei*j`C{xvL zm7cFQRsJRvNj;AHUyeN>K*)?!+9VJ&1$fs_PQCdMc$C}mEL&$&Y)C%%iK5?JG#F2& z^cA5-OH;>iA>$4X&P`zvo~rU(S|=C7%EQ!^4;y5au(UT)yK@%6g!K7lE%{worLh=o z5zTf|K%wn0PBTDaYa6k+Kj;?*1X1zRtowO~>Ao~8<>>or)Sd#mUN;Q93RQPmq>K}U z+IOoN3!gBC)zR#d9a(%ipVAivzhvY|XGd+6ZTT}_;24(+-*4?Mk2MRo1>loSE84pft)#qtTU2y4nA-)Q$|ePstw=dKHUEg-cr47uc~y znm|sreq0pes%MD|we7g3LRF-=u@UTGXQz)~C^2|Z^+WwhQgc) z=N2EChb=0>mNtz1xZqJq^L2p2>ph3FTNlmhK3R|#jYtZNoQIVym{Gnp;4IU96Hwwv z1kLZ*l3vM|3TLH)FCYa~$r_npomU#WFq#%xnfFB2;Ns6ehgG6xKUzX|k>aB~3u>#( ztsUs4Ws9anB^7v-NtNjPH&NZ`tFxzl>BAy^mc?`dITy>QOo%w`ORf@$EQD6pCmI^< znh_4~Fraw!0Wf}6-UF>_i^`i1DpL5R*t!rdxJ1&+LZP1y=v)WwY`MxL&vb>rKn;_F zOKu^~&!+uuX{R{U+Nk+!oq*Zisw12>)vUUsG?QsyQu%sazlwPIh z+T|bM7~LLlf9Cuu8W6zf;%OBX|3dn~eiq2e^LJfUUdHsp&!27}Lvb&hUGH%b0k z_h!V0U~6XFzE#T=kd?vHcH{FZ>2uX#!~3eohOh{`(r|RH>XAL00$^lcaRLrn zr5@R3U&7ne;w3hPD~vEwsI)rFW*gwr6C_N845XpsCkTUCh%(|frO*AX*7{npE1=Nw znoY!PO$M3O-IAzKy+JYbZ}hbs44?MuOV}ZOqVUFYIzOe!`@{)7bX`0pQQz3=QVb^3 zNfghX25OJMozLOCnO^FdzY)vRT~P?#D@Ut@e?6GggSET^6Px z>sbSFIqPz5b;X>fx22nQ82R`}AM0XtMygsVKco9n+$<|}>W1GiADQj$D{HK(WM!AO z26q&9d(TplrGe>uZQGc*m8+V^U-Ah2#e1iJ9)S$GzepHVIqRBXivie>!Zeh`QFk8iudRZR^8L!1@2DKd7 z+a0U!!^1B&#I31T^l9}ZY6o?7uDViWTHXQ!kZE^oID>L6W}=xq0^T6R_&(oS@TBG+Y(DZb@15p86lD?MJagC11A?J zsT;qh=Pe27zo7@d`hKGU`i@{dJpx%SZpgBQyUpEb|0T8*$wRcn@qZ@i{2D9t3~+-3m3*d&^Qd1LDkAqiQp`P74y zxG8P+J!8pvjb)MICmDfp}X%#&t& z;0- z2TA1F=p8r-kGfk8-Xdv4J826YI5BwA?bY(bdTXT`xKhD`Mgn7tu8|x^KWi1?G{a@~ zmFo)R@5_3cS2awtor`D`K|4*yqT79Uea-Qa$z{97YPm|jR%aDSB1O!5U8RqC@`u&? zeO_D78QVWi@agrBRXTVV^ZL_O7kz4zTkp6fk<&x?5e7O@Ay5gcj<|c7En_VjO$+oi z5l3q!`S|d>EQ4y+hXd9AgVNsqJ`caRZvk0N|C9p*nc#43FaW2qaz$Ch*vyxKyjNDz zm|o=hd3x99+U3TQyCdZF>(~&x()G^K^&q$i`|Rnk(M4~UsvWmV=*>x++uMOfmVbf{ zxURp{^>kPVCty}Bog5a$@**Q^sHbIGl3 z@{$j)icRE{LAF|Rh26Lc)}qr*%(S0u;wx=gVeZNFI%-^NOESzhIEPc`3(fx*Ie>rHA5VE7FW5Zipp#9G(2

_tkgLXYrYJoN(Bx#x9vW5NS_slF4mpylB<%CfVN9>WaUR%TSFc$TW6oP z=o%UxQNRYGE2$%}gVj|Sf?e9~omxjfOq&&yRZyENwQFrvBcv(g{6zci6LzaLTMX7y zuebb`;*r7Qm@{jL{>iZZ$?uM3-#_}=QaV9H83eXEhY7Oc=%vY2{dFc_AG{>LvFZKL zO0juyUH`*mQS0Uw9*Xp2uy_DN4OWu1YtTM*qAt+DFW*NRw_> zQoM{a+`7QsA;fw#_q0{#n7Gl6uGss;r-t^P9kHLp9%y<;q==b+;-r}>$@;ZcrkXM&;Gm| z;YMYVALtWq%(WZXeMZq#j*>R-Th48vEwnKhVwB9kT4wvm&ll?yb2vB~KtR)%yW~Xx zNzXlozDYG8iW4eNo!#Er5eM9*!cqY#N)EJIHr?ga7rV!i!jh%}mTVKwwh4i>qL@sx zwx6zkr(0}d?|T_|QzqbN2KHo|u+yE|BR7VgucTv?YCNYhOpee;JDQsu!~)pq<^uyvF=1O|3_pbVs^=)Z_F$W`*yY_Q#*!XpPA2<4VeffH!OwDFGY^ zvl$DB_jif_?lYo7(dUac!gKt6SR3*@(`u|!x#%RI5l0_z`uq7kOtEG;`Dd)|@Wtjp z<^YDRtCXj@bi%;9_c!C`PxD1YB(dQM{8ZVyF1g+tdR0 zLyY|!_w|pO`hTl1{ksfbQ6&Nid?M=>_+;GVL}KtkgsUQu8D6%!ocJCXyM}qk^8E!% z_H!4=c>y*SpscnCy?3Y|TC{L~f=0%Lw2Y~n(@7|e>oGfbG&WZ`X*x$NsDTuMHcvYX z7}@<@t8MI?%>TV)gUMxvuyp$jw=5a|VYjYV>i;PS5w$(mW&eW!4QcqDAHHgQ|5r2p zcMPs(tat6`klZ*oY;~<+p7Wm`wUt{ZK_6zc_!p zAkeMizXKZo|Cxp$>G!&T+A{wNPE4)9pCA2EGh`>mA$GN`~S~W;>mxd68Qu{Ajzx8g-Os$0M@G( zOm3fA?RlZhc!q^NhfDZpOU&)h`bIPOXDE;iQTy_{de2583Kn@7qkoI|#B8%a} z79^F=oCLjU0*bW#lP6vofp?VfDS+OGZ>qCYGZLVwdO+uRRcP#@CL}z%kwu@zpQaYV zJ`#JySqWj{BQ(-jBK>VWJ}>dwXg5Tn=FOOIyn^|CL35g4a+Wf zH#O#-57n=)yU!hISag8Ads;crq?lu$PM~}n)4sa7eUC2_*{X*N5w5RG_i``Ugrpp2 z(%4To`5Z1mjqlUb@O)A-w_n=NIKK*;C=Xt(%gc}KZjG;?um&C?F^$Jf20ktb3TvCI zw<0TTKmP|G-=&0{kiDQ;C2k8;9LXt$e>l$QmndEAI6WeD6+>SZ$Rh730sycNee-~< z0MLvY?S`4*;oywZl-=5_U<60>OZ3^q$oJ4x{zCKB_9B>2p>U~VYi&=hjx3(xZGPQN zec6=c*+>q{-2fx2sb`y#{pa%HipR@zjf+0y7QG%Z)>?l6$pa@%Pig+_P{R>q3YK_a zx7`b_2Z1_Yg4r=L3_#-cjoqHMFPv{Q_hC~8>vtA@&ZM6mKO99Xm3wn{w_u9{ z)udQYJS5?1g*}&PQeSTLTrLe-4J=yK51$>Q&zrRQimx>;68Vk1>n;X`oA6_rNWmSA><5U(FK-nkr zF94gL^q-hSj{jm3FaKvI@u3x00KPDMbA|oWF_Re2K)yP@EVWR>)IR=f%s1iM*{y~g zPvkbYMT*1DlIDBBtC0((n{>h!IsPcq`dszBOO%~? 
zf8ba39cCl0GQYBZwyn(f{z?Exk^--KKq&O7P-)vyNnVq(i{WI!$Qut+RlqL%ViGIv z-}x6Nap0es#AnA$;@x!|&F2eAW;a z-M$bsa)vBC64&8lMc8C72kbutH5#`S4GY1V2AR2PX%3fG!<*`C{dQD zS-*TFp1!4znvg+NZj<*q1ac^F&J%1$l-b#`)?0F+lO_MTCtBRcX|}hB<q1rKcBkE{ZD+NQl zDVqfVj2oj*zg%$5#OVaaU48S3X_{b9LdM|iM4k(Y z(dPLdUqd%tALm$EeZ_ZvL{?uLd2-$d@yyXPmS@rG^XVgao>f3`Th^=eUi>#x#ovS? z(s(z~qhmMA*oFO^RvbK>Zy>yDvz1Oj)Mi1w$=?3D*{q308u&y(c?Ns+I_JRbf*S6^ z(`OwI4T8qXF3Z3w?j;)+Jbv<#ks9Fii-xVsulU-}&p2ER!+8lBB~4ZOL|^vqvv4iu zRqs-q+_`@vTF&39=L)DXjMQtD3FMW=E{8UWsiAR`A`pfrT71k&_9wdjToiwl7+H*Q z`{Zqt<&KW(iT);7<+N9GsXoln8IkVvBdn!s1e#?GhhX2~{LHc}r*AvR)9m7C-D?`) zZnc%%S?^N@y;f34hR9qY<2=ft%b~X@U*lm|OTX2+cxx9jnKOoC`JmS=Ga=+@U(9Y? z+I0^ZOKcPxyzt>*Em5>e+*Y8LZW8&kZhj;ml2B(~CF$reNq)vnyy`OkzNO;Q>~Fot zHcK$Fv8MNVz;jB0?5CW&hKu=~2X|Kx_5II(9u@xK2OmOF?y%k`yQgcor3Z^kH^f9` z?jls#hWn+r0xYviwSdC#;yaS&!G5KuU`2ys9ruMw^_iV`$3(yXN&>Oolak?#EOgW~ zlU|IGazfrr)NTw8Xih`xk~ix!+D2TpCR&ibUikI@~8 zXI=QdjEKMPxz1033zD(P0jL~rxqy0oG!c)g!woR-s*j+h1=wIr zOIpp&E7;cr)c7G>7gofJZnH3Uo+4Q==H1jlVi(~dJ)#&7$K2gMC|@~hWs`Gv`U>3N zbRJi(2Qzp-TVHppF{dgTCQ^vW-Xo1geaP9Lb(=bUd$TjzxM2vci!=Pg=Z- zkYRNw!>>{IliF4FP$)T&0HBI0^9)kc$EafVsnj^d&JV-10Jnv8cetA%4)7ibe?#C( ziYQ$N1u`#3?`9g)9UZw9I!;{0Y`->LhsY?Y!!`L z&Cl{~5SMyb-hg_Uk8=~#ns*2H>gLFOf>Nvi5}0z(u4ZUTmMxkPXTRfo?z^$JNv@9n zVdme|DV9qzJ!iusCq$2`#+RN1Vq7FP(_>}iNlsxv3r(!{4|seMM{imh^_%|Zv`dQR zmUW(yd0<}h-c6px4#oGCZ%mO(aEH78`kjFi3Z@Rw=bIA*9YhfA%tnFS8=j65rA=bL z`$uujM-C|<$rBiv5K^yrVjr+bbb$SPr7$TdWeXm=uvnxJQ}<1-y3A?!obTnnxrcHB?zGdY5E%GDk6exTttHfwRwo|57-<) zEfN%v&jfwOA-3{t*5+C~TTNNf!SnftTMOD2tuPNS5rTS!jC#UBDi9|>R>6!LAi33r z4UDUKty;C<-5c3C5~VKv_38=qU5#x+Pgf_F2!mVvpv9I)qQ{@ceyXc)8$8D2PeSbT zc`T0-$7d_Dd4!iokH7sDN7;<-u~WVEFDWPJNn7v(?FsSGl*wkOgxg|cV&kRpfEHMI z!Vzq7*|Uw2SF@>@=qO0rnt+7Jd1N&}8k7upH$fm2)!#s}#eyiD~=%S8s$;&6evs*`1ISCe_QQ+!`C3p2FAJB4{XQM87Uv}~b~K^`#*9_w(E+^*OY?jQ zwd%f4PCIFm;xl@r?L>gh(BRrjV8aSxc#@7;$XsK(wj!UpZ0oe?f&#t?CT7;tUd6!! 
zk&wCT?CXCBp5A4`j;R)mkBFL%on70Vr)JH9+8`t2M%}xR4@{_#p~7#EtMZUa_1>lq zfXRF1YN_>T<+O;tCggBpcUdbzxZ+kK70d~M>E<&5*MF>{{SGGoC%C=;dzZ^ohB@~s z-l9zc-5%ROuNN$@SKvO@u(!#DH#gAF znLbWPDDh7$k!Z43%u^lDbQ_-GoyQev;a|=B`6BrNDe>3G?q&e=zA)~R%jxYd+srwc zJI|8BO3b8o1CRpcCZ?>awg#JKQtNS7(*KoYRCfMvk&G?1Wnga^1M0d?vzY48@;H;G zbX660cYkU`FTVEy3t~`fTQ#OS<#&*aX15_4o#{Q1nBkQ2`BG6i@T}H z7GGPTC!?PO$M7-JNDvgz64sE;Ge(?}r`O`^w8=1=o#J?gKDpzslYnhP#c$*HpRHD! zqa@tIn+0ZUi_;)AiPoN-0xMW}YmdAIv|h<# z<@oi_d_oxK$hxYQ_^L^Dm45*ot&ai6b1M}1t&^_%#T;6L&NxK=L-myBrGi86cApv= z;*0|A4V5R5y@bKb;msO(GWugxa=ZT1a0PN&nBy{~9=a-1Bl_5_?nalFuS8&5uItXU zCBx|Gmgd6cKcVMZ6WmYL+33Xnf;CJ4I+WAdvyAUNA;Z}0otC6Gv7{s7A<(A#L>eR0 z!OILlMi+WF7bAMtmCXb2^!?i2 z?-RFIC+>}+11m=S==gNFi^~XAk|H#mt8{4IUH|?@0?(iVr@l>fV0QIhk2?BZa36_2asB*>spkl`u({W$Q*Mdi zbdD&E#rmeI)>Smf&L4M31HDnlMG>8t&3c;Eak64q3QxMsP}`neL-H^W+h^xf(&B3s zR!{JmKqcj*V+24Bdlp zG-J)Na6THiu>r$U!8W0nIID_p4VtX|)}NCnS$FH1HQdwE9iNT9 z_!(kUHwyG-T8PcE^6MVDo_f40?$ZW`>arLd@oy|r>aW0C0+A-rM&iCh^3#CID?8!y zj47l86vGcnc(dHc$P5zY31iW6A}#DD&`*wRvXjEdv;bvAPkJd-iM0TKrJtr_tl?QC zactWo#l0pm!J%2Zd~Tzk`S3@nv8mPbl#N);>$l@xlsy%7O>#4|@qk{-V_Erl7Z9hg z@|9Fo_Myu&e3nvc2dm5ZFtv}W2L*sGq3Ea_p_BBGpLSA(1${qD!JlO)OF3P-K( zu5`{8z%@L^_XFN%jiktGPn7^1W>78{1+1>@J0iV`spv=L*74>nqThA79!#?{l80vm zadAA%y3nY}aeHlU2YtvLH(i$8kyDqj_f1^^Sn?47OM)zR3+8PPF)lgg+A-CpWmHXL z{;o~;)qnOll)Ke&I-i(I?Fam95K7Y-<|zes-jnrP@)02W$n``=5>cg+O#h5h@?9xG zAS39FZQeJ{|dEE6Kc-fP_w9Avy*;iieZwU4nNV0EM!6EAo`+_;0yFRw#fxmU~D+EwswDQ47Z9e~gs?3#K?@u&R-MvDV%$ zM(({b`@V1`Z~L6|NJ0rO>G@#l@uXLNO?(YBedvOo0kVpFCDkuv`)ka6!j`lXhe^Hj*>^IHu-K1dOzK~*}D=5(b5-XeoPN?|k|Kk+Thw_63 z{YlK`y^li~&H~^ZZ?D{4&y<6vTjEU%O*P@oJzU3k)v$1-_v$1g(7E`UbU`y?$vYr- zx4CPcjnCBCS5ah`G2s-_F7Xp5zDjuO)+!S%wzgST|997}gGH+{6AUPhaj}NEM6-e( zS)HD82zOm+OvB9-I3*+q|-3bI$1dYr#qnepC>KX+2V3kix#?M3whKvXA8FXSTb}9 z{LgiTr8n=qufO0&YjqZ=G5UHhU>Mh$!q_FC)PBz6fD7bwBhcWg?m{NIY8EnRY*VZ% zG|aCLU0g&S8=cj9VLywbp(EFl*T&-1>IN_Er~RrRGlm)Z#bTsQngc&0>Jj~$be^mm zLQxrL{nh=;>-9U7{HwK-`q$```VGDyLOb%XF~6n1ivWJdR&0R}-8h_6U;G}G2wU^K 
zNr$-E<>={112sKw#A;FcF8j!EiiG1+`7H zLZK~{6;ldMBToPz3VqYd4<^|I*xv{GkM>sLQ_=;E8zTV#Q4>o6+@)22{{#5^R~_}H zn@jBn9d;DJWS&MgZ4`}@bL84J>GGb8_?!SWMvyKD_7oSuk7{U16$}3tdv5{`_5S~V zt4=y6$tedRoU}M)E4vsf5s9f}-;#AOWZ$WT5VB_*DMrPRZN{#$@5?ZQF_fJ#WE;yE z?)RwE>3o;__y7OD_wRo%*L^P6b{v>Z8n?y5hPYJXAv&C`=`KzKY1yxb zb)f%@pG#^upU#)mcn`wblyP=i6ZY4Q6e`^q#byX<;l3;kQ;)#I@4H0cqG1x&s~6^d zc4?S2r}qzNcYJDv9bK2g8FyckE8)S#CO(4?OurG6tLxi>2JR{CRZlHtX9U#U0ebew zM`zfAR9qgUYW6YqIkaHdp(t;oI!n;!dICuPvyyp?3RPqhKNC|9g(!K9RM2*AF(!`` zhG5|4O5#$=o2Dd3jc6(GsvXsl=<(qXAtQ&{_>)!~S84On8YEsY76^5W8VKM&3x3il z(Xw0j4UwXQjrGAuN?XeMi%mOeqKCN=M-P5|PVvMs-W6WuWWcNFfh?T~vN?rkd-gXN zATqP22b(6_=USsRd<={`d5;|#47I4?cY3I#UtK8hcrsgnkbd|bQ~6@-8Z$gz;g%O_KE4yZge8m|Ixla_n3<&z=v zwhQt@iY3#(x;r{iLKenynAB;eU`Z@6O>2&2Vc22=A0EPCSfU<>r?*RDvE)76=yQek zEaJe9%uLNi<-sn_226h(6ZPcENmQ;~1htPZLwcsk3gw=&UwpfEX)PjoC`v`mEFwG1 z;PXlY%e>oLRijULdbWJSU_D-@D14rf>kwZx#*;a5ikiG*jXE1(c$A@o0enJ2NV&Mt zj#*T2^oo%};o3%}1-NS1zGjZxrl#e!dXPwV$A%zw!35e>*H&tPTUhZMI(Y(&JI#dW zJddp*KNFiK;|!gC5;?HJp?&%Lmhyi)>?kL((~-y}nz~hHo{^G8h@U9w%5QeA_Mvt- zDeCF#Yi^Ab%kLNVl9zNRY-I7~RZ0T{c1>4Bs*{1xI5xN0D9c6sG1q8n^5QDfa#sRh zWI5M5Z%Y?FAjk(oe)_f*?@PFs;Ln>WQcp{M=@E-#sQ@Ohb3H0xN9Fb-t8RRHK={$oZ{7egcjrgkAvDmetnlRr_!4rD_qozM9Lv)W zfTutw*00=~gOx!Q)B#tyv@`I+IFW!rx zunuY1{i@p z1YZoiz;w@pC4eRPN2ZOvdM&7dPl_Ng<8(l}dFdssHC<+b19p-<)CXY53+MgxueWlm zRS*-R@iCUETAeW(74P1>m}$yez*~r=tJ<7=>BRAotsq(IljG|0S3w;&^YHQb!gZ zwRANvthSAXP{hqJUJp00lluxXO^r6@@dua4OJz0-cPgp^XX(F-B8Zm*$a|S-_fBuC z4W@b8M&UOFM6tI3XbRiiQ{^Eq95SAyWuIt8CB(lnBWUO^oqA?-!QBb_PL}*}RB_90 zw!HA{3y(lmE{vmrfMI=V0Sx6Bk$g3UYSi5TwS$l}oSiS9!9)1`+ zMS6H+iw++&hS(c&&b>AHZAM|Sfxjvmi8(1PPT2k-QJ0_g($VJBj%Y0TR>ffQENT5Rfir6HQ5|VnzRX}t+?(R#@xkC^BbR2#p{LW6Wb#~vUH$NC zI=9>Y`v^MF`eC8Kuza4HQKzxDHI(E~a`xHbNWJIl)mACk!%y`@LV~JN@fmh$Wf5rPf+UY(+Lp1aNHda-9^Eqlokc{R~TjIcxh^cd__Kb2j_fp zUFxI5&VrHMlnl&wEAET_G7ZHXRyVzZQPj2{$kjJDjq!J@i`e0f3$bRKOnb zY?bK_`WBWlK+4|3CEn{BbNJ;q=v3tA(&H4k^5Bzf2PlFSaIqF^LJe#&IgcKJ4YNVf 
z*X>=zas2}ykYr!XyV7jRBmg<5%mLZZ_SAT$oD=q;U(C&|sL002XV>eu3pqqUsinE_ zoW0y~bJUES((2{+LA_DO!|`k5_3q4&hldu1Eo6?j+=L#Y>|EpYO{VCa(-;G}2PORP zVR9un_Hut4JA&2P$`yh-{kCnl!w}{{V_gP;1&U@E*t_@4?r{kZl0Zry7&&9TPjs#4 zV)I??XsOXRgxD#7gfdoeT(!Fd8;a6li`hRzyl&*EpCBWuv!A7c+}9NC_Ukx3Oc(58 z7xgj+HK>l>V5_$Zs=o#ykfe6n=wf zVK`FZG53i@ZNmN#g+ME)+{{hznp+kNbTX%XW?qB3B<+7yaqP>s4v?@xfk% zJVBI_4gVOttuv4Gr}{Z*J;li#_-cqss!$X|WZ@AnZJ+^d*%{MlV}62pwr_GV_hzr^zm+wURft7can@xOM?=!@3^HuGF5SaUAaEm4YF@b*=iJKV&X-5~ z<<&~HpSctKt^d$)=(5u8UUtyqY-_Y>i5YkyT5X|-tiJ>IcIt{G0-0dX zv8ss=MG~hP8+yvR%nRN$p|iq{hnMcviln9hqv}mf23QVEKc640laNuD`WgHSM}tpITrVwpTIh(=+?C| z)XZYaO!(|^MQ$p6mLN?9V#Ti~_kY81?fMBcRgZ@nMVn=;`$W=d8EYozE1pyPG_&$i zZ;~_uHfz@=fIbue!KL=}(4ulZ$MW28@Eml1+g>PPHDI|Jn1)|?(~%aYKm7>j|E*Bq zA4ORI4DA9AgL0r=Y3)EgneD!QQ_bsxOBcUD;!XgGgJ?%P$k2VqL;u0w^>%7(?4$u$ z$_>l(VB~6revM#@y^EIAz^J3QtjVJ#H>k>U38dX%ktmOMgF}IhTrj={@oYHtt9$3zli7`)s<92S=4biGD$sVO&J}_SJ6>2 z4HQWUDLJKLBsm~9Vj6`>4G6e+Q^qup=6|t}s>L>HRNFqS;Sm?{+KP({>5OXj4CQ$3 z^5iRx3$$kHfX2P9OQ&&Zses0f3JPjBzlCkM_K!5KH=V{^_>;!{bbpt|jotklAQk|; zh;G38<&p*9q=|D!3?=wF&bM~{!PcjMTu zcHexYe+nI3&DZj2en1$+C;Wl&(%wjxP_sC-kYjH4ndb0Bof z4*IsQ6=h7Wic#cYEF8_ZACT5n95TI(v6bcN<9)l*-Tq+8;hPY< zw9gt+Kg;&U*8lOmdC4+aNVs7>;u%bdJup)4bLksCu1Xz&P^aTQlV-oQ>yU;Hr3kzh zvw>vW|K^ycZHOS^drBke@7DiQQ|Q>m-r(?WrqC}u#U7MbRa$_E|0MPUm3w*%m_pru zFonV?O@U+mG@ieiLQm50ShJI5!VB7VQWf}* zYO?avNNmV_+BSDn>-YeqYy8@FAN5;N|#K@eGNP%n^f%Zrw6Z}^-qB= z2)62ZsI0$^{~|VQq8{qA640n33QrYv0T1>3uF<+X-8A6R69{d|Y7mTM6NJ=?x78N5 zP3HHir&*nWlo3bPR;tpjg8URj%1qerXf|BC4afa-e)YyH0c8e;gD-bqJMe){Us+r{ z!s%G~#L8TBY0k-;Kj%E9@(+VHqtwcvV3|Bj7|H7;>E{@zxl`q$L2AAIoWH0^O*>;~ zR;znZ5K`1JL$EWSaOK5J$tqtf`i@;ri5v0*_C@HElatd9U7cgJyH4!lgad>$r5}8u zJVkdB_?8yuJT1LWWQFK|_l5exzxhH}OOe1An#=vX4glET5_AupRqWwvJm2p>d;m-; z7y?YcVY!oZe1hSB50<;1&Ud{1J%5ajw4aTj8xYv0+w#gz%WE#~-xh9Sq&MA9v!; zesUUJ-Rbi>CnpH4BKto9=Bm-bTveTSO90GOQ2=1>LRR z`x5}!0Nc^}?yo=ErN<{2wRaYF`lO4ngzV=bm6^hB?UcjfEbw9xg8s&l{w#JGMd?S< zi^B61R!x}#&w+CIPpt`X?!^>Q`*A^|`u6M-JJbvj3_d|KF~K6v$%QM*59{BGU&$gB 
zkC=8SM%|piD;CIr&_#pifG(DI>15hSl4ftu`GXh50t&_J975~~7pQH=WB9(5AaU|2 zW!;YiT|dmwMdgm`8YuRujdmZ4u<>T4rN9DDsdC?(^r%b$Nw^SlQ5v7r5`_zh#}r9e zk>s_Foexok!K8gtQBXf!c-2L*O@-h`v}Pq*XGB;yX1=m1`{VYSTd_km-@p^HR^d$&Ytrwjl$HSEZtUBg(~IN`V2Qy zgnDaar<8unwgKl}wCt&TEZrb_;MiXVQB8gDrd3tQMt#1Mg>rJW?w8V`_Sk(8Q%e32 zLHNL(B;R>>-fD6f%_=;I-dApC?dWecGMKQ4LE-%sHL#OwLjI}xlp^w=c9Py;Pnc`^ zmSceS&xhlo0Y<+Ux=mty(%}U6X@w%wbIE*>W_F4pAKk9|KyQ|HO05Ohjwr}VKplz% z^A~>5h~f@};7$WC;h;84l2VIyx3gIzsjFNxzo~iP_7cA3*LUqFyofCqfIZYbZQs+Z z2ucS9J_6sKh$Q$K74 zjO+RwbQAH*X09X8Hj3JXP4Si3(U|3QjjK_P1tv3)&MYVRoAow_NX2oJXK<46x#F1% zWi!X6h<7E(0_hG@F~<0-yf1L+*X;(MQSN6f7(IfHUJlqcY4FZ<&t3)Mv8n#82TMFL z0?;(Ruw9er5!D_*@@_N8Cz!2 z;#m0^c%~V0Vln6X)-fH;#z4woL5rXrVcXe%T~fzs$K&I5kMP3f&HD0zou94>TGu+r z5v*%7&~3p-H+2w2&TFg__g1W!Z!l!j4UHYT+eZE`P>Vh`c5$cd?kNM9qohn^giQ~A9dT=M6pg4%J`#8l(dg~kDo_HANf;Ysq@_^ zceR$w-Of&~#BAoI7{Y~1SzW^y0HVEN2Z&X!3xiw-O5dX!)Z}X2pE%G$hxWTCK{}eG zxTb(s%l96eq5H4oW^2l+(tO0ZQuT{u%jI8e)Qt-u-NGyIjj1E=wF>u&2ece)Gm15X zx)i-(%+B1^)lx`{Y@Xd)ch#3?&a?VF%HZSmdn%ky$(By%QlnVZrNB&+iTZ`(Q^nhT zmKS@hhL%q2$8_5^LA1sF4_YPQg{vOMV*qWEsaGb0oOfAJaZE=#?~#d&m#2pSDI0tw6V+FwlI7(y{~v~+xAFj-YZAuv54$QP9K{jj$FkD9}!In0k;X0-dOG;@)`2h z+LE*fA-_MiV`*ero;Ke4}}+=4AQ@svpNzCo_9XfEnVBP|>bkmFsyC{+ws z6tIitO8klDDmJTFPN0z+?JD7&qBR4a4oa;`sg-$#6t;^dd zxp|s$r{3gz>_pw(UP|HyrFfjgjk}bbz$$;G#ELV4rMhk3Fzn^M-MA)Br_O}SnV2ktm!LF zFIq8pV+k!SjJUOx5lY=3^I=XN4&8|tBb=X#3GnzH3DY(^AP;#`@$mwiO)ng@mBe^V z*x5e$T46Tlq-vy5?RrwZroQ@+c3aIlr~AaGRrHB)ef3t&Q~Q{qOYcg0Fc+0Ks}F^+ zh|JTAW#+XY_qw?vZqAS28e=vZ>fn!Q=c~(07FiZq;m=)~z>2>Eh8M;%*JH!P3~W@zZgU-eW4~t$ z4x%WBRu5@AqKQ5zsmz@C#@Mvqhi3i|CqpwKHZy|ZY$O{yyCs|axs@YDP*gO|RRAU9 zptU1SH<3cmY?Ly50Rg!OHMIsmM{Qqy8Hwg8AB=o$4Ub%SeVseF;OJm+m@tX4WwNRo@FB4tNeG&FfO~$#LDv~);Udp$7dSKel`h@?O6Y5Ga>^VNl&loDz{sDPU!IY)=BiZf{ z>H@1yx5AoR8hzMX@i4C_pVMIsKK(?a+*9D30q?CL2d<~3;^ zjRp8oaVr7^_gXDZy_P_{grrqdgUlqbk2p-7#3OcFY$7oCokEnG%R?P$%Ia!4t_RWjii6DZ8v$465K&s)9%6JDNigRx;tWOr4>!#P~ZFSJ$U(2Q3t zeJ;E5_*T9;DV@htG%~J-jfF!v0ueFfn&Jj2IHW6ZG;xhrpoe#s^OSV^2a|z2Z7{nR 
zvU6umUfUm3@_0e&mB>uqtm^3UuWhe01;)^0JVZ7J@8H6fNWSDHsY1VBIgAKsT_Sq^ zBjr{CBjLb30vks@GvOPDhWAP37VhE+dFYS3Et*1(C%FYu zTf-+8KwzD724{(EzblZlQIzs*Mn%Ck;X11c3~VatN;L8nUI2OG=WyqiKR(n#rM z*ALi2#(}D099J^km(oF>$w|-2d(`ZD@RX$2k0#1-dz68B@SXM$#c6@T^Vura>r7H& zv5~g@afz-6Zzbi1MZ8REy+OEAWT$2Q_Jby8t%FYqxi*70JS5|>!)EW)8~*pucdRiQ zi7sBZ1C43NG)#+$v68`3+}F<9;XK{>IufXn%X8-%b}g!q#rO+_DMPG3Gk}y7-LD#G zFXurjgzviv?09RXoOaeD!LlbV)+*ZAZaAq2d;Sd1 z2CSlx)Uk3wW-V?N$h-IDe&Z?))x+1oLkb@2pO3LqIHjH|VW+@62mLj7N~?7ioW(&6 z+(rBGMsLrwmGBm!RYi_bJq#-zVC0j}(Ks`y$9o;C4G1H9b_yXNo@z;8=X|xq!3rRN5bob z193CX^uD=Le1uc>sYc4f8}M{IheWyynSK*3Z-w9h^K)Lia8H1gW<+|UJ#N+wtd8m)^!gJ zrNUJt)L*0ZX8Ew@3+5hS?6Ezc2)lSjcNn4q-evKdYYcU9Y@H$>s|!!*9TU()$!5~- zw%FIE#{V#Uz1Toey|ier*#V?B<64#V{VqP&UDF4AB8tOa^%;+QIaL;H2UxXM$smF* zB`W`k7FULGG_Fd_%7jJ_RPUn2BR#1XCGxVlpEV4y+o!bC$zr~`C<~+g1c9?_X+~TP z^igc`lzrMj^*gtxZhjv;ksucHj`j|oo;4SHZqjt?5LDbMnFD3t*`Pl>d`5)HX}e_e zZnPkim$>jglRP86p`--n_Bp4=%n21(=ieF$9W7hgt7j%WIKVybFN$$!SkImW~ubFTTh;vI~z{Y z#fS0l>>W|!yaF=wi&$_pSg?B_){tKW;{bd?8Pi$%*w0a;l+uBfPi_@gKI7fEN- zWQBy@*2LB2N#4J$Pu`0Rc9>XhNh}*8^Ck{XTxH5)>+^jjKk{p5W660IH^}}`2c>Og z&LOoAEEDv$yTFP3-?#6EgItJ=Rh4+eU%c_H;F};Rb{ycU{?|b^L?u30F*;Iyw#@Hy z$HF&&?jh2spFqEo9YPu> z_WHyHLz}Z-_8N}^R>a(R-pgb>RHU=~e*1;wHlfXbU(F4IE@v*RLCRO(6)-K?U-RWyzum|WNPYPq1n)95-Mjj9MH^!BYDn;Om}a{c6j#B6J~Ki0r__ z@csgLghm6sGvGvTFpZ{_0)@FQr#FoXdnLX~8=rtuTB3k5RA5Z8N^JpsLfB*9dqs?W zA~N_cF24Cy1gCxh5v6LxKnf<8R|E*~4PV@sw8FQl)xyd^SJh6nI7bkbDs40*mB_rA zvL&GFV;kubFyzcNy%M zG!&+;>d+Sd>H6B!1p&Zm3x6ZvORkuDccHL;%K!PSYgjN*vpIT z>t-?72ioSGGR46VR={ka5^{`G(jQ)ss=Jthwo}7GcW{! 
zRhVpHwxS|f__`0(Y_r`fBr2Q-o_U(0bkL}MH>b*3`^VGUR+Nb*D?>XP z0&1&z!9spd_v$*a3F0he@BAqqHbO=Gsw~$vwRiM z#+-^0EL=ZGQt!iZGy5S@iz9Ju+r#4pb63Z=zf}1gQ}^w=_h!c>Aol$9z?T9;Mg|#P z@OEIQ%JnVvCz@@kE?dBSx1)CESFN)h?2g$Pj_V&V=JkWI!1IdRJ!DK>gBOK$7|Bsx zQN@LMlJmL&kB2i)rj!h#*wO1Ots~O5&Wc>8_n#NZ)Bvggzh2PX`zgDWRaIEw;gIby zway72pY%>Dh!XLGv`C(wD|>W*QK4-~a5xt4ENQ2aHBh2h56n(Kfrk{wKef>fk{$^j zT$uS~i4})2U)S81@aNlD-&I1j`^Sah^h~+pnZbnEPD3()D9X#ruMaoaACM;uYL zoZXV%+jR2##gN`BR%!vR8D-y0Ua25ru`4n8j}n%QRF(f zSV%V1lrWvUuSq?TYGeGlab`FHA2zNQZYbfGx({5KR%=1KjAFLKg3-&aG!~bsjg9%` zE$9GVM9t3)A#G`&Vzs}WNq^&zpWy;U{ubaEF_M2x542~c}ByfxOeC)#)wOC3RgG;813SL3>nKOer)LsFvtvC`0ZU>$r_)o@cbh9_Y`JQ?EQbTQ3Wsd< zWx3GAxYN+Rq3q@y(#$DXaiMLXGs_c9`RFA51msSUNkiu%pKAQ#W!HO(YX~)3O=Pwh zT&Va(w$=tI_%y#m{~K#;l2X0t3wahgfokkz^6m)4RrX|4 zX_$alS9v;(jOeF$^RMNS7_b7Q9I|A`fPPXQnaV*=OoD;k>(v z4-WwarBll}Y3f9_KDQ}gh**&~btP2SI%gQSYmy%LRfqY!fiu$;wM`m#@lnYv^R&}X zT_uP&qqS!)fZSh63%pe#LaDw;x)5;4+$npb7gY7WW5@?J{{@B|uVI>Iuq8|_eF_)6 z2N-jx1tBk^^m1l3mFB-;=DZ4(QU-gw+kf;xWnMbDxf8RY1UmO&z8_?Tu@UVc=q+x z5JfJ8Ag00{2p5FX2mL@CqZfrYQxhtP4l_Nw)el_9F9*kx? zmV%DG(|FAVT7Y#!;d0FAgmPQ>c3rzFQ$xa=<0x(NVtDdY79Ct(}TM~pal}|J9{lMMI&&y7P&suHW~ua<3du-3;Q5~q`-c;F|9XzKIDq4 zkBJY2vpiVGx3EYGx2j}VG=Rsl*R;eAugbOwGu+qQEkHCve$OLu+BWuV@BFcktUKs$ z=y8}F18#Es1ouOBohv}*?tp>#is!2isoUGpy}HY^62>^XSNE=v4U^uecWYstSHU^Y zet3S~ulf(8=^IQ#%kX(5*d4D%WDEvrJu^f~^r>Y^-K`q8lkuu`_g`3|=H z+ZAL~%hd(+By!flXLCC0IA$NE#_d=5W}Kg zSuH#_vM=9aVL(9Y9y7z)LAeW{5x4oUVV6(ua3kTp#dufaDrl+K3_ zrLTqZz57z4#V5_}_cVL7zgy<1h7)f%LM`U(L*=x4igRNAHuJtQq!YS@HEpr^3nSiN z7$4LGQ8dBfqVgtmt-Zurprn1wy)@k+@whiau^vsnwKUg{(Gc)w>Xu ziER7+re~0#P9$Y~z`x*HRR}W;HB{>}w^pxl!N9RDYF9!BHs?6I1QVZ@bO`N2dN6ni z8Px+d62o@7yiys}bEpKn%L~|PlO+3vZBQJ}vZ&SAuA({?+oE1*k;aoPpD4gbTjv!? 
zbO~t4Xl&PPdG#VzKJ%1jWqiA?hRb$yT;sT!df?MxG!D-2wy_TgC69xmsai_^S;Bdo zM(Xg;>8X#;$z?NGqrtS*;r_B>b#V9P7(n-eF3k2vo^!2T^PH_EZ1y|$9S(v=r7dBl zP3Q)?Bm;&#@$7dmZRX2ciJx^Zvj7nlCSX~-eezcEv18`6!p&+7xOx z+77}~?g=Nd3st4SWUL5J{R%~Q25)zFF4v?7uk=q2Y%e(ztemXM#wO6l7;)NAq~*5s zOZ|6YMQ~b(GwDXp1uTtfth}aV>*6E5{IQ^sd~7trBY_9X8nca_7#ZkasWQpr$K_BO@l{48V_jRTnyD3bmn+9_o z^)d`udZi>@s%QoqGQ6;lLFO2lXI#6)*tr=VK@S_h9DR7c0`md#s0T!i6GHQM-NpBR z`ELWH->===-D%SSX_0}lant`4klyPwcK^oEOUs{2@1%-8vHN^`EG8h7{x5``;j6C# z8yf8?v0=-IDT|vplY=y5G&;B>N{6(;7=(xJP*!&b^~JCrIzsi}pGCw2-k^q${!aks zpCIv{#mN86ga3)?x)h{+-k~i7H0QKl`hj`|1oSuXIbCPhpLpl+|97b6|A#Q#|0D3U zHJ%exQ@`o66J=v$(ZclNZG+td$$eEA>r zFr-Hc1q>aESK|FIba%53jdz<2n6w)i0G@lBOAbrdGwfrq>I2xc_vi1}G@lYm=HFn` z#QuMQO~()YD{NXld>5N;G`rWR1F-4MU2NJsh7Lie1htRpV1uv!BR1Vb$EMSRzDbPl z@PImY5fn3UW*phYL;EZW}e|fD<>1jjoS{c5ecuZcHfmOJeL-< zGEaO)2ur)myDe!c(>6+E=7$a0o?eZ3b8d6lnbx0Z6OF>FxTgcDjM<80!=_Z+XCkdG z$0yC2AYaJAL~N+bsuzBV5`6M9EDD0M~$jCGFaHnxAH z5-)KxyAO?eigdzWol4$++mVG>*n2bJe&rL3IWOR-i^=kiAwe@;AbqRCp!i8C#hU`H zoq{tm++p191iy!ALMra%^wwT5oK_W4SXr z-RkoDNjl=4FIL~C6AEt_ir44xV!S||ZMS^SCZ^?N!jupc$s8sP5!muD&n})@Pnism z-gHE_;Rt=PcQ&aE4{VFX!O@B@H#qu*>iIm-F)EWf&2^ZlioJWJ`3{sFG!#iXOSNaO z$tO0HBvu+)7d=fj=FVt&EY*Gz6iaAc1jQ0zJdH@1RWJp@e`}Ix+A6!aMoVQ)6s8RM zj8|2w)6yeS{c*d-J|R=$mTqEw3U{rWS4U=}n;~y;L&T16Rm%Y>?RH5A|GT}%8Scx1 zlS7b5Ufr@c9*eG|27QAD5|q)~otZ<66bpU$H{J*>v8bYKt`_RGOU z$~l%pCDA8YYD;ER85rzeRu%D))11Lkw?F;27w@@!`B&;s2DYZi#h-DBNYNgir>j?s zWWBATF33a@i_F%jO((wzpWmtWE}X&K8Bdc;2gKEXhCn8S+Nm(6>Av&~k3y zng{t~Xv*L+&({c@Mm|w18mhEqQQa=E6mOX;cha{v40IaiXVpPEYRpf@NVzAL6_;8* zcT4F>AD-^aSG-Y&i1YfU);ndelj#v4>hUm3QyhU9nqTnQ9M!RHbdy z$!dJ_ef%nGuW{ zSC3z~<`L-6yKn$VkOvg{=X-t>8m}V@D`SDsSgKX}Z2Qq~LgNag*GbX-rGjmp^SD)8 zURAu^#%1aAq5BzL1F_Ww!fim6f|$xS;wv~)e0=u|@Fuel-wSY&2mTdPaF$B{kAKlC z`{zylzrO8%{}Uh==E7_yd}Ft;fhkp>Gc*OoEseLxi)2B-AI4VFaEg}7d`xyPWiv4Sop^b z*_Z4f7v_@W>K-7qVGvL;9CL<)3P7_JyNo91qoSpE6J+g@W&K)Ta1=sfv$6(iUQ49M z*2D$ef=i;_gDPAv4-;S1>&c9?b(U)bb!8b<%itQ~%i?xUGh-GWk)|8$IG4T|Fz;q! 
z;G`c^!Fgq8>s*n8mIb@==D8;vMi1PHJ@UP}+3H1>$Q79h@k+J_TNi|qKX>%yZG}H) zH-EDcTVkFbTzcG4+u1)-cF~Zh%gyZpU1t1_`7fC>b?@lyDm|QW(QIxNt~C}{5kH~huot45C_Bx`Pb9? z<$HSU8a3xUJNCldQNNGnWA6!dze7oWy1{Plh&n#P%)iLp86?P?s9-moNg+}O ziSRrAKe@lzU)ve+Ny9h1g7%nIQ^*AEzzr+3B&cjnGI3`d)OA{hHSPGnZY7swVY5zi zS`MW)n2!{XEJ^y!TqLh0E_a?(+gODPMT&%DE|^gIRvvGuPI(|%zZC<{9V#H4Ah@?F zh=+_H*h2h`mtO6-ZjNG4%-ZFpSCy8Me&nSef90i3_piTgvw*eXyxv}%Jw`Fyt#s14 zr)W4k<^MbaO)c*{Q{H)nyLnjkTKAGprY6}YEkLpG2qD|zmz&JR9-|}K^z&uCV#(1< zh9@I;a^MkvO^j{jI=QcPUs1Z3@qxSZWx>?xg?kV4J_;LJw9aXtH}MU#E2i>!-3xT5 z=x@E5zCOAq%MBqCH8v68XjbblHJvCFiUDUndkq`;hmo#T8=ejRrXHAZYZ%6@Yy`LB zBhpS<9BXg!Hw_r3CFIHI7k1!aGhZq%1@K3jj~uoSn51B;mh2(T3cRGSeHgwX8D-cK zQ2`@qkL*VKh_{D07)JY9rE@g1G`=HK&+YJAS>6S_^dQm~@eY#I8TH1Gy4h7}l6i={ zixxNf&G(dUZ@g`T+Z#q-oYm7WYo@3pvdktU4Jl>n`3Y6N-r)%tB~q_^Vgjf#FO0h- z<)nGIITqwclxJ;!5E-BDeN!wM(+RVanZn^@%Ej$Nd#YCq-UTLDT^)0;Q?Y6*3VfIJ z3kII6OhAJQ-njP8IYWZjG{D;F;#4g5Uk7|+!K-XFQs^fvL?W&J3Rjer((eeBFNMF&fD`W=VxCCgRPA*2iDm1Q zebsqh*#j4+6jRRgjvTD&*-eh!UQ8m^`8QSlPQV%7vB8=OR(7HsTEBt5J2pY_#=TidL53A+glLBpns zc=3?lTO%)FgVq=9!*_hs$l}C+*oYZbz;!*!Y8VG&bx-)pC)I-SJR# zFg*m3vReN1fN`i}=aZ=zf>|$OV^HlBpUB1@M)&k5HuY7v?gp4B9oG%g=>VTQ@#;7KBl2Sar$`klzIO6W_e+$};$Z7dq!?M4^BSw@AYw3>Q@oU0^o8?n5GJ5ersMgQ($xC7- zv&$TyH%{)DIt^>n3sVubAIjXwpHQ*d#3ockZ^sQS7KErj#x%#I9z~{G~vq{7ju0QjX0DbT8=WjVVbZs(%y*|m)g5I@18^WaKLGzm6tO% zF8}2OdHSL@x7cZn`GcaXK8>J`0x&veHgNw87NnX@CUnFKR!<>6udLia?eGB1&+gl%LZ>mH*(>f{l zFf7#?vWHFDJzi+eJ*ZT06V zkjAwJ?@zn6U#pvAN1&f>KMPpo=zpCHG{$@co5hU@bM8*KjcoW=jWJkSoPdskh})uZ zvE=nqur~bS_Z^jHLVQJ~`MtlR()@JW0DUd>CGmS?{eNNI_~%~%5RE+kSKP<(U8}%* zub1}pzGnI``@OYt>;8Xt5Y@K~QjEec{{RI>GlVoV_9qQtYUc@D0?gqBiFKzO!b%Io zM(M|64@;e7vP+d)&)k&i-~jz!Z+2_IE)B_KI@J(xufLXljakmQkNrdDZu#o>7v^DS zSOU|3lUd+$xU-0i8(LYCNR2;xH>+3Yvw*85i0 z2OL|BqQ-!x&jg(NwXk<28d&-2I`}fTHqRHbYD$gHpS3PGSOlfgYuekwF~LeRouq}v z)%c*Bq+DP(zL>xzo2oZdi@mh_%n|J5v~=a|ptIfzB0VC2^1Rtt%=h$6y`MGje7c)x^n|d4zM3vZ}(J z?z#Vm#v_TXP}lKW__FWA@;DO?4BdFl?Pz!ngUYy-$f_7wyw8$#3S2S2V%6C?hTY4H 
z8RVl{r{*&v;fnoh-#m=EVR7J>I#o>4xpmm>TGVL+o34FvEFpK$4BKe>?7ZmGY&+T~ zzc1LeWXY30JD1|+bjb&k-Hp4e!Sy`>s{zt!=o`E~^L}sNw1(8ulm8JbZYSruBz{}W zH05rslBn3{$B~Ue{la9Wl%$xSjaMj9u`EK(Xp9a@fqR@+H>-i~;NJUr^n(GV=w+v9 zFj_8E?oW<{DXx^5JHX>oB1>**hVN3?KP!Euu(xezA4I(DclHrA6zL?ey-5DkqTJdU zf#3SA$<@z=hJq(qP%mPj%zqq}PMfzd!&AoBf(Y9~9%rhv)|Sy_3*$ZKyUEEyV`SnM zPs(*OtO~ZY8tFmTdtvz^?N|1VeE!B^b9&f5SFFIlF+Ld!q>Pcergm`|Z2zY*d60BbmE?y(LiK(&(v-~e`Y+qheve-8qn*&NXUf!^& z$Xwj9D5ru`HL^(>$mHp@%){H@Jxm*=-1J*LI)H(T69yU1fUS$8weg)(WZVtUTdU%%B@W8ZEOrlD04I5yb9R4 zoZ0TM4%-*n&-d$y`dKZmD6I`rz<|X*qqSGhuCZXM41cEUXGGtt<|!}AK!13EQVsil z^e_)~xQI=l1|}a~VVOpOqq?FZQ>@dcFmP@HM+&t$hU*DA(9_VK;GXpXtqwiBY3o<9 zOkD@+t#ebRae~6F;rgjjjYUC|*h|akwKs=K<@ufxttmVl)~S6=1id`k+e$XI+5#c6 z&NjPgrgYimuAuzU+fq&rZxI5ph1{Y}*utmVJkAR4Ud!+4@7G3|Q&l?FbrbcsS!;IQ zrfJc){GLpZPwsc@p?1XLXALF78`TLYtJc>I##-ZWi?PQpUiyQdNOx>Y44D`2B+;z| zI%^99O;hclJ8Vln=!1%DT`Tey5YpvNrooja6{SmK+q2%m3Gu-=#qcdJMbX{-oi1U4 zTAgAQ$0y9_A7gyJEUBI)p-^77(^%jXqa zjpv8S1Zy;rs2M^ATBt&RpA&U-@31E$W+Lja+@FW@!m5%0Xro)wYKQ;}Y|Er~D4OH@&i%6l+3v%?d`LDp#Z# zHlDZSh*Vd{sxHsxN~@~8h_XESYBUacd2cDW@B0JmdTLAF?n4`ZKd&^N@$(qFv^(@> z4E4rHc4M4d6h{C$en8{eW@m!RXc5i{kBIP>B0|nWpyIWASUwb#=CgZjAcUGrZC&Ds zT6*6t&r(m%p#!TYDbdH#Q4iAZGB*}X4`ggS6AI1$&BzCL!41z33PqWf2LkQq6ly-d zn7D{T z4OMMXMO!mLCn|_}7F4xVRh?>4l$0V=OqHQROfeKSCB{UkAvC7YibNvGjh=IU_pbH( zt#$AHFKcJ-wfFlb?|Ps8Jm2T@jZ)=86F9)??BTEi-Pmsnz=5M2^ZaL1Z_&{GC9w6a zs3k>fvy{|-io>3z(&B;3<_YKH3Kz0`rHp%KHbHqjW4X!$axtt`rila(F_kb6q?>J4 z2UbbXTAB0u#Igxr7MSo8J`+jjqrE>A>k!4njU*|0+Skf?Pfu3wCr_EGVoh?PzIFPZ z&@@Uf$vF~LWd)hXt%9kGD+NF^C!hty3+*FSYoKqsLxkQMsiUOYi|Ux(aUCD>*h07A z9_$dHH(J8==J+^#zZSOq^LJ?}aEq5ftfLk8V?|)*ilFTKj2gk?Ui@Ugxyd8TB`duV zVR+Qa>A@J!{{;cPTtd@wJ2vWKpswi8(xcV8ZL_yi=aV`Rg8D^)$fq0E=)EzoVyFB9 z`u`IFy5c@;NvK0J6~azjNdmD{JRobA_P%Maz1yXr<~*LCTHAn0eYmA_THanx1*w0$ zJ;>WR&R2l-GGnlz5E!sz-92%mhp4?J=;WKY5wF9Oao4(49&^!~4fWYI75^zY*k)M2 z?!P(FeXhJ+^aRgWUHer)QMl$*(nQ1 z^L2Yj6z|ZlF0k$sAXUCcj5+)}Uakv?`pFO;tWU{dFsMZfxUH8P7Rc4gs&8HuXG|vZ 
zl9k_%?Fh5`N0hObvVGg;k)HJ9ZE+ecJgij?2@3%T@$m)p@%GGxw0}TTZ%>Kr#G|GE z$qZuWi~m&WBpgLHrOXqjuDV#w@Hg|H;LhM~Wd4E3sVdk!%>$VN@|?VP?8WZPMQ`={BxcXNl%OgzRJXv`N28#d1Pt9z4%t%lL)VD#* zIuRD)m6ib{C$m|5=~M3Ty^Z(CR==9r{4gdWY*alSd7fwI7eZS4%Q3lmRNo)A4F0rvrhK{4&`_q)wqt9)6!;!Nb<`I-5HX z5^9WH*1@m(QZo^rI!>R6mA6K@6M}qeblyMg3eWg$bHMY{@GSp@9cu{ru^WioK_V1+ z%>-*(i741OsKz*@ht(|$Xna#<`Opzw)Ce^;z^4Zjg+y}$OlUl#%myLA1uN(tLX4625q0bS=pTBq1MR;AaWUMx?;7?z+TmSS z=Zbva>9ks1&Ua&Hgu5hzS1nzXA3jhMC)_gJ#k=`YF2ynmB?7EdHsMZua)q6f`HKFf z)XXA=pW%&Exr89?7wJqIC3dCv~hC zjeXxQMhK-eA=>CM;na6KO0>LL0}B76rbj!ELI%_)MGWS6mvEv=#{EBNzIa`IAZNtK zTDo5-+bz6}X9B7<_FEE37CzFHRfDYI#Tq0Yp!sZGo*AbJb8+6%)oE1M(2Qkr%n|2} z!x`5&8}9m}9d!FG^Loh5fp?j^^@?oVj9-agi7dBuaOOXaykb#%_!V3AGgY(W&hPtg z1ngethRZ42R7EG6h?(NgwFK0XNr!L75bpAfoxa26!W|%baeRDoQfl8E$DnJ@&3zj1#{eP)Y-1Xz|-WP ze&@WqWkxz*xTt))*QN`F(%N1W>v~mrpEfGW$*gta$<7V zCSKKr9YqQioiKq6S)Wf_+k4Vw2)CPL0r)I5#W*g2HM?r_P&)-5-tRjCD@1KHKg-uk zpG_~~_ZvTLQ$%0K4vS5gXhtd3bq!p`80=YOBs2^ellT|k!wU9XO5_b`+F&ED_dK-z18KULcjom?=QZPC!H#1RH=f(L z__tI=PpUCAG4&2*f|-?U=sjR7)CvWn*yh7txpA=?e8*4E$UI1vgUz?Lc#mfm_-sX0 zjjp5Buv@JN+c+T)ZX_Jueb<%qJ3@6-atzIi%`+VF`l=@7Gw0^!mT45b?T8Tu3hIxG zdYuh5dYdC3aIv(hIkXToL%69J*u(qmVov-gTajH$|Jzr3>K`RdHulc1qT8Prg#Qxd z{~r$2hO2M;S{RH5uZA9Lj86!{Jk07LZ{J3d#|$mf#TPCX^Gu3wr)8K{w6DcdR zxoYJ#9$H!D^$YprX&G}|Yz7+iz}9n}913tR9@8y;p>`D%J*90%bs_@j#P{e8JtR8v zQvZTl=H`|e;kPA>_=f6d?V&FG0V)7eL__HygUXWFiJ2zs5mmjDiMMj1FsRC`(E)$i z6uCdL`dp+~)59Z_C5P6}bC`zHs}@pOr$7>dRl*W9$hI{%JhM&g+n@&WIP*{563*-? 
z&LzwMbih9!DFE^5885kxY(&_?K3a{>=f;fA)~qhq`_uc#$t&+Iv4|Na?h{jq{Ik&@ z%y@sJ;VE-Ml{RALW_EiP7*x&F3h_Sb{#Zr>fU-k^ykL{#0}pG1T2xEynnYw=WJ%ab z(aVC{-3@mrdUpx!I0P#^8l}?}Rb^?<5(lWw$js!qgk5J3H+~ua2Gu|r)*RYA=N!iB za|yjJ{J}WOXOwhj?b}bo>SBW`fc`178OQDYbs>|uF)?S4f;+UHPQYEZx1+F4^`A>) zm$+{lLcJp${Kn5IOmCjNP^oDdWg5aK=X63~vuV(4?v#o0$v*E`>9nx5qjk{^-eJpj zlT8lFWY=)LfPQOLF7u)n@qqP~iuRj+-pa@DL!3G)IAaDcLKM`V`nSRJ^GVIus9U#t zn?^FVR+>4bFFTVIPjE-sACwJVET;uT|J<~$xfMLiA+8vNV)z1nU$@fsY96dgBjGw+ zic3UdhwH$a&^Gl$(jmiob}XOQ+(bltpnDupEvE7Qy2%_a(&E06mt~I!K4ZHpBLE{F zRCfzXi6uT_3`ALX<&KEIkhQP5sYhMqdjq@`sky<B6p=jjYLQG9l)^3EE$w z16c4V)t`tVibmwN44*K^oqA-t*s)ruNP?o4`sTuI@KUOgelaj~CVG(Hel5pqPCtH9ixT04uiKy*x3w7az9}VXB#YJ!5sG8IZvC3hE3yw^{k5#+6` z^wp%a>P1iDRPeN;`&6~6`7)&{S1`P+ck+RHLsDO1XlQEj#y59p%(a<=*Ev51OFjXw zd)Wx083Wc1M5TLD1~}8c8vzXANQ(QBF5}E@RJ{s(xbg8ujzt!JH1?cjPY{5 z?2TffHTrvD>Ix$$f>(($Clukn5Stg1Mlx z?-~Ye$hzK@RF*8G!>lY}{o`+C}T>v{{9Xc+W7t2I9N*EPo zqqMH}|DhxiBp%%B%bLohIYlaS8oqyR)D^%+`_8W=9T{?9`H{V;imt-mm7EY%rrWxFLg*rVkKmYDI(+m-{1;5Qtng%?>g;PT^p68M zm9Fm+#a3^#^S&22L|G;BUw&G@(p-24gMF_Np3u^4G;o=_>DqZO1BLJU*Gw!$bTZ4F z%EN`gk7<5Eplhv1Os6}zcu!Pe{?A1S+3(F63}Enp9+p*FMuJ(SXeM9 zaR0(xx8WgB3vTe0={?2Tyxqu?bA0j>x;l72_l3qtc6R{br4br(1)p%u3Xi}7lM6)` z&)yV75GSor{=fE~zHXhg@DP{L0_c;eA2$jW&b_#f93Ji$1W`?oxSX%H+Wrz#|H|Uc z)rlI$iHZi-dqxUmm#fQea|)3}4OT5ZzA&r#j&6s8snAE-`*>ITuo^)NfEa%>2Cy-N zwM@WWy7Y5Byh@8QU^ry?y{5-YxpXDv=SrbFidRk7Eu}pfGPZo4 zVK;f1*<_{SSaE*yUAsXb=9PlSD|=_YQLNnMXW`6qn-nvv!lYs|7K+taBHBVnWv=sl@m?tGzmV%xBG?99Z&0wqJd zwPMVM0%i&~4W_Oy{=th&eQ!2`ye&pU9pXGpbhbShw0rf$Ns*sTtZX|HIq00T05h<0 zj8)%r?=XYgq+p)=1PK~KtjJhqo&9}eX^Ak^)6V&ky9oCmNG(QCAusJAi7H(h-(QXZ zAKT?9$;n5{JoNXk2p$qy#ynjyCHq8QTwbXN^%>k;^}}9>JQNcI<1lS*)=*!nt%FxG zSwSB|Wn;O=2(4aCmLQ00y4oQkcQ9-<(+Q9chhCTW|5$Gf%P&*4qQmoSdKd=*>Uv|b z31eZUB9^9|OV{6{8_x)mL4oi6PP+Q1e=<|9P>LC0iUUw8BV8oJ7ZIkNL)4?xE3Nu| zOpz7AjPm78?ism?-tj0JL3DLV>sE|zlKaEQoIjFD`o|U*u*{f-Gy{}MU!Y3tGk}Kw 
zD+OT2tW7t_JA-Wrp2wGOu7rC}4vAy?MXQvpi0|e;=i~1%I@9EkBGOXvM&ziGAM-fJ9zcj|(o0m)QPbiCzn#Q6Nb&P%wvlm5E&?+2D;gt?%eI8L$L;Zf_9J*cNlA%G&`RHaDhLDUPT zP(udX^>5|gzrU{-sHv&-G;y8IlHyV`H-CKl(ZmhwiBriFr&B4DT7y66+~0~2PvO() zVa|(cK*XQgfXdS#9I~r5ee-ghv9l^FCzEO6tXd?JrL$CJ~_wtp8iZcRX0~uM)IIQp>N7h2z1S|ET!F sJO1y}z<`=_87+U$=Ko{ Date: Fri, 26 Jul 2024 00:11:34 +0000 Subject: [PATCH 32/83] Corrected init and test Updated rapidsml version and use init_spark --- .../hf/test_HuggingFaceSentenceTransformer.py | 12 ++---------- tools/init_scripts/init-rapidsml-cuda-11.8.sh | 4 ++-- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index cfcff04527..e7c0421082 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -4,7 +4,7 @@ import os, json, subprocess, unittest from synapse.ml.hf import HuggingFaceSentenceEmbedder from synapse.ml.nn import KNN -from pyspark.sql import SparkSession +from synapse.ml.core.init_spark import * class HuggingFaceSentenceTransformerTest(unittest.TestCase): @@ -29,19 +29,11 @@ def __init__(self, *args, **kwargs): ) # construction of test dataframe - # Attempt to use the Spark session if already initialized - try: - # If 'spark' is not defined, this will raise a NameError - spark.sparkContext._jsc.sc() - except NameError: - # If 'spark' is not defined, initialize it - spark = SparkSession.builder.appName("Test App").getOrCreate() - # self.sentenceDataFrame = spark.createDataFrame( # [(1,"Happy"), (2,"Good"), (3,"Delicious"), (4,"Like it"),(5,"OK"), (6,"Disgusting"), (7,"Bad"), (8,"Don't like it"), (9,"Tastless"), (10,"Poor quality" )], # ["id", "data"] # ) - self.sentenceDataFrame = spark.createDataFrame( + self.sentenceDataFrame = 
init_spark().createDataFrame( [(1, "desserts"), (2, "disgusting")], ["id", "data"] ).cache() diff --git a/tools/init_scripts/init-rapidsml-cuda-11.8.sh b/tools/init_scripts/init-rapidsml-cuda-11.8.sh index 1b76562c59..bcb8fdc93e 100644 --- a/tools/init_scripts/init-rapidsml-cuda-11.8.sh +++ b/tools/init_scripts/init-rapidsml-cuda-11.8.sh @@ -16,9 +16,9 @@ # IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10 # also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2) -RAPIDS_VERSION=23.10.0 +RAPIDS_VERSION=24.4.0 SPARK_RAPIDS_VERSION=23.10.0 -SPARK_RAPIDSML_VERSION=24.04 +SPARK_RAPIDSML_VERSION=24.6.0 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar From 959f04efe3fc00cf63730ad2e2fd20d2071cb244 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 26 Jul 2024 00:31:39 +0000 Subject: [PATCH 33/83] Style correction --- .../hf/test_HuggingFaceSentenceTransformer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index e7c0421082..3b46e94875 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -33,9 +33,11 @@ def __init__(self, *args, **kwargs): # [(1,"Happy"), (2,"Good"), (3,"Delicious"), (4,"Like it"),(5,"OK"), (6,"Disgusting"), (7,"Bad"), (8,"Don't like it"), (9,"Tastless"), (10,"Poor quality" )], # ["id", "data"] # ) - self.sentenceDataFrame = init_spark().createDataFrame( - [(1, 
"desserts"), (2, "disgusting")], ["id", "data"] - ).cache() + self.sentenceDataFrame = ( + init_spark() + .createDataFrame([(1, "desserts"), (2, "disgusting")], ["id", "data"]) + .cache() + ) def test_e5_Embedding(self): transformed = self.e5Transformer.transform(self.sentenceDataFrame).cache() From 9fefefeed59414a1977f670836cf2b6b3d5bfbe7 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 26 Jul 2024 04:38:13 +0000 Subject: [PATCH 34/83] Updated notebook image link --- ...kstart - Custom Embeddings and Approximate KNN on GPU.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb index 763bee340d..c41710dd76 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb @@ -468,8 +468,7 @@ "\n", "This is the comparison dureation results on 10 T4 GPU nodes for both approaches:\n", "\n", - "![Sample Image](/files/tables/comparison.png)\n", - "\n", + "\"KNN\n", "\n", "\n" ] From b39218bb03acd38d260cb9d1a76857dff4b6bf5f Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Wed, 31 Jul 2024 17:42:05 +0000 Subject: [PATCH 35/83] Added sentence_transformers for testing --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 5302760212..477edb9c84 100644 --- a/environment.yml +++ b/environment.yml @@ -51,3 +51,4 @@ dependencies: - markdownify - traitlets - opencv-python + - sentence_transformers~=2.2.2 From f058cb238ab571d5b4fb1285ce914367f2a6f66b Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Wed, 31 Jul 2024 21:42:36 +0000 Subject: [PATCH 36/83] trying to fix testing --- .../hf/test_HuggingFaceSentenceTransformer.py | 11 ++++++++++- ...Custom Embeddings and 
Approximate KNN on GPU.ipynb | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index 3b46e94875..c12899a027 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -39,20 +39,29 @@ def __init__(self, *args, **kwargs): .cache() ) + def test_e5_Embedding(self): + self._assert_input(self.sentenceDataFrame) transformed = self.e5Transformer.transform(self.sentenceDataFrame).cache() + self._assert_input(self.transformed) self._assert_embedding_df_size(self.sentenceDataFrame, transformed) self._assert_embedding_embedding_size(transformed, self.e5Size) def test_miniLM_Embedding(self): + self._assert_input(self.sentenceDataFrame) transformed = self.miniLMTransformer.transform(self.sentenceDataFrame).cache() + self._assert_input(self.transformed) self._assert_embedding_df_size(self.sentenceDataFrame, transformed) self._assert_embedding_embedding_size(transformed, self.miniLMSize) + def _assert_input(self, input): + # Use assert to check if the result is a DataFrame + testDf = self.sentenceDataFrame + assert isinstance(testDf, pyspark.sql.DataFrame), "The input is not a DataFrame." 
+ def _assert_embedding_embedding_size(self, transformed, expected_size): # Debugging to check the type collected_data = transformed.collect() - for row in collected_data: embeddings_array = row["embeddings"] size = len(embeddings_array) diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb index c41710dd76..124493e77e 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb @@ -468,7 +468,7 @@ "\n", "This is the comparison dureation results on 10 T4 GPU nodes for both approaches:\n", "\n", - "\"KNN\n", + "![KNN Comparison](https://mmlspark.blob.core.windows.net/graphics/Documentation/knn_comparison.png)\n", "\n", "\n" ] From ab0895ba94b06c35b3a137bd948952647a0cbe1e Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Wed, 31 Jul 2024 21:55:02 +0000 Subject: [PATCH 37/83] style change --- .../synapsemltest/hf/test_HuggingFaceSentenceTransformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index c12899a027..cbd88bc348 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -57,7 +57,9 @@ def test_miniLM_Embedding(self): def _assert_input(self, input): # Use assert to check if the result is a DataFrame testDf = self.sentenceDataFrame - assert isinstance(testDf, pyspark.sql.DataFrame), "The input is not a DataFrame." + assert isinstance( + testDf, pyspark.sql.DataFrame + ), "The input is not a DataFrame." 
def _assert_embedding_embedding_size(self, transformed, expected_size): # Debugging to check the type From 9c096ed561e2c03a8748d4f1c7df9f846c32f9c8 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Wed, 31 Jul 2024 22:17:12 +0000 Subject: [PATCH 38/83] removed style spaces --- .../synapsemltest/hf/test_HuggingFaceSentenceTransformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index cbd88bc348..ce8945c261 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -59,8 +59,8 @@ def _assert_input(self, input): testDf = self.sentenceDataFrame assert isinstance( testDf, pyspark.sql.DataFrame - ), "The input is not a DataFrame." - + ), "The input is not a DataFrame." + def _assert_embedding_embedding_size(self, transformed, expected_size): # Debugging to check the type collected_data = transformed.collect() From 62d10fcda41cff0edc338dacc55811ec5713e639 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Wed, 31 Jul 2024 22:26:16 +0000 Subject: [PATCH 39/83] Style again... 
--- .../synapsemltest/hf/test_HuggingFaceSentenceTransformer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index ce8945c261..54818421c4 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -39,7 +39,6 @@ def __init__(self, *args, **kwargs): .cache() ) - def test_e5_Embedding(self): self._assert_input(self.sentenceDataFrame) transformed = self.e5Transformer.transform(self.sentenceDataFrame).cache() @@ -60,7 +59,7 @@ def _assert_input(self, input): assert isinstance( testDf, pyspark.sql.DataFrame ), "The input is not a DataFrame." - + def _assert_embedding_embedding_size(self, transformed, expected_size): # Debugging to check the type collected_data = transformed.collect() From 538c05cf6fb43ef34da3c3f0b5fc283a286017a4 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Thu, 1 Aug 2024 00:23:03 +0000 Subject: [PATCH 40/83] added pyspark --- .../synapsemltest/hf/test_HuggingFaceSentenceTransformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index 54818421c4..87ba7084d5 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -5,6 +5,7 @@ from synapse.ml.hf import HuggingFaceSentenceEmbedder from synapse.ml.nn import KNN from synapse.ml.core.init_spark import * +from pyspark.sql import DataFrame class HuggingFaceSentenceTransformerTest(unittest.TestCase): From 40062661bdf70f5a6101892a44f519408bcbc83f Mon Sep 17 00:00:00 2001 
From: bvonodiripsa Date: Thu, 1 Aug 2024 01:35:04 +0000 Subject: [PATCH 41/83] remove pyspark --- .../synapsemltest/hf/test_HuggingFaceSentenceTransformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index 87ba7084d5..8c35ac88fb 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -58,7 +58,7 @@ def _assert_input(self, input): # Use assert to check if the result is a DataFrame testDf = self.sentenceDataFrame assert isinstance( - testDf, pyspark.sql.DataFrame + testDf, DataFrame ), "The input is not a DataFrame." def _assert_embedding_embedding_size(self, transformed, expected_size): From 63ef878c3dd21e0dc65a0ca5a76004293906d753 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Thu, 1 Aug 2024 05:10:18 +0000 Subject: [PATCH 42/83] corrected utest --- .../synapsemltest/hf/test_HuggingFaceSentenceTransformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index 8c35ac88fb..2814143127 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -43,14 +43,14 @@ def __init__(self, *args, **kwargs): def test_e5_Embedding(self): self._assert_input(self.sentenceDataFrame) transformed = self.e5Transformer.transform(self.sentenceDataFrame).cache() - self._assert_input(self.transformed) + self._assert_input(transformed) self._assert_embedding_df_size(self.sentenceDataFrame, transformed) 
self._assert_embedding_embedding_size(transformed, self.e5Size) def test_miniLM_Embedding(self): self._assert_input(self.sentenceDataFrame) transformed = self.miniLMTransformer.transform(self.sentenceDataFrame).cache() - self._assert_input(self.transformed) + self._assert_input(transformed) self._assert_embedding_df_size(self.sentenceDataFrame, transformed) self._assert_embedding_embedding_size(transformed, self.miniLMSize) From 72b45954e4df85adba98f35ccf42d3b1c837f9b4 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Thu, 1 Aug 2024 05:24:21 +0000 Subject: [PATCH 43/83] Reverse style change --- .../synapsemltest/hf/test_HuggingFaceSentenceTransformer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index 2814143127..95239ed385 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -57,9 +57,7 @@ def test_miniLM_Embedding(self): def _assert_input(self, input): # Use assert to check if the result is a DataFrame testDf = self.sentenceDataFrame - assert isinstance( - testDf, DataFrame - ), "The input is not a DataFrame." + assert isinstance(testDf, DataFrame), "The input is not a DataFrame." 
def _assert_embedding_embedding_size(self, transformed, expected_size): # Debugging to check the type From 575f0209009d733aa688b864b1d2621bfcd43fbf Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Thu, 1 Aug 2024 06:46:03 +0000 Subject: [PATCH 44/83] change data size --- .../hf/test_HuggingFaceSentenceTransformer.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index 95239ed385..ed148ba065 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -29,16 +29,16 @@ def __init__(self, *args, **kwargs): runtime="cpu", ) - # construction of test dataframe - # self.sentenceDataFrame = spark.createDataFrame( - # [(1,"Happy"), (2,"Good"), (3,"Delicious"), (4,"Like it"),(5,"OK"), (6,"Disgusting"), (7,"Bad"), (8,"Don't like it"), (9,"Tastless"), (10,"Poor quality" )], - # ["id", "data"] - # ) - self.sentenceDataFrame = ( - init_spark() - .createDataFrame([(1, "desserts"), (2, "disgusting")], ["id", "data"]) - .cache() + construction of test dataframe + self.sentenceDataFrame = spark.createDataFrame( + [(1,"Happy"), (2,"Good"), (3,"Delicious"), (4,"Like it"),(5,"OK"), (6,"Disgusting"), (7,"Bad"), (8,"Don't like it"), (9,"Tastless"), (10,"Poor quality" )], + ["id", "data"] ) + # self.sentenceDataFrame = ( + # init_spark() + # .createDataFrame([(1, "desserts"), (2, "disgusting")], ["id", "data"]) + # .cache() + # ) def test_e5_Embedding(self): self._assert_input(self.sentenceDataFrame) From fc66d0d43eb74c73c69f617f75c5a9aad29e1009 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Thu, 1 Aug 2024 06:57:09 +0000 Subject: [PATCH 45/83] comment a line --- .../synapsemltest/hf/test_HuggingFaceSentenceTransformer.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index ed148ba065..b4ce595f5b 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -29,7 +29,7 @@ def __init__(self, *args, **kwargs): runtime="cpu", ) - construction of test dataframe + # construction of test dataframe self.sentenceDataFrame = spark.createDataFrame( [(1,"Happy"), (2,"Good"), (3,"Delicious"), (4,"Like it"),(5,"OK"), (6,"Disgusting"), (7,"Bad"), (8,"Don't like it"), (9,"Tastless"), (10,"Poor quality" )], ["id", "data"] From e8d308d7e2572d3f366c1087e5562b9f1367bf9f Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Thu, 1 Aug 2024 17:01:53 +0000 Subject: [PATCH 46/83] Corrected init_spark() --- .../hf/test_HuggingFaceSentenceTransformer.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index b4ce595f5b..a3b37e0fdb 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -7,6 +7,8 @@ from synapse.ml.core.init_spark import * from pyspark.sql import DataFrame +spark = init_spark() +sc = SQLContext(spark.sparkContext) class HuggingFaceSentenceTransformerTest(unittest.TestCase): def __init__(self, *args, **kwargs): @@ -30,10 +32,21 @@ def __init__(self, *args, **kwargs): ) # construction of test dataframe - self.sentenceDataFrame = spark.createDataFrame( - [(1,"Happy"), (2,"Good"), (3,"Delicious"), (4,"Like it"),(5,"OK"), (6,"Disgusting"), (7,"Bad"), (8,"Don't 
like it"), (9,"Tastless"), (10,"Poor quality" )], - ["id", "data"] - ) + self.sentenceDataFrame = sc.createDataFrame( + [ + (1, "Happy"), + (2, "Good"), + (3, "Delicious"), + (4, "Like it"), + (5, "OK"), + (6, "Disgusting"), + (7, "Bad"), + (8, "Don't like it"), + (9, "Tastless"), + (10, "Poor quality"), + ], + ["id", "data"], + ) # self.sentenceDataFrame = ( # init_spark() # .createDataFrame([(1, "desserts"), (2, "disgusting")], ["id", "data"]) From 07a67d3511307aedfe8c309396f298dfe4aa7d22 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Thu, 1 Aug 2024 18:35:36 +0000 Subject: [PATCH 47/83] Style and added SQLContext --- .../synapsemltest/hf/test_HuggingFaceSentenceTransformer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py index a3b37e0fdb..00208dae03 100644 --- a/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py +++ b/deep-learning/src/test/python/synapsemltest/hf/test_HuggingFaceSentenceTransformer.py @@ -5,11 +5,12 @@ from synapse.ml.hf import HuggingFaceSentenceEmbedder from synapse.ml.nn import KNN from synapse.ml.core.init_spark import * -from pyspark.sql import DataFrame +from pyspark.sql import DataFrame, SQLContext spark = init_spark() sc = SQLContext(spark.sparkContext) + class HuggingFaceSentenceTransformerTest(unittest.TestCase): def __init__(self, *args, **kwargs): super(HuggingFaceSentenceTransformerTest, self).__init__(*args, **kwargs) @@ -46,7 +47,7 @@ def __init__(self, *args, **kwargs): (10, "Poor quality"), ], ["id", "data"], - ) + ) # self.sentenceDataFrame = ( # init_spark() # .createDataFrame([(1, "desserts"), (2, "disgusting")], ["id", "data"]) From 2599ae0e0e2b1d1713c479513d0e3c7b6e03d5c5 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Thu, 1 Aug 2024 20:48:33 +0000 Subject: [PATCH 48/83] Corrected 
result_df and remove old image --- ... Embeddings and Approximate KNN on GPU.ipynb | 6 +++--- tools/images/comparison.png | Bin 57222 -> 0 bytes 2 files changed, 3 insertions(+), 3 deletions(-) delete mode 100644 tools/images/comparison.png diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb index 124493e77e..e979110a30 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb @@ -249,7 +249,7 @@ "ids = [1, 2]\n", "\n", "# Create DataFrame directly from the data and schema\n", - "qDf = spark.createDataFrame(\n", + "query_df = spark.createDataFrame(\n", " list(zip(ids, queries)),\n", " StructType(\n", " [\n", @@ -259,7 +259,7 @@ " ),\n", ")\n", "\n", - "query_embeddings = embedder.transform(qDf).select(\"id\", \"embeddings\").cache()" + "query_embeddings = embedder.transform(query_df).select(\"id\", \"embeddings\").cache()" ] }, { @@ -438,7 +438,7 @@ " )\n", "else:\n", " knn_df = knn_df.withColumnRenamed(\"data\", \"original_data\")\n", - " df_result = (\n", + " result_df = (\n", " knn_df.withColumn(\"match\", F.explode(\"output\"))\n", " .join(df, df[\"id\"] == F.col(\"match.value\"))\n", " .select(\"original_data\", F.col(\"data\"), \"match.distance\")\n", diff --git a/tools/images/comparison.png b/tools/images/comparison.png deleted file mode 100644 index 1396ff5b1460d8ec7cce92072b51e633e0390be0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 57222 zcmeFZXIPV2_bwiFMo?6&fTCb0N>h|3O+iFNnj;`BQ529~C5F(HrlO(}K)Q&4bTt$M zgs2D%p(zMR3sOQ4NN5HKoV}mmyz~CchjU%O^X+iW$jbBVy7t;@-TPiU+&9qE+PHq# zdK3z^QTzPaODGf{8inHN;#~ut2(j^M;MXdTOIqrv-1^7|baZrfc5=%Dp{uJ)=92eGBVSn~f7vT{<;;TQ&4LsxLKQJ#r!Wtcu0B+{ z`bgO-Tt(kqStZijN 
zNH;Y#HS^E5xa?wy@xx#+*Q_5{oinqtva->$zIpSeedue4TQ3|P9i7fza>hM%yI|(# z=H}()bvv;Je?|xYD4lT5fIuMl$Q<)|mg9d?CII|A&ku;nzax7h2>cfm6s)EmA}6NJ z8NorBnVH%5LUT&%^78WDoY#EwFznsu@1zUoNF-80fPXC>n3FZ7CxjEaV~%FK+)%*@K#ZoG&DRs zJTfveIyyQwHpXN!$H&JfCMG5)Cs{1k&^^8aV4C8dm-PNZ@opB{i$11s`n8cTe*#Ci0_DZ}#~$xGdCBP#-FPsrWDF>3l4S2jw`o=g;%_ zkNz^z^XP6^g}NH#SMa(mkIE`qvRU8BPn{}SjJ-_B;Y5jMsebNOR{dIqLODKYk`~RX z=lH4>Up|_CQFZa?vsI|?$Efp6m&msZ3;qtHc8u<6LfG<8(IpnHDRea|;1P>svgB=8 zN}ZU{S=Cvax3H(ef11aSVbY|v3KdW{m2;8N&6#_}>-?i=yL8V2&Ofpm^iy~=a`_|i zqx`&61yxlhb^{ptLNiwJ?~XZ~Z0zNwf{ObQ%TK8hOFAgj_nX~e(K8%+_Y&PWx%sNeD;=`*LEUv%l(6r5DXgg>V|R5Wg` z{y@dTq+&Plo#vxsKgvV9BYWScDID*Np4JEJ$(PWtz1VR{M1OH!bopn`U-LX*IU^s; z4V5h3E$g1_DOc+af&(hj7gb&So~Ipb!LGwAC8bjTLH$sn@z%vMe$lctDp(3hFHj+LcppQQ^RuUW zG|~r!GAK)Qaett|LtKU0tGMv9r^?@QJ%if1${K|_uL>q>{ug@rF_`C}oYklgQ41Yl zJXe~lspC~@KCU!A6bj85!GamcMu+d=xH_UKlk(r>me)FAf5gUEP$t0fqA5h4nSiYfv9{tAjBPY^I@5 z$3hL(q5>3yz*^MqBd+>krnLY9Mc!OP6IV&$sdJvVxi)%LevMhqa?9v(`-_QL)^0vr$fYR8|f1b z=daz8a>FhDdoT>$$VXNBP+kYIJ3uy6o zxF{ARnjYChxfs3obHwg9MnN(gn3~?7<0k0EiCxUvp}g({PeYH^(XO1x6`N zqf}W;sy;P0qI`$m;U>}W#`VWGByZ3lJQJA|AX5_mGPq?U(K_)WKFKe7)_y#VFV?(o z3!^pSWbughynM%1PEqBJd4`_ZmC$Mz-TKWufh#j%XDI3%*CAv_ki5D^ZO6uMAepgrD8Pc-%ASppSRI%#% z?pBvrHIq+$jxquzg@RyAaU~y4g*$C>WP*{-_8FU92@f5Hig|NEgJi7dgKu|xM{c&tTCwmUn!m4!RtPW+3dVai6B&G7z?^0Q<%jW}Q z2Rts(*9Xr$Mz@9%DUW0~AoW6SaB?=f7ioF_xVgJk+7;c}UEn=9!ahUqZt&XFDCZkT zw{BILJeIJ^v|IR(PXcfu9n{?m4-SedoDx6hH#du^CuBXutBzAUI|w9y>w~|#Iu=t( z>C_D4@J#)pIWMZ*D%YGHc~}Q6mj_`u_I$$F1QjiXTME7)PBLD{n8ij!a9S&?NSPn4 zrgrD3S^7^e_P#P#3Lh{wVKAH87hYQ95=)&M^=d4rt&CK<@2}oVd-f=cx8jnTPuv(d z>US|I*Y6WKqE&xH%bG-Juqp0imM^L$m7QSa^rkRG^p76)ZQ*GcY{o|9Ri_H~F#9!c z7bSHn51rB_3Ans{`#IHYBzJPwZSuGXCcX;4JbA<2>nmp{L^#EhDOpT6zdm0~!!OUy zlvmF*c!`w8FLZpW>m~`()k>0`4iyci=Lncb=ln7oF6t8dnxzXZGcU z$t1HM+nKW@hxPVkZxAMT*_p@Ju}2##l(vheau!F@A~VimBie*}EL9lq#;P}JT}bEI zG{xWb^YzHqXnS%=W&d^J7Q%3BS5lJ;Se{e|m#;JQT}=&%mXtwiF>5%_@BP|tQuWf9 zgTj<_Q|iRBou=@mkRIUi+6(sKRej4l#jCc>4eX#(+K4gAE>xVK-=&3%kt4y^c$&02 
zX*pz`nzSm5QRnVodnuo1;`DCR)}Oh!OROf%gygpan^xDUH&f(dt8vpRz4nzlXKU>R zoXPp6iC=_eF=Uh?1H^J$DfK+8_HCNXu#3xB9|-}5bxh3o1u z@pXiQb#-YMbJnZY58vy(+R~8~-sN!tKS@A$Uv|;4pjvC-2NpLouq`L1o`uz@I3?9I z4qka~qf0$ppmSU1gqH!itj=EEgL!OT7K}@$8;q;3xzoXG*m3tp=ZB32MMK^$k18fS zRVN&t$HkQ0oK-RZrLb)Bpp==O7F`iTyoT6-#`%^_8P){`r^t(+$3EAljOVW(P<=A7 zJAs-=cqq;p6s*tP?Uqfx=IV8SwCDz!8b>}_(Y>dx#9grQX=RMy_WJj##5jLl=l5H3 zC*p?>jzr4ErIU1;hjNdzh4NJENBi3%JWrC3Uuf-m=};z{mP#1L_{&fuNb`a(dgdI* zeVN92akblaw2Mq+7vo)1${|Lg=!vh1)t zoj02QN9#BK_wowlGttrZcjhho2Eq+Ts}IYc8!k5Yq!d+7GtWs;-p%gK5%2<`L9D9P zA-Qv!;5A@A@NAm@f~}oo>yQNDv>*&$X-|A&mt^GeCv#K|QKnvwrKil7|D3+xY3*=x zAbGB@R%1S)i$Vz{>E~npsv933%I!W$Xb)m#vwn2+uVId*_ zUpWFU>2KvZ59mj{_ymV#knu`kKbePPOXF2ye%6NN+kC4hD=ic}cZk%vHP|-Vf_o&6 z%cF2O9KjQ|+JwXZvV5#WmSX&fYHLe>FnNaUQ2n*T zb7KvadW_=ojcR@evyM9#!d0q*lw5*dwpzM{d4jTXgYn0D3|r6y^Vze=TZ9~yJ##cm zEobrS{0D4FXIo0XT*KB@msA74+404(y1p*f2aMa}{)BWA&$7MN8wW#=I;E**)iY*x zY+17Bx+X>jwUySpHg}J1DLdhT32-f4vQoYe2b%|#vv4!hSoyN!xh0yd1c!^u%@x{( z5!yc+%C`UPoBrWBHPJS|iZOip*TViVTTHDjtR{XW2U^pnI!#j6}RI@woQB1g-gG_|m<>kFig0Zrc zgR&R%k`e)>YX-#d6VEId6jO{&>!#%$`!_1@go-0$l!gdEM)ieaaU! 
ziD~;T5gt=4(M1cfwVk|8Ytn5C;!Afw*;a6%G)!#uGw$FtMN?G62~6#S1KmQc^p_44 z*|a%E`xg&q@2Yr}0VP?=LwVnZ7R<5R!}%_@3#3-(SY~VK(&nKkFt`HV>S=y@8b$m7 zznv_9e7-^6-Rd*t7M$yPn|2M@yR18InxnGKXV}%0S#Z8}gLWa+uO^h0vo^s(Y3cUY z!(^}I&T)r#KhC}OFAz*D+{^LpsWcexzf?eWsP%amlVu>Blk59*@Hy_f1qd25EzjxE z(p@@@_PI;@9Af<&T5{`ZS242E=Gxl&I!=d!jh~}=KJCE8CZ4}_!e2!MT1AM3I~X1F z5TFky7b?xHu_!oxuX^%hyHaAE>N-C^Vw6Yg?nZlmP82_5G#qodX@91Qezz2{!&g_J zi?<-FNgIvx$S~`a9TB}gs~FtAw|r)bH0Asd1l4dUsZ<*el{~EoC;f0ORlvkI`JGGACEM*i{QrYWZEdEWioHr6LUkM_W znaiVL+hWg;Bevh)FWcA4mrqkmhY4QAu1z}AY1k5-)<1^AS4@Xz_N0*Fo@$>0wd?aj z1UCtL!yOmZhQ|&pO>1C3tGngi?Wg%vly8(XUAtr;^|h2m`dR?TL1`5V4ZHBnxJ7(RxM{HX=Cs&6(Kd$Cv4z_JNP3jO3Fo2zM-1dUtgRg{j$( zk6>Gj=K&m8?x&HJ?W=O7I};Oqa@li^%-*ic)B=2X+|rRRJHTPFJHw5VEwE?1 zb7cI+=Ev2RJ4>@En@!ISTA67 zW@=Za$}a4wxt3bDcNJ5_oIGrFa;bj-#y)D}e`ZPdbe>ep^z_V_=VPSFSI!jOoR70G z+xZiK!ghCAVgtYZbs0gaP0FpR!~2J!cFls^%~pAfaWMw-w&^MJkvjh3Ciwx&-%AF_84{y&cz3gNROeoGE_s^w@J3lMs zD|^FZOXj2ZSWb8)`!LRW2AeeTyY17(ep4M@E!a&e5ccV*IB!pV_-e-s9E73(`y~?e z^9C|i5*9sO5?kd>`Y$E>SeL&1`ZW(IcFE0F|5Jw?vBBo?WLI04uYnIfP3*?dUSgb& zt^nrKT07$sn*7EiD>ijMxaq=_@byWpI!jK?<6K;X`E_q>nY(mGX=Z{z9tfD4SP&CN zbL$Q@nbVY}J_0sn19SdKHg!Gv(a)B4yLtQMKe-C`Efk-4t;9h?|61lZQyL=~P?&7t zf2TR6`fKH|7FLdE?x$<4O=_pcczp0Y_j?y{rst9p@*5W`-wy~_PURe@NMqw!3l9Rt zNNWx^Yi3#^`&iqT(QYB5VlPd+c5DTWZ+fqX$s~GtL~IGuG|o$ICuCdVQrY27e;B{RhovF4t20Ml8Bw7wc{ z%bI^XD&m^zVJ(T;#kw(jyt&!R}}1HD|2+OJ=1Q|nvQ#{c*{_ush_-+rzt|n zBEprKhjnsh_qyIlwMMtD&-9b6;}PmO zIQk;QKJ=vYpJ#JIa`8o7z-jnp*y{Zb{3FIs8hxotHIpGGrKMhAp3_fyQ}O8S&rzf2Q_+3wb*qM9$dLu|tICHwQI&CP$B zHhC>vAG+F83lbn9{Z;KTYur8f-(&pH6i34y`LC1P4aw#wT=0g3wy(GJw-!4IA;ZT*#L9X=u^sRoGT3#&2u;G!85D zLJ1gY;ogJYtu1^7rLWb5{mUJO1dYyw?aK5ooR{cd7^Wtu=Ba2cjN<$-4&`CjU)vU> z1d_e9h|(AXB1~IsJ9?lo}T+FKVJzQ@LwfJtLd2Jhqe&C;m=%qwca3C%}f|)Z%6HxX<$4 z=b0>O&MmLJhruQVU(hGlF^0RxE_S?96|gKBFBo}mWI2NqoH;(REq^{Xn`kpQs1=<% zpHr1rBVFCLM6Vd$(YRx{|B|FAtu1iGy&0RVsQA4NX!O_2(b|QY9_L5HE)FL&M8uTZ z5@JXqRGsCc%e8i?AhK{`BC6M^(@oY?H~AYaH}}PQ)T8!n1DIJy%XqNl#MHxuTGHfP 
zx>mwtjM@=VX1e}e>CC%wvL~n7oYnB=C#&MBW-7#O8JXt2^L0dh=c3uw>94fWt%nuH z)587bPE4AYjN95}ydjko%?-`=YuZk!p2^CqzF-%X#hK%KCEi$_YPOF!Wj@kI?~BQ$ z`cCp0fbHAUnssbKIuYW`8(xid*@u#4-j4{O-zOBO)8KU7N(>ES^e9HD5^f zshC!9;5xLfqDs$z+XLwFimKA(8xzc>4$6GTl&+1;E&x`scW;}j5YrDRl5BUcnm-7F z*+zBNIAiB$Qq2TWDXY(*tcS$3P}@hYol%m;B+E7Qzp+R$$W`f5G2s+9cwIc=`Phi<$$58+_vln9DWmjc3#FDet{0!nF(lxBcs>Uhsr=+uVgCk9OhHCQC~MlJ zq`d}xClUJ7x_#{aku4p8^V`TKp1W!CK83<`d&i7X`Gx7J&k@x;4%=zAGezg)QvA9G zcYiJeaM(xJ_yVc1Z6o$`b^7R!OSg8?l$y2#3dmQ=77eunAUc?_oKk@eZReYd^=oKJ z50}pJPrsqIMOgi#727wBAvM62cNm&(!=$o9*K+Q4Y@@VIQFcD+xy5AXs| zraAQ(a;dVSBO08{V|JrbG0u7WDeWVC2CVBGmm4ouV+6^#|%i> zzy9g=)ix{XgMAqzw>#c0FhQ^&)=7YNLhNYC&!MqivwDlU>XEbF?!I5eZZ682O4dD- zH<6-$t|dpBlwH;dVU@N;>NLonD28gNPx^kI2l9E|mC5Wv)_%=={d4c+!u z>hhGi#ohRXA7x|p5pAIjJ8ks^l_LpRa_`>1Uf&+VVpn%Icsb;$W+-+)Yn>Ql?V~MJ zP@RNngL%@T6&=-m4EwkpQt2%fY(GK2yJ-L|ioP(6)h&%(nkwdre*AZyH}sg|*pU*G z!>f1Eky9K%i`!0}sB-IWh!ZCh7}Zhdx=E=xk63S}s3$3|SyhK%>|5QtngS94b!8Hb&BDzwO7Gu; zq~@qjw_Lc5|HNyGbhWOUSiq)r)>i^?!;ci63Gn+&V3cE($8AgWF&M>rxWZ*0X&ggZ zwd_dL9NohCE>5~;h8^A0HKo12%lBt*Y)V(O1c5_W&9Lg3O{soMo?|cw<}Ob(O(s8m z4NEU+799Rq^G!D|Z21SfS=@D6##a$H%5-On-DJs{{#r$+9&HiyDH!xFDk?GeH5_NE zdXwJ_8Ivzdj_-;NuNP6)y*2l<`7XKb95}0Cf69XCtQn7P5b^st;x>;dEn4t&nLOY4 z2;rHj_a>C|G1xMLCTULX2Hr0xk~dtR&aM7y&Qh_R>is-dYoeI#I6vy}YlL&D zPWZY)8}`fJvK+b)1v4@1Ff`&)p>pfnkCAAw`Sj@f+*>>kF0vLDSNU$Gp-z%q>9|1M z@U`;(A}@HFwmh-9u5?-+q~+=yucP(#EfnTTL)Iv2NAJrDA6>)Q1}i*DR;JXGZ2#QZ zGJTm4t#^iJS_i%6ORZc8%X8K$!Dafmieh{}E zvGy==)lD054Ys5E-9E*1yvX0vOsP=i?u~oFb9?Z7(XNkg$H4}-v(|WL1FR(NEJQG^IlyUAC7iM>1^?5tA;ftsc)^x!5 zBCc+<>&Aul64u8r$fWGhNowa!P7$68gvherO`P@V^Zf3pzB9l^HSKE6J99g0r7`Tq zBYA^$?8935NvVHc1(-GA@iPhRpJ6tfvg> z!{59NjBnm4_DSYy)UjRaWc^Ah6zZG~dhYM0qCx{9FDM;VrUaL^Y z?A6I)p^TpXhEqFHS6>CWOFy(kp&lj}j0e8uLj`=@MpOB9Swv3`7PV0)d)STj>VTB| zRbr5PYuPSvJLdZz>g4ON8@WJ(aU!gvyN_6M`H|(~4_7dvU+Wl~_C&`xj;%&r-NwtP z7lKs|s*}Z$Dh~|C@4+f{`-vq_E2ZER(l*m9kAdFaxCOcMWWEj4NZ5dSb0&y20>|aJ zH@Woab#PNbSR0);^nR>=4Sahoh-HtwMJJa&_zS*{yMWI70H-{#l~!{Cw)QcI^{&h$ 
z!6z1c_%5N2*GBq}Gr&BC{cjW_MuYy#Qb8Yb;Rcwy$e_fC9ZC}XV*V}QZb{rlbdfcj zgg7rF3h9g|xwAf<<@yd5eFTc0zb=Sv1bT?mK^OgOhMl);rq%6)^WCpbJ_3gYGLG{9 zXeJ80-Myk22VTZu(CF#Y)L;6KD*caG-Z%$}UIqR2vW5K@ZO`?I`R}*92_AOWf|fr| z=^Pm8Px=+;f6USzK14sV9J+zx?H9Zh`1pfpwUQ}1{Fy=79WbzQ| ztV*d(w`5R%$bdm7D;uk`&43p6CSoscJhfUa;S{%42vW=Ex?r$p&(0mmrQhA!s(FBM ztlru4U2~tMH|S6L1{~SWKKR)Oes;A-9x&hoKk%Tz7Wv%ty+ZUtaQrKW53hL{WH>?P z>w8Mjud?80#g*j<;6110Wq7{z4lqd*pss-1l_$L>L;u>-`pV%WC?}58qbelomC1l| zCV`+_^okkZyZQ}|Q$=V;vT_Zg7?VmB*jNxKbYfFYd36#p-`Tfe>cvMNJ}}6IHoxl8 zkSj}Ha^yy`@&$D~4XGzzsRrwXfO^}KXyw&VG0${`;9$$aAPzpV^wBOso&H1kI4l+l zibdwZGV#ZOrQbN(`)8x9)#V`fYS`F`mv6(8$g{5QQkI8+G6s>ARg2!``A4oOXmIp1 zXfW(;nCNc3wE?H)ZjQZtlDi{iKd~Xa<|5o@PaPYf9wpCNk7~4m_WqdW%Q&O;V*t(p z^N>~;3Qg{w?d9u&kKmFuMsXhuM$0+Ergo*g>q?GCHq#$6U?ggd`4Pb^w37!~7%ErJ z6&Bi)@~$G;;*sU<5nwhqwpB^3A09b=v_qKoFN5exL+UxK)boV(_NBZ_P4=c25Aam4Qh}$yMR`PbL)2poN*N)hgmc3ys zVkz&Ok{2l>{Z09NsIza{;JRw=jFjZF20LeS!}Ch1CD--cg^fnmmkY&-kDx6Sbm5%g2HtzuzgKk6qva2c2@*styD)XG4qcY3s`}KPXX4fkx62&u*hc&9^whV| zf(j&&30Q3f;vq!53&lOjiy6xC$kad~*~@6^!gIM1V++TlE9S6QJ?2d@KclFKH0Il$ zJEm3^$rx8v)8-4_A!z;!*%P3d=38h{74%ClHD+Yu4)N5IrT~oF+dtu~Vhj#vKh|Jd zHo(X~2=Q8f8c`xnC79Te;_Lo?u+1d<{rB^` zO1@NcXOgHvR=w`yx0Ks%?)G*a=T+?S1K)lvjG2#Inq?mO(wd14jR71EWbJG-B1&gF!Oa zW_Hu^@`&SEk%FrNHtr{`%NjJ0^0Lv9U(VX`uP1uuhNYI+)b9i;Yd;J$NWF?PKU(b; z78iBPaQXj!Y(C0Nt(%$>G%fsVk=zT)gBxboX7 zfkEOaArtUl=;Cs8V%UV&1CAI$e`3I&t|s1UZ21RDE(FY4N`tHrWf6h7Jy(3bi$=0)G*mbx=RHCf8mlOK%?uG!XfKgR8p*JP2aF_mxy)|JaOr zKZ4cXa%!!3kb7~V=a^0s*!Bx1V1tg&*!&Uj)xCx9(^i^_4mUa<16v@+9(B z2Gqk?gToTZomqV{5x}E-PzO2>a!-P$tP2))9b*Z=m4LW))}RFnj2s(*-nP@hdZK# zW)N(p0cPZm2UBeVEA2um?SPexkV-75wGpI3D!`^{5X5EygE~ZvzQuJvS02H!mF|Ya z+=x^^0;<0_fK-=(LDB@Ne(2R(KA^-bL|x_(z3oB9-f>Dw32_c$NHK`QPNyTHF%+v1 zQ4T;x2nrLBB`}9GPe2+HR+0sB7b3^W0MQp2oG=uDsey3{voW_=b^zQK181RwmcItp zY@RE+Ug1Ju6{MgGksE1V=paMag}o;sy$ge&3=zjPvZ7^CVCjZ{lb1;L*#l=S4`DyHJ` zqL9{*6*-NF$cmmv#6z%v0U~Cf^hyR>2u7|T_XIFNA;c6cJU~!S200{uDR(COUciBLQ&xCtrf1q&(?v)+cmBTCD8^tP7S z5G0%dCAG95BJUX>7KX4^@c0#|=#HJV70tGs18tW#n=43 
z0s^f7(Ei5Fw2AQ>spbS*yF)nEYqQ$%G97X9K$sJBQ6E5o84;Ro!==FiFR|3cB`-4? z<${BxfVl#+`lnzVBPsx4+#yV`mzJgo?8=Dz+n2vhaBLT(MF;%H$BHZ+KLQK;d>9E^ zs^6>Mg^E?QXoG8&H5F4Yy}OC#E>k%KhH+TM0JL^jFy{G93}sJTW7aP{XX~~U_fcPZ zX?RV})D*>M!RhT+pSN!(jKq@C03H(o%JdC&7pc;38em5#$JQKSqzej@QY zpa(z)_d4v|_ZKBi5N&hcM_ZRLE?kq>y?heuIP``K!_>j{ZT3J@YdUVbukL>ti%nC* zUe+P^Hv6$f>jdL~U+xA+u+0%*H4`=P?wiTPiS2`wU(c6gFwN0~mK;#7R9+^{*Qh_Gy0lBk{bU#%Uh_c# zm=;*ahHiTOLZRy*gCjwR%GSr{A^ZLU;QB(llPf>G7D0?$)xV@2#fFuot=4#ZN00Wytz9N?d zXloQecz<)xz-9*)KqRA>MTRiSwHd4X4?@ar-C~{GS+cS_uZEB?n*ky+mGyG_GLff* zI);I>a;G+a4+fs+PvBVAF;r8zZuvbIvpfYm=>1a|13Ht+xeUmAF76NnZuLtL@@5+p zfiZ$Y;rc85k~0$G=AfrJ$pue8vOrrm!SMwQArXpth-d~v+%vCL0S*X$*#meBKUamF zs8}SZm#pB4m1_UONAQSpQx|Cg(mFTw5%D5&0QmxtG8oGiFswJ*fz7Ftg@Jd45ibgv zqaO65hq(PAcnLuYCb*Te?jc}^TWR7Djff*4z{C@t?~IZ7msbg9lntT`h}ri+$5#;L zl-^y@@e38$A07|m+%2eGRwHzxJV?KXR&;v>(cpifqZLf|Mh2pj52VB1cWD|PvI`&tBMDk2)Zk6?)x9>|>P;GF!CISB&*7z%CJ zTL#w{+-?FZ;gi$%E`lF0e$xozErSa-1dh7^A%2HIn2T;oky9rEcS{k+4O}F^Jl(&6 z@HRv>QeaV!LBFMr7l9(OzU<5u5Uhfr$2_>GYYWSm-BG2p(MDtVQPhFKl#ZMX#rkGL5a75qp4bB_jrf6qABA z{)362!_NkR3TR>l9`l0AUg2sMy~fr_(pHX z(h34@0jt$A8;qbV06JS*5Rn(GR7+H_JJ_62U~__tF+l~E2wUe0f) z%VBv_bqf{f3csgc8``Dh;5qIt1R*$d(l?fue2ue-`5bAirOJJ@U$4w`kF9gr_vgTR z?*Tssf=y{sD61mtCx+3XRZ)4SY%7{Ln(hXu z2-TF*6N;vqZcAb56_r)WiP%V;;6XNKNo$JB(xAcWfSB5lG;nL-y-=9~A>O)(Rvk9S zzBIF%qJG>vURoaO2>0wmXfAXO_A7-uEDml^6gQ=p8*=P{DZ_3_fWYGSz?)l81}VX%Ik2J zH$5Xke0|X!RCe1?+PUedB!}kVFj75;(oUR z>@0rRYH)a|nq5`I4@|q+6im#d)0^eJe9O(9p)|a40d~0pmMqEwTzXq7?4BgO3hrbE z=C@dZzP59B(iDd$!-Ld8Xgcso+9kE3svzYvf(4;DECX{;yG@zv|4Ognw8`_~Npt-@ z^sq4b1KLRI@VK@ae0dY#6yN|itT@8|so%=P2>{ZTjURkDeq|rAsUX$iNdEjjV(292 z2M_A%W31jkScs`mU@RTP+FJ%_@aOty(lc%X&gVkWL6mC@fK=-EPU!fp)MIw+o;~$S zf=Gz2l$uYFK;g`fU752_EGJOHx<_uxr{fSCIPMTCu;>sd*A$s@6h07<4@ z2v$+TK}5o3nM!Pc*p2`dB`BI9K}>08g>a*MyZ0!Ah(PP|FXH z3PylKLD(Z5E&_9136F>xOHTqdet|5sI^Gmif5C@Te+HGvrG*$=_DCGjPBi3jxO9$m z@=E+Xg$!5)wzCHjC82l-5mQ$%i6_F}{2M{_eFH170-+#&^+bTGb3GIhnCU#SLUIgo 
z@h=1}L*R%Ami{3W!7&JkXIFGvk0^8_)GfG70CX$=JJa}O4IBykFr1t|GP#X#a(Reu zMtzTRkxLm;sTCY}Vb1Xj0>{|9urMvsjg+i_m6j1v6Y6#|BK`?13`slr0dKsOHnozO zod*a*2iuyn<{O#km!u z1z*<&7!`&bE-wuc9JnPLR50^9^>Y<+DS%Xh*vK2A4VHoGipjo#@ax~OX@J`j*l7-7 z8tlLD3J7kB1mP!VBLg7ELx4Vo?3RPM1{)qy5Q-^^%QgX64EQ97j(CBgw8(|H%X&=) zmV+z@K&si28yH+Z`7|%=Qwm{u&`^{_Lk6bdGXBw~>yMKnH})2!EtROph@qo98BW)yPjU znJScsW&G%Ma*Ng5;NMJ-5LjD{dbB9>MjM7ab25)^TT$6Y@))7i|>T-RqeH5GQIAa+>%B6ueOY5G0Ob3DlRjjk>i9V#L>bQ|o%`GAH|`?f4N4 z*J6-tQQG!*MMv=V!N2t4$X1)!2n9qt@ec)*td5rs?+kl;-%818;K8DA7o?o#!sK_Z z3U~_kiL3!y&_(W2g0PgyhRd(&ESzZjbNX}5SGb=$>TGNJdzZ#p_Nb5TDZEAL*@+i* zzv-Lpw7J8BAvJZ0l$ZBX{wIIGJEKm%y@MW+Z}~pXnjesRY^Fqq_>$v4v`!E}(!^1hjJZ;; z0J4yVR5uq_!KB&7EwqZbM`LNsOUDhLZrYyIRr0H3U?oQ;dls!$)S$ z;QhoU71x2#MH_&LtqF3Uv^fxs37VTU#!%YEBlZW9s&ZCXAcp{ubrTy8%lEye7CzWt zJ-tYK1YUtlDeK^?SZiF~!SU)QnE;%Ku6{(P4FEKof$fJpH^lt7>AyU(z@0k8p|T+| zC}2=h{|5ldb$~DtF#L^>?*XvlR~k`4bwkK-szKYJU98 zU4S$i8oyV-^o9-M`#Be`3xabIY!tl9gLfsM_uU8GGeju8=AdXU5=9qaU~ooa#M}Vz zlnCA#MIw+2OaeSYL=N;6b%;oTA~*M>vT`lv&2uFFQQ)PC7sruKJfQd|BEkzqFLofJ zIbdb@;eky985oEg$fbx!$e=C3pn*f=;++yWtadK+fnpTGF5w{8A%Vp2ad?%5%d2$W zhmjr7B>4#Mgo7bm0^vq5Fb8ra5st3(-O9mO1{ml%h7r7sVuVa6OEHIgsY((VhB%xx z7h#w{zhI0sDm_sGn?mwE|AkY2^Elj`nc|9WqYyYJjZiZP9K%%prX0kPNp(Q7sS5#a z|IUKJgq0<-E*&f3|IL0xbI6617v_lCgrPzm5D`%`0THkN)@?L0!L$|Kb|J0Bk%eHQ z{);5i805*maN#P1d0zyPr>meS3`O=i*aJ6%r3+&_peDebHTY&Nd}9JTWEH^B(tva} z3Sb?)M&!+f5)dSS>ynT$8_MTi`j|#=1qQ(tT=8G|uTiL5A%w$I4u|3c1Zz}X;|6YS z!mMmRAYpXCH63m?t_A|1CrIx)0QkW4Q3!AY0Nx;65TfK!Smp~-CKk|?$PEK7M1gIA zD?cm9i4S6{54#X+Kqx+Bu4H%-ss&j+so$`bi+4O$a!0ZVumRv7bl!q6Yrg@E1NU}M zHiR5b+?*JsUJI2F_)-iEt$pNo>gdBf=(zn~kcEKU3_1%e-oXK3hQP|bL8{9lzygxh zg(rymS$e9vU=HoD`oEZ-N(B74!KY;gOU_^tt0Bg?glj5iQoo^@qCf6!{*+8kf2?#NJO%9s3o3gW zouqOxHLfJ01v3H8tWDs^`ZoA?TJJFEZcPxYPuXo^oT8$*{A=WEpZ%~ehFXLzRr4>x zi7(ffI0>37NFlNKEZEiHBpV>belu^aPMZFu{Kh92|@% zl6`RX4nN0lw3X-M{g%LAu*hRuX@D(s^$%c`0I-RYBnU}`(z_WBO@C2TYUWsK0F6L@ zcMfb4n9xX*Q*!Is%TEu?_w`#k)xGfD=IzrJZ}$h1!ZM)Oyd$thC+6nn#3r(f)V7sY 
z(EZ{n)*@Nk^I&-a8zgGmmNFNPV|}iBXRe=VoTwrb&njZ~)-8E)v&$=k;b*DVFUXNC zy~b;NvKDJD9j{o-1o%dEM~u%4kX2qORf6xdX$z#tODA8N}|Vt?|&{%$l~@s-7q9qfg|I;%D5;O;8|w@}H-_=qQ@4pT4r z<*DY2O9By96YUa!p#ZzC1(m$G*C|R&onF0+rbm0v9`a^m{>g9cCE}WFZUy`rVm=bID1wJznyXBMJ=QXP?5827SU>EMRY-5I!R8)qW;VmGX;I86?veVPBOi3AVTi*)L4x&;k7oN)I^8FjFJ0#l*Nq8tA9XHi|t`P z#u7L=__>@z2}#;g_JR{Q2{;4D*RTvN!8vw!Z z$z>*VR-l420u^xZgp-fJFx=ouLYzzSlvn|G5QT&`91Ncl+)G6N;0Z2jL9SKTAhBKv z`pYP!sXRzwNFX!77_IQ>#)xiq_~x8y1GfuTGE=)i?EimX`&kd;Ck-jjSYZ>mIWcKT zgtoqdP#~oTA#hj+>{1=CxRQ3;fj}u82&MKQASW6Q>J$PGihifvHX#d>1Cbc`<_uUP zN#ramg~&`upEx684iq1AOZco@fahKyibGHo66+C%%)P=R41grOD!{#nM20u1zQFv8 zI{E2JMso-pRe`w#fYlp9E-v|;fQ3Y1A@;f7SvM|bL2gNjg*47t6$iH z?H0t61=u?pTx{iXMVPUbqy+fu4*+NeLmi?J@lPo3MMO3fAt$hoL0$oB6l5BM0QNy3 zg^+~*re_RSXc#0%<`V-(4*o?+3WQ5ODwyBGq$}7sn4eeUrsH!h9=8 zF2ekYAJu$nzvKX2g4QLTASQ@u_G7;5h7H;_G+v16T7KSQ zCQ%623L+nn@KDo>H1{S7G>8r;6@kAs6n4D=CtZ1T1s%bwGvHv|-9dFdt~|LvkFKRT zBeG-9Vm{LfeyIbV_8>R|MfBH79R_fu>u7AlxGnhWhvYV&x5t6)Z-(SdK@bb%r+5mO zELlGQaeik~KLC~H93m#&0Dnmh@8<^+Qv(0R$=nKJwdKs`Po_weL_#`dA879w%%Qey zK>$ufVM9aGAOI=g77zP(f=~|ZqETK34*ye6|NdfHyF=OcAZVn}kAS>jLKQ%VNfY|J z=9m|MrnQa&eg25}*ZW|x;eO{l7IWA=t@9Wh) zOM7(bc<5p=z%n#8;A(hK_h7yafHULLecjbdOHJnZ0s7JKBZ5qeCCeiWmz8wX7vLX- z;29QRz|Y!~BH#EQ|1hqoa4{RU)lYk^x07L8Ww}I4SUht=s zYIjjzN4e`~$)`F0PBp>1JB_8RUq3nwXATM zet_G8Hv(D4e=h2fZ(Cz--G2Z4<85Q-$QhrKi#@;V0_f81@9x&}KoCsDctXq9YoYhY z&$cF?tXOrVcX*Wt04PimE(!dRV3i|!FO?MoD>|$OmzpTHxG6PU)3sjMSS8qT%?n!? 
z`#mF+T$-$Ir4{x2{V(?3Jf5nyeIM3vCq?d3AycK;GKS1U=1PWAAv2lE5Hf6Iirj@% zHieK(Mdo={LbfSno`+55Y#BD2{kyKUcila|=ktC4`o8a9@AJpgy0^8~HDBj(9p`bL z%M~MusB9oQ0X$@41zy{EVYO-CbD3A?Ozg&O*e6i6d~BrFTWwQt)p*_{Dtf)Rs%UM) zsITNJ1U8Uu-2)=??gxU&2$G7L#{O^qi%WP7WjQRrWjRlMv=Tb$fh?y3>T)Fi(B%|p zy)OW44z&~Ta+}Hlt@4IU3LhruIQXw$%4J}psGxB9Usu#zcJ~RW%K`A@%`nJ`%*0E^ zD@BmFWCR$+1=&d z&XZXUYG@tXkxE4%kN}4?I~_tevg}wFp3QF{Z_JvY_-4pDYU4ooR>WMF?*tu45!l%N zB4!XEB?wzX7S$NH1C{}?Or1!x|HLeC`=vOTMk!8&dt?C(z&{9oa0aPlA=0-L3vW7`W6l_1=d8ZbhMGc?^@0Nr&J zk_9_tl@9*boJBCu|0G%VqxciG!C=^HK>0QFUjG9CUsquOSuhP&-v@AjfB*oIZ|_Ae zfdu@+M@53EpC9Rz{Y%ZE1+d}EU+}sJh?-wM#{1x%bVq5#I4imAVWF#Q8?XV^y0e^V@(o2<`THIZu~ z4O~=v5<_Nu%ps-$&WqOE{#FniC6P5W^W(>?_pCxm=O6wiGexv`)?ft)vj%>t@%-4e z<(4Z@pj?8e*T9(&2v7s?F??n2#`|%)Xnb7qe3*V@1)|Y`&d)JeiZUvGuqAhGMSG|@ zcMw7x`FSACc}MmsBY|8e6eYDL>T#r0Z8>c2oJv8ISM(wd_)5r*DdD z$oZ_&Qe|J$5oX!}M`FWw6nJ3id+nCFpKlN%1>qo8XHhuPE_3ncW9xxr2B#Q3`)fShwZ z#uM;SKmcr8aE=aV{^o7pnexW9c8to+2BV_iLKpldUs>#1mo3B)8%iTun0pc@KRpeP0emqky9 z;0cX(8)@a;X!)0a$d2y-)qN}w0L9XdPpsCmy!efR0LUeb^EI*`PR;X(>0YcYd5%;E z`g%PT2@{&Y>D1z|MCx7cqM(_AI3Rc_Hi>j}m>QF5gvubq-&Cyan^(4Z%KoUaJZj8+ zjnHH;kNDM~u4)uS=#N155d2GylK9{L7kzwKeg}Xocm9PeY4Pr(Y zAlYt%5K#+iz?1=&aRkv`B7y1@ClS3Sh37U$id+NcIw}kH!Uv>%#u^kAK{lePL7%3$5Rwraoemsn z;M7khZ3Lld{Xlk~zzDMMU~vQg6(86&zakzrv?SynSYVUzS1In9Y-F~fp~m$MFVfgQ zb;JNY!cY?dq;dAJe*}h6 zfqWMwYWq7`77q2i-^noc^RK=ozrqoL7f6Q=!~Xe+%#unt6~2=tvlSGD1{)cftUw3= za()1s@=qZ#RAPPyQLUhBv?+c26$wZ}Kc^U=Dr6221dwoP@(cb9wSp3AS_B-#A*vRH z-K&IS3{s-Q!saN4|A)xz7KC%+A;Pd(!_M(Xn(a#I+K))ro`YK{jGqa>f3~HBMUe>` z=#%^awg$e0*YM#20^R|Tm-X_L5d^=w<3^dNP8QK%65Xf6cmshI`A?XxqJXGV__IM zlI!6E_;v?!^{6*TYm8>h?gn_K~+nj z8~#&FOrcefrwzz2f~}BNBv71+0SK5Pmj;2`eeLb_v5$jus|y`l`6iO+5-;wug3X^k zeS&~KfMfibOvlK1zBcl>hv{<8a!9V7YtX-WJFn-HJ47V_VIxGpTiU>kdOlGcm&&lXMQyp_ zm6P@r4N7aqeU&dQi}0Q;YyfWbcV19guC1eNNeFjzpK;l!INnma^F~{fV_LKGW@oKx zUTq!R@nE>#9$MVkb)U{g0=p$82&a*q5%{8F(&_}{X2~jIsGqs(tc*_a5WHFZhj5sV z3G1Fa=TNfbWcA2ga9A2Z6L>~L=#82{e6Wm5I^$|ZYf&2G_anwrhyWN;HE`;*fr)pF 
z4`Tvoqly`iAKSo90|^ioi3|`a2_1b9WC3^sKK!$s>W=^7^JIe+m%JHw-Y=(3VjQx68@oI5GB@Qc%TUR-Y zHq4B80+hjbekp@NJQ9J>U+&^AD3&snd*NT&1;jY{y$D*kxIHyzJM@Hc4(Ns`GMN)f zB_1`;%&x6{g(NLqnlV5r?QGy=9;%>PsS}K ziO>C3*8nG1O}QiYf`(#|gpQ$enUv{k~S;4GAwl55vn5V&H!}yc~I-{>QbS z*75=P^H1r0semK~cR0Z!TM?CdN4@*0&TS~987sStIDdGjmtW;-vp)c;Js2$ol)e+U zfN$v(M!)tY#sEYFSAdZ5VUVy#cgx^6fXYlBnhDZgFhGK(n)gyA;yy5fdF&YCBq}IZ~r@b3K1=fzHkGKb! zfsEvC{rDfyh6X5fA**G|2I~L`2_zWvKV3^O>>7qSm4c%B77ze23&sG_GO$k(xWbRb zm?N^?d2+ZEJgD2#;eLfjLJya{E#fvh4ln=1uSAJf*W(0j{)3k3XN1XRpp*y+xUwgQ zm;*ZQ10|bH2)LZzhx@e|e1bSi1A$ayU~LtsAeTF#a^?>`4gYV|3|#skaLOcI(+vbg zQY1AnOU+yG2uR?;F~7Lzce!6@ak)wk%#OR=r6z*Z0fK6u+MSQW@Yx(%Zvr$hobl+ zEY__lAaWS-&BBod0u(KFOq8MBV2k?NhXI*Jad#L>9z>YgGzYOwE)Hhd$=yz=qO z@UK4Mjz*WwSSR|uYC(~I0-!CKdnOSEVYzd@C0B}Lo-%{5Ab5yC2oQfIbXAW;?|Y78 zzAqqe09;rws|4gep&Jyyr*1Tg<1kBYln~ZjpyJC~3kbgcCANXU2?$^@EVT(ZAPgS5 z0f0da5@0_L;cc$XPd6TqU?yLPH+I4GK=J=fg*W*`IBE>HZ1_|b!5H#R{ZY6M z_JYX?kdyp^A*;4PcuC)x!!BR4Ag`m-22v)F-B3VsTpnZT-od-HnmMsJ3!k08_w}<=xUb7m7(Q) zSv1FsUrY=T@-W7hOD}hIfJ8`UeSGfhsq!oWl)xoY0#RUb!mC4f;s`r%C)Q>=oi(}= zx10eZtdR6M?AS#|qv;9!b@cT4rQQkvZrC0E@F(FFh8o=EQeq#Mqw_#OTiZkLSxcZN z2TfrDO#!~5dh}3N1&=Ph+mrc0?lQEh*5uP)bPbe8Wq%)tuDDImXsd|e4tGM32omk> zj;cBnwpVM;@bwYG_l9}r(b1ox4MZEJ%}BT1lD}V{q*gK-{_~l@@H>@ zVu>GENK?jts9}m`?mAahEx_D>P+bANOF(!<;c8qdDM7$np+^Xsv>0UD-ulD2qyt0; zAUyHE1Hi*z#3>~Ld_TN=hWI0>;UyrFftR9!vdNAd0|OcGCs)%AjV;1nI2ZjQWXN>T zABk~(du8g7`Yk{pv5H)_z)Nz(w$ZO6$~;o@L5duiQkln4j%+H46we5kV=nT1Cu}O& zp~Ra`i9ILFYqDm@&Vy`LXy(w!Vg4L^{68X?VDZgY1@} zgg^ZezmyCgeu`AKk>Zx2=xa{HzqBBK$v|9Q6c7U|OGX|1@KO^gIOCUEBORe5kT4D- z1a$tH69+X>pO7 zw6;F`oiehaJOiw^Dca9MIhGji_k(^PE&-{82B@*#&_AFDC%N%)2eC>>1z0EvC9^}~ z)nVen>-PW-M}#NdiDX*m>!&Yu0{axvWMzs51aEqMJ%9WJL!$T=VYWly-U3oW@{6;O znkODBd-d-dV=2C$$RFyQtYxUZl$v#u%~`duDkxEH2BeIlCJzxnUZ!YHkfiX}oxw;O z^gtfFU-j7u=1k;TFwDu9D(L>J?lLAwnt}OHkGZ$UF+m8}gArAxXbbj#-{wLFgQhfz zfBXou9_kGh|4{@?-4iLE0+RP(HgLiTGNN#{5nhn>X4QvOL9W7IjkZP}UjO%D=Ru(4 
zxt|;gJVW%UZc%C|dc-vbW3jT!O<1+K1^*~o2NqNW~bWb;!H#gc>u^xx9swRfMmK1Or9jUp8f1tN*1NaB^NLfZ$NHJa?QHj zf+)Hm8#9>KkUVN2@~aA&MXFDY4hP`Bf@Mxlh^RLtn6!t@|LE@?Am>1g6bH1m8>L-j zi8ai$cmFi8J>}mw+@WZQOqn?W*UOm!@@~=tQnt$t#)g6J!vC%m5-kRP`T6@7G4efI z=C{8dT%S$`nH-Tvk3v-w5OdGlZ~VR$iM|I96Mz3w7ziTHAOMd1;Yd=eA0K`>lO#*- z8H1xRjQ++C(uEwMJ*4ex-19WZobOA62sryt!etKFcpXsnSg6?rVF=UEAHly5!e|iG z0A0&nT>&tGBizJb4u?AkwFXm0_&}(NWJr7iVpiqAX4LoOSJnWykHTM$p)gu;i}SI` z^AlS`pSR%uy7!50yYFH zc<#y54{WwtgA6>M%S{}Pobaw1xSi88K=F@$e?Tq}$g3Fj-DYuw?aL?*TyxpW!sV*0 z?G%DJlx9kuCKn0(XXw?53F1{0nXy9Gvs)GspL=wC)i{+R5r>0u*70|EMOwP1+zmwAKYzf}rBYjEp z%gfz_p^GuKs&FR+!bGHpfO0C&g~=AE#Th^iJ&%)X=o;Ycbjw4`Zw<-0^cqQNy!1x8GC zKo$;&yT+oB~K-Kz=kR zV=qpA!(`i9ZM=1hqn*FzrXqm|LiW^SrmWHXUru=gz6vJL*vO_!Aclp#3n%?-B`t#c zJVCr#O+|3q_>_L3%MGY{g3+cvJrA_E!{K(`N}7&m#OeUQV4i`#TWX&y$b(gny8(+; zLUja%(&K={asuoMsT=gTEybml6CgNo=EIRz{2)c0J?lR8#t?|Gt8c>MrxRo{bAlge z$9QfUMyuu(bo)>>zWm0!j;JF$e42Oh*a*8rlEAmy$tJ zosJ+nI$S)P7I!zYeJX;~|&^?N15n#+X_v7T~K_>1Uth2XFMc|N}NRR>fb^TULYZ^+N0`ZIqG7dqV&Fgfg zbK9*vN9#-j8JCmqGy)GwX>*;a6N=RE?G5MM`5u)~zvm87SH8O{>!a2L$if)T$q$S_ z%@|rd&jPy>Bh{-fm;%#!=LA3W^&8v);+|6IS^mbdh>{1g#DtFu4CVNP@_y*o6^nw;??+~E5dw8+5(aoc7pTHTaxbC0jWXAz z0oLmQo;bM>J`AbnA1Oj>3f>-kglI2rffd-scj5k_aQYT2qC0+-2{IvNFTpaeqryNA zY558}kXf4t$tq&H^kOJ=|Fh2oxP27hgb=VMb*DBQEDlGxqc@LAHjLP@D#0f2;LV`^GJlO9DtspV@ zEGgC@%83D60g|Nujb)QSq7pxOY_)^h(@!aIVuA|6mZzrXI+I1ZQ)*eKfm%~UxhtT6 z$X|tl)K0Pz-42^R>6i#%AnB*p;fwGI0R2+j4L3XFNy`u{p&R$&BACl;Mo4R|$sDts zKc=?tOe;DjIE5m}NlYF?y0vjpXeW+`4gx8BPl$kpm4vLx8bB~rGu)+Nk=ZD=rep!B)ifQ z)sfaz&7;_d3|IqIRMv51Z8fcsSUY=E%z&cGG34%?`8BN_Io?nfM`LA z_-Wy@n@i1R+iQBZOr=i3?QEe7$aX(kupe1kfj7BHL=yL)@HiL7D@PGU8gA!*sBL`X zeopYg&&~3ZZ#Cm&@I)=KMC+SOAZNN($~)%%p4wIt(a%+BL6?OEo5))B#s13{SXJf( z?d_^0mOTmOA6CMQLPoC{`<>j<%r8B@DmL||CxTaV2p9`uo6*1f3RR=S>Al9 zy-@!mq1>l6-B=wZNKPI$YdZHg9`ir@)F_GJfJ0ZhxtJhEZDsW-Xh*JoRrh|JF&|74mH9W8|1YL8 zrjM7WL56TH!H~0V2Yai}HXQKIL)dw^`!JOicf~VlaXT*e)Cw|} zeZTJ7-B&zZB?k6@S2@8<-~$LZgJ4y}PGAfqp}e(rCB!J?6X@3MFDl$&ivzo3+&!%F 
zfJa&7FsL^_knGhODwcqZyE%2B*tYi#2cttJnQbkQ=7vwicnG(L?!z4%Ol(K*;f-0c z@)iMQ^abKqWwZK1v_S?D6|mq{KlZ3prImH5vd8tl(Prax$@M=ZY15@K?s`@+$MXmvV@y+FhrZZ zP4@Q2fz6U;A5TM2C$KdSH}--EPrwA&jBv1A;nsTk)o!?_wj?Lm*nSnPN=`+BR&vap z-5`zuU;(c-m=^=YQ-RMMAl+gH5e-VE84PY(!&<2C^`<=_hg$G`&@;zC&s6$rS6%@p zAye?_BT_9~gmA20)eDGiI9U?`jehcE!ay)sEP<`*g{Lu&xv+0Ohvo!33AweU|K#m? zG@PZ%f%Ov}G~^T#amy*OSUe3Lh=GIJ9DrXT>x%Ax`EEd-?^~mX&-vNi151OH_Ud^M z9_;D2W6PJ22|2)o?O!XOIfR5gi!Q~h%Xg`ndInfMG&z!2%`}w71?FZ?F?hAgH863+ zVLMa<@UEcuP3O#6LGRy|1~4NLdL#hdW^4#&Ey#En>|;V+MHsL&{X8aftAs!01TV$L znDY9Pjya*iT<6SrTP}gla{`tYtUUEY*!^Jt{?t>8Vk-OI*%e}RxQ+#DA=PRs8(uWN z<_Q{j2TorLMKn3sDfmt(U`fUj>U&gf$-A~SjC-m7!(U<&Duz)5?8ww?Z24bsJ01wG z1wHx}oWKFqu^#ExZMNHKS?gFC)5Ei`C2$XNf`0&uUMKJXm?u3bSH1Z+)&|j*(B$*1%C&SpkgKS$Ry+BD%A4FD{)jf8l2%!g;`#Kmx>IeDlG0u+;M%uvVE5bAmNBo_=L2`^OFpm1U>h;Am71 zkmLqCfjy}B5S$#hsRQsKS$8t{9c1U6>XLU`+Rvk$$YOwUTpW!Wc%>*g zXl3z(F}Oa~zS0CNISkk+tXW=zTA<(#@C43;qpC--{afP-sI`;yxOq7lPK*~=_cut! zf|$)K0bp~EK!Wt|hQ<=Bh}_$4IqO=fj+JNEtqWJLpM4Cbpvk~$`15@@V`8$s{2Z{4 zOD~JoaC2;c*fJ*d)DryUe3nD2{kXvYfAsU~_mcrTGqKjb3f2caf&f@!BG?nydhqax zzo@9rf<)Os*#)yH+;0TT)-J$?QvLnyXb_$vD5JVvC=S|8MRhlnoCK1}5Wpq^Orb!P z2^3q&4 zG7twQ^m0-S*aa(6WMiO6KsZ5Koyx`mZZn|1_TWqpCiaCTIDr`=GCKzNlO1#>F4Plm#Q$$rt3K)l6;S+;9w zpQHm)S^3u>D>ju4uNw+{1Y$5h>JBS*7>gu1sm7ZJ;TN32S4X}h{rbU_6R=iq`NDTl z>8XGs?@D6a+b1a}gMDU5UERs8FpjJk(x@|>8_1)kH)J9D$ z%?Ak@%auep&LykBUw>-LdA*``9>aj1q~-EN#mCE6=~&`@tjoc$Ni2ax)x@nBkobIh z8UtSJX8||~7=^kAL{tGnfvO2m693Cz;rf4gD>_f6IX5+rCA+kO&r70gN4gi<$vxVx zdS(dd+Is2q{I%H&!nd}&lvlf3-z`K@CT+5tU5beFc zy1F);nt@H{D-$W(0Wt<#te$PGxGa4-p=su+8-BAhFJI>n_QS%Qn{QP#`mH*u!(W=P zHFKp8IC22lSkLEouOWBSHuakTMv!gDzrAtH*HwBR(-)Y#d1Seqe?eE;+BWMeNMqH^ zE*8-4i>{Ko=oRReK-@O!1Cj0cSA+b%=Wnf?u{{E5NY}oA_nWw_8T&ro{CzlT{neB9 zpS~sHeXo(BKYw0VSeCvz*${84Q>*>G?Da z)POhhS}ke10Ugp_whUaFZn=D;e(+|H(Q-NOmR)QfQ9;O$FCBSXjLePP!2tX} zxGEa?ma2wX`Mq4j%Nmu0K;h+yHvISAIhToWxi~vRC6tI3JAawr@@D6izE&E`8>?J? 
zv`vc!`tz>yicy+0xTQ13$MQ@Ye+-1pl-<7xJCK`UNUf?P5mkYWbVNJ!j&~lfnwAtNpoK+_I zz%VE{`IQt0y)G~Dtdinc5SsQqhgO|9US(1d=i_fX&ne|2? z?a?HOzTwuz7vSkJcIO)dS*uoB<(`G{B<|M6vd*8rD`!bTS=J_|H#fh4bfYrHop%i4 zta>x1*Kh?1#6r&{6Svg8VPmwW7hO|G-JR;i_>IhpGh>8|d_51HjEYO7>A~r+f1*I0 zWqB5*T9;#7qK|h!!k3ks?kftO&{zvlJksZguTN;t=zIQ|ab@$ml=GHUM(2XE2uGAt z`B3weT`vRbhx#$=`KOz?UxHQfoJ}CQpqzf6TeOLGJ1V~K~d_{y4YZrgGQOh zBxs~y$Q9wd#_^osDlZt_j{kB`fauipB&PAHHGBSFPgGDIG$vhg0;!?oIB^+MLSR zdmkkHe)gfA?)hFLzV;|yD}UX9dRjX-AJ0+FbAaszX`x-s>f6vn7*j3%Ht|v1dYb= zVbEo*bGxCen=ms`k~t=a8O}1Elz)czG9?6DSPJe-+zJP)2G;S(@b_Sz$3*Uq`GylC zvx<^_Bc(6pC)mo%btdMvhm5zS_0Uyk=y&5ry-B$3wzBTpaRblH^!324hK~e&rz$MT z61>qjsF{5so;5AD~>2%_f`G#kP zmsSVEd~JX3^(qlYYQMCHoq}8>`?xF8oy#NKgumYH+nQ0cD#t(LS(H?1)&%PoSZ%M* zgEKp3PHim*ds$77j^B$%E$B`QL^)ok4qf)r^t{U}e8%#!?~b&xj?S=Ht?Iw!8gP9PSXSt-9OA6%vtBTzExDK zmG|?r?smG~iqagYxz1i(&KYAl<4L9Mc}dysjR`N`$(X8ZuP+PObBQF#tW1oRG!fOb zHZDG|lfj1)Q|ncBAK_L-`5iX|<=$B8ztVfQR_SA{OJ@hzkuS?st^3%Tb74~Po7Yzi zt$gcCn9^GpA6#djA~n}mYb`x)R4Y&Ch+4w(77{bOCMEHAKmgopbzV?X#IQ!1`*S{H z{#RhW^j(whW7k|%s@+N}&qG=`7#vhZJRlY|Ukduyq)ITu(B}gmuBz=i^`l_5s>9U; zqipOrF{roJ=iRzShD?(83TGZq0A3GAPd_c>Fq9c?mUahxzn;TT?{BQ2z=p(dvB$Fp zc{PI8uqH<^mg5SJ{vS}WHc#|(w%YJ#;YB%>pd8Xjdri#)?G_G?w6K?sh685HPkA^z zo|u%jTU;SLZX|7k4$Q~LvrZQ!5(PxjgeB1bGVbRHS9HKRwl?82o?|k8Y-~zg!gidKT?u!enfPdmm-SnTk@xfX!rm$)$F~>P&>BhHWfI^#^cnZ@ z)ikZD0DW{YbsSiRZP+`f*cQ{})#8R@j|Ms~dEoC(E}1Ey7!`CJCfu`wwObYyaYlD1 zO`B?c)?kxT!19Z$U^)Z7i5=YALH%-3_oZ)2#FNeIgk`}h1|#BKtlth^zaT+;xglyI zLPzep+X|=1O-AwCo$ndLth)p77q?p%G<iKX*Bkgs}?9r2mecpPn$C6 z?35kezZd6k&VTNm#htHLUL}bQ+Ys?0iJ{?dPlyNL2fgLwpZciZ5M2G%p0TW5lyA8x zE0NS$@oXWPR;f0mfj@Uii#3t1;dK{{<=T7y$9^u@j=!<$y%Br#jZwhiXk5o$Q`RYs zKNs^c%}OM8F{@3T`Oy<*9YclE=hP$gdo4u@+9puV)Z&5oGd&T(mUomN8ciQsxV`Fq zN-rXhcQ}6K>Z1g6cPXi}gghH1^8M@C9<*Z>xL%p&1d%d@?IDaf}<55Q-(* ze0VABEqEM5{p6dSGoLfnl8D;-X7OXB3{p?Of@j+sPK@J$(*`n?jtzo*7jo9m7Wl-x z`Ak3Q#Z#kww)U7<^P@2lXM0ZNWylfM1@s=CG zdY$Zi-etf#X+1$tgG=_&Iv*`1I~I8kNQ^MQFTae2}}HdQ%!yzqeh3&gxvJ8&SEa$u>hf 
zQXC(LJN_>ICdzB-;1317@oICiO9$`J&I;#?m){z>ffl@Fk)g4|&7CRtB|g)REM0Z? z6NmT5@Llpyu((rwOP6u=)XC5v>Re_8H^nWP&Xj&^$V{>xcm5m{ko&EmmNnGv(_ofQ za>lL89%Z@pMbcBttyUV_5f+=Kw?9f$t_)mcu4MchTl2XKcRj#0`F(SDsC;he-6yAH z*7*YQ*aKf03}5=FCpbO8U(tO&#FzO_=aIkpEb+{z4SD@fyA<_5)eLUkO*(|NQp)9} zn`l3VbB+`Ao_Xu>xLhehnD(&O`^=LAUxNa3ld^P5`+bQ~sY0Vf2OgIt)Q=G1-l#C^+KNqIOEF{jXtCloX z53l97)eibNY9?!o9bu5Gj}OQdOBu>^8+-f2xmNAvNl=MFwa&nNA8nL+=9g&2)@%DL zVv!hXTD(OfwGSMt)0nP( z|MFWAD)f#QUAo@M+k*80&2o~UKBW(*S&AHz4r2l?j~-0$uIhWzuCnu#4O8m<@eRT$ zOmbb;oNLsl%zMczCUaPxn)AKyOF8p)5BmU`a4@xhi+Gm@(|J(kuh#*&?dx;KHO?Oo zcq!%%WnRS!Y2P|D8uikwy-WH{ePFJ1x1YC~&rRW7v;TauI)uHXB$m4nV>>jdF(Qegqxcxt7}M$)EdoMiG!)|n)X z*{dGqizh_p5)U`Ll;Of?kUUch3|@USHck%@$hFJZ9Qn9g<5>R61)drqpokrE4y`Bt zn5I#b6pTmxrIB-ko5r%&$lLm)xw>3ef9FOZv(VnT)yURpQTF9!39W-zTS7qT%UfQ3 zPnW{ABQsp>zgVSpg$;HKHr@9JKCM&EG){3X2|lFAwOxvm6((<%GW~E0)~D+^6&MN~ zlf!&Oiu{+*Qw6&pzm9X9PNl6;=8T>3_y6#P{>uwFO_s-^IVTzL9gVz?aNDxNzprr7 ze6o;<*HzfRdD`jj`GvnmzS~qO(#z zYGvh|Vs9PQy&%1-5I312%hIua&LA&6;|FnUvZpKxJbU@xV)_s&Ln!h+DAqtno?5nh za4qjqTVzvad)jf9ZfXUWL77w9_wa5SUNYO=U{!3G|JgyU4Cek zCPhQ|X`!WVSwrRqPtEbOD+iTTs!N$;K6`-&X~otCvm`^@h~Dv!huT)})v%}ZmaepA z&fen^J}WTG_&RjNEgS~|9d%xP zk1(?s#dz0-E)+tzrA6YQGQn|Z?LF~<8N%& zE)O?z-om_qWoltkR9y^ZhmgG2OWgjvRg2!*qkOni(Sd!xeP7p6&8=)#9{V*F2A+e?H%4HI)(3 z$UiKhRnL%R-Jx2yRas+`Zy;0aR;7@S*V6nrz>#OLmwq`kHMeey zc=g6Q>-1KEvL$1aI>*TmiYHnjVwrkkbFt7{*`Q}dq_%BL2#HO*cy zwqz26^dc9iIa0aPHxq^Y-7#v{#Je~H=W$K@r!pSesD)z8?z4t+u_c!DGl$M+wupa{ zxzhKgY0XF6CA!{W!Cmn|sDSk(NSBbT>cZl1F>v>f8~LYMZUwe=&U}--7GY!*fH}rf zvj!A(<`O1<)T*O?O^!J?}JK~pXuH}5(|Aqs|XFenn%ai!e z%zaqd>2Xjvna?K4VT^ivhjC5qyY<2ZAN@c{i}jrsP*WUAxoav>LK_~(F=lMeG8R>= z+Wi)CmitZibODkQ1lD#pSscPb8-5s>M-xW*8v1fjA8(Geovs$xb zc9Yej4+v|-ef)XSZqGu?fm@u^FU`7_vFF$S{-C+GDm^r7Rmkh|YVeU!tHJJ_;3Uz! 
z(qbbT9mrPioc4G9g3;ca412X* z&eUHe{I}e0U4zoaDYJSf>&#ZCXZ4eg2lw9BW*1Gj=zgc3;9WNTfHmgIVewr%&NvWl z(hg&%1U|bkeTqA{m*eF!ok-wSRF+$;t3=(hAj4jc2e^c?`DBoZVV6q|Tl|9xmE-rp zHpnQBJsPNO-1}_(`^NS{p8am|PS9Wp=0*wS6PT>S`A)v>em=L^PZhrhRSd!jyz9%& z%b^(^Y~1YhHeq2gE8hmq-fLK@pxc?nCw|T_x*^r*6@!dN*?Fp62{L+in2IHR(h)_1 zVp+X!$0YUE+f1wD=Ral;9t}oq^MXS_%X**v;Dx=5#TnMSKdIpiRfXgt-|~Yrf_8Ui zQl~ZZ(UXP7%zHWRcg8KOO(w%fdaK{}-?VS+A}T&E+Phl_DDG9_EWFE0b;kt)a(Ao$ z)NpscVayKtWZ%RoF~Px|PxvqzG0MaD((J`Qd3qlA))SrStEiCwvi0;?U^03nR^pYX z<#BJS%!%(-Ep0~>^^X=ZiaFl<92PTl^6zTULlI?dnbQXgzJu3fC*iz~{o~j-4BF7} znDF+gw%8J@*x=sA-rkN~qcyq73f(?xDD_9D@+5-YRzb&58TP(ulq}`LoQ_l5riuJo z+|b(#ri29eh<0t+zJdil^;3Fv+e=B=M;KQni;((Xcy1%YH_Sg<6-r1@+DLZ?Zp~b zmNJ8vcx-jke4MKaqjX~PL({88sX-lWT|<1)oy|j{?4?yhjcn0{J|whzZkc3Xk2)94 z;$I~1OKURwaW!o=oOV*w;XOiK$7SWm)bW>ay|lI5r-umC6%jtrC=S zx5~#=lr5}~zGDZ|0YDri?uiY==ik%lmbyh(#Qf^HwDekh*mC#-buaz8v#a|H7qkkP z8SO3}N-n8s&JMA#Xy6>#g-{C0i`|&#ieckJ^Q) zZu&M}d~4I@VqeCj`($JD<2^C$V{$Z^k43d)I`VY9=?!{fblz_bt#W=jY1?-E@D++p4Fsh5snH+iQ}0 zMPDa>E8=pS%=8(_t`S|c^I)w%?K3)xyj?F z-<b!aSI4+ZWXTxwJ>VFvt-UHJV^rAf!l zi5_;1F;)o=o1)*c+4gT}F{rYX1&NuB)QrYn84yWTT0UfMpluf>HhfZuv!nQDjq4)! z;7bXvZ7CC?Qjvw!^8;83C2U))j?SG_t8Z$4uoX@Zp z$FBE=D(dT!x17kZMXu*~UeeD{p2-}qyZSQv?)2ghzD8O&w{PeTHJm1(%T|qlJbfSF zyhRg|y@st0q18QF=yaCkZIfc5d<}gj_ z*2X10xbo@lyMSB=vjltBFT1CCcZpc}CpNFMj^7kJd^lKTLB!izR*9sn2ry!s)$(t9R;~7M^i=Y`i+7T=I?1TH{zmV3CO7#G9bgS^TGM zt#{+^g5aCixe77asJ_x;*}LzGWLL{v++aS*g5qYg*QwR4=<$wUI~)F7V`phkijARe zg50hIqK`I@u@{C-EU>|3a3PiYDC&5X_w|hJy!gm_T6Oe^VQps2kwprY^|Q?SAU(s~ zch(JKV)7BfPl;bQBm9{~2Apng>G<@i6^ zWiZp29Xwae^%shBUd^W7(ir-|mH4>MQge#0#%9=xf$2%boqIu7v&@YypSWG&RIXy` zAAmo#61qBa&hN0&LfeXtl}5{tZ!v6_Y^Ak4kM40Tyn0p+fBz%0eue-ES+HVds(@?x z79DIIjaT#4a(W%sHY&oyEH2z-l4b7DDSKH@YD4zNKiZKTaCh68`P;koD1e6;>^=z&GQ1Mt?? 
zQ;yRkG%nahrolHr?6lOs`{)If^?uAUZ%T~l9G+pvlkq)2rMycRk8}Iee~J_-#K(p$lcd;$MZeWjoyFvK7k+_+(9^D}6~3UpHTU z{+;L1!HG`K+<@dKeiMRh#JMy#>J#_x2v=&Tm+;laevwid}e?8Ds& zGIzhN<98dqj$FQgy=maM$7+H%S2|t#{!f{Kn`NFnTfO)C@!Vop*!qct&l~rbcKB3R zgT3db)07Y)J>hG|uKkCfc^kI#$8Fj8l;#tdwxaYs#SFvWKHe`7nR_)y|AL*z;ytg( za*Lq3u1aAc9U5GcfsE8dh9E80F5huFivPVhEw2F4`m9>n%HQ-ubkkx_TD-)47x{PY zKa;>`!7~{Xz7nN#Z7StN@3;0mvr5mJ;*BlkI2|vw;)Wf7JCeBV3AWy0!M>~m`mpwA z+zWMl-iT_RDkI1RrnjYMJqvsOt!qRwzVNj@YQ|=lpokr?DG$VlzmCu%3*eTEn3GNu zZP4Re^24cj^~VS78y5;2OQ!Bzyq)thbsw-LD|~%CT>cMdJSG&x1@VhW!mxj=V%Z{fK?1E=_7 zYQw?GA;nmH_u{KG>*uDZZ+Do=tC0S`T_4&+M73;GmW0kES>|UPm`{p}s%+1;7hN4* z^LKybps^tSjbTFTLXX6tL!ZHJt1}6PVc$fB_a{oM_#6vt_~DvactkBR>qpZkz1YTk zY6c;z)YOXTk#~cZ34F0JCeCM_?tCg>e)WDJD%$&mvop2nTT6!*22;^t`h7b?i1~vF zPtC4`WH2s!N8UWc{{G{p zLeW|GlQVnqyY{Rcb#SeLO|dtuYUHG638{;TQ{C)~>>B#f($2B`;oj}D=~TM_WqSA9 zT&{Lu!J!6~xji4-X`4Q{TTfm#zvarH*7!9XwXV3p?QJ02p?g7y?+hS!bv+KS@JQN& z%7-ZChK2uS<#&rg{6y~i_*tE-nBXR_57yK>^Lk!dcL?1a3}FdF#=$bL z`f#`NK~D88yTG=}X`B+v$cox#Y|^ODo0w?ngZk&UWnvCz#_aPAuD*O4b1tpb$-W)+ zbPLC#^-W!J@Pk5k#Gz*U>O@rSR@?-ca#72%FQy6#N%PgQS9nd<8o4Ik@0yMS)s(l-ujFCQ;SowJ6yi- z>*uZxN@%p)63Cn9pVVPeM5C_B2Gczo%P>w(-Iqms`I=&H{$0;UR{Nygo{0^ovVRTj z5LM1Qkg?qKX+tIeUzRkZj4g}4aS<4Sltb7_H4DSpw)U_=R_vr@h{J2H&7uPtn>%wK zcxEyTNdvg6Q>fQ1rc`rJ{``Dlpa0{&*o%{&BYi5a$UMrA@I7WJ9q962Iq8!&W zRx#dN)3w^~ypHB&e2wktuy))5+R)RM&0`nfH*$UDs?~&_Nzb!s+u@a$R=G|FMmJeY z8}{nNQ!mmCb__gz)kiB^A3LDwQA{}M|M(KhU?3;V%kO=g*G&(uVTl`;yBV>l&V%!~& zHQF2_hEEBv6z_)B2e~cvZ=I2&eG(zU)ez7YXzy|Ec4A6%-mDj69RBEonH}qmP zc(gRL78d1VtkYhaDO=yV%DsPKf3m$~Ddynh`VWT^*RET^c5_)AR|GStd&`O|$}n=F z(c=;Rk3o-N@c*a1@BW7?`1+R!2_m8g(IR?Ti54X`ksx~SJvxi%!KxvNXh~Qi$`UQ= zs;igiRuFylURJkQZSgGmeDZxg-@oCRU-os+y?5s9d(NCPcjkigQ1|qF)xpXRAA`0%omDAbW`^^&j}^79)FykX|kfA(hs@?T9msCaLY{&pPV$?`F^^t&d?*eHT6qJ)`OC#FJ?DRd) zM&`JF?!e{&INOTGD{BAM0sr+&R)GA!UGrCWli9;QO0ctAHJ-8$Z}hG?ym@Ri zG{C2Os9^Fi-M&W~7Fm|Y29cD1p5IC$&$gw(m_lvGDd%?XxU!*?t8&B?8klU-PZ3uY z!$7HhB|6fJZVRwK6JrtSs6^SLYI1{zz?-|%#oj_BKgtxHZ8cYWD{tNQ2t}VoJ%zss 
z)sbcz{qlagmuMTwc!J|2scmK6oD0W!_i@2P)W~q#w1>pgnxDxELVw&vl01FGj#0ZF z61A(hQp?6R9ws+PB=!)8m(@Jcp6{@YYPDeDdq+nwP`9}m?!38GV5+&d|13s1C-qiC zZ8D?sp{+%j{GuvtwCbYa1PTIV58yeS;(am8DC8|i)n(F#eJ1Mm=eXolk5Iy@Aw}cl z;27lX>oG#7Z^9#4Vpeu8#4u~@KfS+``9&0LshUD9tI1p}H8adFS(Gwrbi>zb(DShD zJM?~T%c-%q6Ymm6_-%6*)+&`%KQ-x?=9(Z2ZX4!hA^pGK|HI+`lY^Z{`n@aN#m5vy z%VHKJcqawRd$qVLJM$e?+=Q@H>*n-67brkzd9V`ElE5=X;*PAtP7ELQ+j-E4nl--_ z4v4xPl;D1_WzA$n+J7Ana-HG1!ZRTCemS;%LsP}#PNWBMn-?-{s%U~XiiV2TaOZI;&A zJbzTIc%hhRClxmd?%ldF6`tH==L;vk2FuA^GHv~3C!D)t)SDf=uopKs>`^}EX=1S_1;!KCZ5*DXvp8{UDHmRZ zU8Xf>XY^r&o4rT7lZ(zU?t!ows7j+X6k*<%cOWS*UMN6%&;T$Lu9n8mPCeul0?2~w z=0&IK^8yNj(HAWc2GI z!K@DEFy(8r{5W1%j%J*iH9q*B^@nqZ4j+4Yc-(}6@987poI6cRAZd3g`ppj}Z1(d> zY)MvV0J4rG9Scg_5yfr6l<=FPdK0#>08xreK#b`HPOM&(6UpNbaM3&XkBm%ZRlEaW zo>}udDw~XoE*SIzV#N5>9`&O4J*I>Y@uN6W4){`z3gj-FLeXWhfw}}%Oymf%NeFG4 zZ{U(r$VgMe)ZpD9PuvHbyEiN*+3$wGXnnZv>2lEcktMP=G=qsD%zB|QF~B_?R6u1d{CM|VP|ZH0 zZy>zSVV~KME6~6y6lT(qPUuA;dhId}zjx#LJ#QlUC`ZqWr>y&Y(6L2>(#_!R%^0~3 zv5E*jc2pvK>M?GwYncNUWtUk>E@~F}>3De5^oq}eGAIrMLWpXA^73E+-Q^>ov+9?cbZ&)E^$6%eCgk>XzPeR$Dy1NKBp_QbnObxr$OBR0CX-oDkrTDy%;!2VqcpfY>U{)%VM z$*wAR3x=}#4$XQl#&DQL_n^?ka;;()S%Y1^*@!^6WNF*>jfIxM*5CUd*gZsY4Q@_m z9o$am(^@yH7rm=d+4*hvt4J3^_heJNPCRp7&; zeC+WZ_hEQ5G@*}ghg|i|d;Y!zs!YMI(o&TjIyTVZU>4MzLO6^zZS-Fj)9 z)4FP71=$_YK0xn>6&(B9%yiLEVW4M+soOUl{JL+Hv)16bFxtIQVt0FJJFAqB%(!#e zw4?^EQ}c=!zvx?jBE%hRr?@*p(;N-gtP>FjnbrFr%ca4w*nyf_k6K68YlqK21st;> zz`kwy|wfZT-2_5=L4$$oZ=>(5rqH+3uWWMSmt_4{fv7Bu9Jki|DkyVKSUK&o$U|c{8R?+(|cf+a&PJ z=>8$yo^pcRYz`Nu~!un?ju~io)^nXGaAHm>j@3 z6YaQC?0tAaO6q0(n$cW-jfh7uyYqw;3Q3=zQ z$#}z@=vjdX!phb+8U$v%PD#854_c2~@0X(pwC#EZpR}9h zq0?~qQawq>xXIztrV1$_j34&`AvcNbXp2m0bj=MK>y!92CB2ct{{DxG8;J(kx3B-g zNhwiseCztGQHc8c=XXP06W$D2oMFAhzq}PqLAX01_0Ce#V#yepe@jE}8(!ScR#}w3 zGZf~+#H2sNM5|q+R=xQf)G%NiMrQ1O@XXu!-nOb= z7k%v(i+ymACj~1G)3O;ix+gem?`*~%C#IB(dk(JEqd7N(M*0~JEh`@9MMSsPmc1y; z|B3fd2eBOwCJhnXqKKyV>Y#7T{DG|uLpLZ|(X2!4N>)-4Kj{T*vX4THHRGN}&F9qs 
zl%Mic2mQeHK+M3@<6Bu2YbPu^4NE_hGy#6+d``-7;ZfrahU^>Gg*qIzP7@uqk6B0# zg6Rl64TBiIIx+voOfJ3P5hcE-W-D*Q;(Wxrt{zdh!#~iCGTw^38j$`TfNh(GPpu(aPfHBxHix0=)HTP~ zgAQ-}Q==a|*E9NEcULD5kUrJ(TMwn?g~F)!qjOn-N>z`&XBj^i9#58gIH$mfQ_j8* zR#?{8?L`bPDXL~(--O-QL~}qMo)tKG?}|w1ROKp1(AwD3ANO}z0=26!>?M7y8@2kP zbDKGg6In_HJx$1=`HK71DK25f&HH&Jzb)3+HoGhLGPUcO`?_eoyX1(q5^5YqIj4C8 z16+os-+`7GV>qU$i{t8_FK+ zSmgHOAqK3GM_+ z$NBqqC7&~uOeO%!#yMIY7WD1#v$AfvfQOpzDsOlCwc#c0x^lvm5{hqw#zK?HKP`Vk z37JDzb_7Oye!uUS*t{}rU`aK_u7qxrM!y8*&@gv+bFvEzOXoBDrjH&c8;~^QPN{rY zw5L;5C->;;q{h{A}dC@at8C+5aP4fUxTkXKT_Os2aUU)a~! zv8?po-EIz>Br@I3qM+gmcE)#a!;F>(XODJmQOrGhReSxrSb;%}*O&a0&8fxtcb^>0 zx9l#q*gp}D#0(?s8qYy54-SxTmN`D|XiD9ijpeeX`!tawy+z zZZH2_Cf*cl7IKeQ8cMuTav`c-(o-&QcVw0CDcTW@6WVS5BIwq@nNE>n%KUrgdbF*F zRE;42=1z~9EVpSM#Hl%KV-Srq(>&{wEzz! zv*(<-sRKu?@>9%=3a&8Jv1#V}BX6HjtXizY5tel;TeijKhtVxDW_oyvoiM9CFuLEs z`)f&TS5HtM%qgeW=6J z_;Tj_Fa2I;268{_2;bsXXaC1pR7uX4r$PBAiEK@3t82dP`l9fxnn*LY0u?2&UX5#7 zVoL3X--vOqjZuiauoE4}d*jE>4qZK=H`PGFkOUfQ4izdd=y?uSzgKC+lG$1H9bwk% z@8=~02Cu_E>n8?p^0DyespTcMj0{gCn8oJAIV6JXl;Gs}I@SnB=6=XLknq5Y&+fGva zG(M8~mk4i~Ps{f#BdtZ7v+SgTw~uzXCd<#Kl&DAxXmP#`4lH~lqkO!XTmL)*f-Mro z4A&h;$4MWw<%lCE89C!e&yifC1QL|^00#BCX9*<3{?;^!lxND^iel77fA*bB^|#@O zMJ$qd#vc7~IZ||)TwHPx76aKTxxGqZpRJ>D*7jW-WPx%ePC@jDH~+V)QA$=llXIkkdB5J9LTXl2-r zuI($)W3Q-OUKwktJO|DhHy*g0mt@aO1W%U8Sbj*w0?D9&1;jjG*}i6!t5=EZCr_w- zjnoTzp69e?7jIZAI%}j-ozXucvw4)p)HIU*D!D>#H^mBN>A1di6WV$7q8|C-mZ^-! 
zK+kHsgLSz{px3U=DZ$bEe?nFGq|9JRWt*mE3w!cLf^ofg=T0O0uU^F{s+6w_zXsdr z57I&h)%23^NX}VtTOjuZRZOLM;`m?m|Jp|{0Kss&X>K=)yjD#EGl{))5m#vf?S=wFIuMCFt@1Z=Clq zUIeUtB11RL@GnAgob=W|yYXtX7hrP1xYLlv{tF@K&iX4~%Bw~{gNdr{5ebrbve>xq zA!@g7&rv6a*t$E*`&krr65z@I2-8+M{E;1AXK8|ZfJuA@<{hWu@vwA^A|ak4o?@i^2hRT%H2UJ4SjgW&tXxnt9gfv7QNC7O7`~2sAB~ z5?2%t&b3GHlLDQ;3lHy)vff${nEPqrAk5F(^;0{ywR%8NVQ5U!X$r>kiJkJ(jOanp zt@0DMnAv=iDaKmvGr_7{O~crF9^^(hslWt#0R;%s#l%wePR`X-=D7~Z=U(2&)pqkn@2Q>7mLZ^V=Z?5V)pRDz?V!i+CQogr}`ccUA8#cUXRl~IBn$DNA;Pw4@ zw@Q!RY1}Ze@4E0G0h8sD+foAm8;q_VfY_9a;JsrZ&S>rPkZ5LFaU32sRtsIlOZ-aj zVKRRMwTBAFiuP#+_PQb9oVBOnapz}wKdeYA+J|Z%{Mm}B6+5u5aDi-(Mrd;vc9~N1 zouVtVO`3Ur=_VFD<#@+^sXH95VOAf2h>cb=#^FmhSRb16yVVXps9h4R>56zfz91v6 zdu5R-vpaiERK$ELMCd;d#wWZN|5TfxUg)j({c^0pfG_BBYaRz3{tNK?1=glLh`{%`RIC8nk+AZIs0@kWqPgt;@5e+)`sYcO zUjn%fNF&uhPVN3b3{!~Zya@?Re#B53=AN<)*glZK4cni>4 z3_Vu>6;NWo$)`tA=;Ay}D~_Z9WyO3nG>8q>{_QDK4gIfe_83E-boJzyGR*kygraG`-P|Icxt9;o zs8t5+Inr3wabL~#FvefZXE2p*^fE*_u zstDkiX!lV$g7h{<0Se`%LgwoW94b_t{0-UTYBlhfCBuVawM{yx@TH z9TPa0z^D5Wvy2UJ97+f%jf;QiL^ysh?{Zh) z!MQ9E`cY<5IW^h&sA597%p*~R?p2#HXWXec$P{Oqy;29PyVnzBd@6zLOW{j>$eIIT zd&XBK^mG-w%7!m|=$(|vPubEd*{>8$#~Po%r16Q>>j{qS5Q&lr%C+*ims9oiBWDf{ zTnSc=ooTna-fnGF`+v4Q2)msa&J57S2~esi&nep4>B{rVHeqKdD3k)Er_~UA^ADw* zSfSM+E5EX#D%5H&o zJTy!NSjqK!>JzQp_q6&b5E$%n)46=ILTJ|u58kpG!^%4$lZl_ zJBsLPe#2laLTf?G7C+yex_gS6n>?qmne!Ik&v+bo6RsMhqjLcwB5^RLz-?YC`bgV= zpF?tz(qEDh0Y_oS5I2;p2ShU^z1-GSj#3QQi%jwvJy%hlmD-s}yRNe~O&?BUIbXA3 zo?GGCDMn){WJ z%vhax11w;C9fTth6&QiI1KXd$p7a7ul{f#Z%73O8>6fD3idL-D$IkZlw5UYeoG$Bc zag0l+^u-MOlQ3s+|4QCMKKaJTojY~!2&dlbj9ato*A7kNY9vwOGGiPo*=3fel$mdN zfk_#=b8X;&Y~E0ML4p$3w~wJvCZ0cGB1#D*x2tAI_N%lIuZt)VXZJ;6(+#)~4^_H! 
zFlge|^0o7Q{(Qi(5JLaW=H)z+p@Qh4tq}DSfs_*YPEyaK?5mkf(kj4%yeEUX`GJjh zTGnjc1k1Ml79|QHM)fN+l4^Hs)9JwqN%V@#IT#Kt;Hok&Z-@|+W7Mb5@%OW|;qnXk z`Cx-hB#`VU&QLye+Ta@Z5FIQQ0#w__7spn}CGt290%>8-iG>Ut?^#jk1bhFT|~BV$z--#r}CKSH#mOHl57g?++7QW4YzU0rl9 zu}W^H(Jyhb=`vc~9Dx=S>SEQ2nojZlp>DC)`K4$7-ndjU0TQla@ub~$EvX)0rzCjz z2kB}q^hJ87WH1Wtc1;#lFjaQfow|=4A#{l1ww$k576JxHjF20n5GrCfn_;QKxKMg2 zl0X1p{m8R0TiyA;Ge(!-?wNe`PIxRV=J-u>D*?#Hf4?0NrQhS=_O-Qy+Bq#x>dfu; z=z}YRz9_2ut^^fPK|Ero)P4XZ$$zNfy?uV3`C0Cl!vsA-iJ5JK5V0qJI<}?7BfBIo ztDRlW;Wyri5mD95GiY`mp|?hiz-@^K!l<>w-jJr1$L!?NWqn--cMpHkiLnH`-Q~#> zh{;i$&M}kI%2P?7%nQ=}&m>S%5iY}03B7(*D5)>KH=rM6&aJyH;IZC!Xg_9$H%;_U zVKR>1WM~*YD2qtlQ6B;DrTC%B{GF6A;6Of~l;u{C^b?T4AqS1SmZ`(n>4LyTkBpjR zc~{B&t@!~&7Fd3T&1UnN$dRnX9~xld<;dYaskUBJzq9pVpKGkAARpQ*7CGx!~G6{7v9fC`;91#)uR%ba^Zo_!@T z2y2kk-P(Jn;r_M=AYJ=f;M8+%b!_|WY}43NOZ!E}*3+k2fgc}G>zvD)9LwT>@?MDS z-N>K#e!@;QRp8nEFm>#D>%oe_SvEevr%UtEYWX9ng7;tO6&WcHSoe6%=^)h~eej zpy^sMT3@X(M$rA^8bU9LwE&MhvrmK634^Y^HXDn85%YV~D?RrEHM-4(K8=WWxwr;i zK^iV0O?3wye0@kA%nuIzQ@eE+7@r5ZkX^R(@pN{0`WP&I0T!k^Xw)5E!9wi}V%c;|w1S@OWQ7q7C@o|B7-K$VU@+ zj_QaY0M=h^r&51YtB>8yQM<4#ndKb0YMY|5(SR=i+tomJm~{90)oU1mnLeO@Y31=b zcjy(N=?sQzUXknKS_Cc&7OoIG(?$TlsGsjwVnvxR3D4%eNh)6+59UAxXuMf(Uy^29 z_SW0xZL?#6XLGMz_fGHLx}@&(A1}S$wN>Q`JUM>73uC=|_2|1jmx#6t>=#^cD4GAzHt!OmjL|}XS?^p{ z*QCN#8^h)IO_T8QN-$stXvSE=y)+a0;_@1gIbP>!Iuc-Q z(~NiZG!YlA5?69{WQVMYxHw-(`^11 zyq#-DQ$WQV7YF!zUD3c(KklVK(wwV1;6v<0lm~kJ@aS>iK1thYaM%^rJCBZV)C)u0 z(VyReN7Zp?z_GaF*dq?YN*KC|SB5?T>m!A#CSqgd^_#_cqc5(0Fj@B{T zX(syhZeU6rpN2u~*LgN); z_Z|ck$}YN`V@u5D{o;M1& From 2b3f3d4b613ba9369381b0e2e94e04771ad67721 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 2 Aug 2024 04:42:57 +0000 Subject: [PATCH 49/83] Corrected sidebars.js --- website/sidebars.js | 1 + 1 file changed, 1 insertion(+) diff --git a/website/sidebars.js b/website/sidebars.js index 0ab11ee7ed..a26a38b338 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -52,6 +52,7 @@ module.exports = { "Explore Algorithms/OpenAI/OpenAI", 
"Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding", "Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding and GPU based KNN", + "Explore Algorithms/OpenAI/Quickstart - Fast Custom Embedding and KNN on GPU", "Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms", ], }, From b19a895a20a8c497ee7adb036409833b9aada4cb Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Tue, 6 Aug 2024 19:53:54 +0000 Subject: [PATCH 50/83] match web names --- website/sidebars.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/sidebars.js b/website/sidebars.js index a26a38b338..5ef56a0f78 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -52,7 +52,7 @@ module.exports = { "Explore Algorithms/OpenAI/OpenAI", "Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding", "Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding and GPU based KNN", - "Explore Algorithms/OpenAI/Quickstart - Fast Custom Embedding and KNN on GPU", + "Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU", "Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms", ], }, From 050ba598efbcb35b900573c673c82d3efc50393a Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 23 Aug 2024 21:58:34 +0000 Subject: [PATCH 51/83] Add initial demo 3 --- ...ibuted Question - Answering with LLM.ipynb | 807 ++++++++++++++++++ 1 file changed, 807 insertions(+) create mode 100644 docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM.ipynb diff --git a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM.ipynb b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM.ipynb new file mode 100644 index 0000000000..e7e20ad90f --- /dev/null +++ b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM.ipynb @@ -0,0 +1,807 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6b31dee8-67e3-4bb7-a501-269c69c80d3f", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "# A Guide to Q&A using Retrieval-Augmented Generation (RAG) with distributed local LLM embedding and generation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b4000620-9ea1-45aa-be4f-ddb971cc708e", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Introduction\n", + "In this notebook, we'll demonstrate how to develop a context-aware question answering framework using distributed local LLM embedding and answer generatin using Hugging Face models: [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) and [NV-Embed-v1](https://huggingface.co/bzantium/NV-Embed-v1). This notebook extending document Question and Answering demo to use only local models for scalability and acceleration. Question and Answering contect is based on NASA's Earth and Earth at Night e-books. \n", + "\n", + "We’ll cover the following key stages:\n", + "\n", + "1. Load PDF documents using PyMUPDF library.\n", + "2. Use SynapseML to split the documents into chunks.\n", + "3. Generate chunk and user question embeddings using NV-Embed-V1 embedder\n", + "4. Using NVIDIA Rapids KNN find chunks related to user questions to define context for LLM answers\n", + "5. 
Using LLM Phi-3 from Microsoft and Tensor-RT GPU accelerator answer user questions using provided context\n", + "\n", + "The demo was tested on NVIDIA A100 based databricks Azure cluster with two workers based on Standard_NC24ads_A100_v4 using 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) Databricks Runtime.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "db0faebe-2cca-4bd8-ae28-645e69a21bb7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 1: Define the notebook environment" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "530e6ef4-b620-443e-a051-4164aedc43cd", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import StringType\n", + "import fitz # PyMuPDF\n", + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import StringType\n", + "from synapse.ml.featurize.text import PageSplitter\n", + "from pyspark.sql.functions import explode, col\n", + "from pyspark.sql.functions import monotonically_increasing_id\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import col, lit, explode\n", + "from pyspark.sql.types import ArrayType, FloatType\n", + "from pyspark.ml.functions import predict_batch_udf\n", + "from sentence_transformers import SentenceTransformer\n", + "import numpy as np\n", + "from spark_rapids_ml.knn import (\n", + " ApproximateNearestNeighbors,\n", + " ApproximateNearestNeighborsModel,\n", + ")\n", + "import pyspark.sql.functions as F\n", + "from pyspark.sql.functions import concat_ws, collect_list" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + 
"rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "97f056e7-9f88-45b9-b6b2-95be8c7fccac", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 2: Load the documents into a Spark DataFrame." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "eb6519d4-f03a-4359-8a6f-4922bfeedbf5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "For this tutorial, we will be using NASA's [Earth](https://www.nasa.gov/sites/default/files/atoms/files/earth_book_2019_tagged.pdf) and [Earth at Night](https://www.nasa.gov/sites/default/files/atoms/files/earth_at_night_508.pdf) e-books. To load PDF documents into a Spark DataFrame, you can use the ```spark.read.format(\"binaryFile\")``` method provided by Apache Spark." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fb39f605-39f8-46d1-a9d3-28b854586852", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "document_path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/NASAEarth\" # path to your document\n", + "df = spark.read.format(\"binaryFile\").load(document_path).cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "34e06daf-e9e7-4144-b956-e57bde8fab77", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 3: Read the document context and convert it from PDF to text using PyMUPDF library." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "304ed77d-a032-4620-a74d-65a277caeaf7", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "We utilize PyMUPDF library (fitz) to do PDF to Text conversion" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "04b58ec9-8a8e-4575-9df9-c8e84c6c4a64", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Define the function to extract text from binary PDF data\n", + "def extract_text_from_binary_pdf(binary_content):\n", + " try:\n", + " # Create a PyMuPDF document from the binary data\n", + " doc = fitz.open(stream=binary_content, filetype=\"pdf\")\n", + " text = \"\"\n", + " for page in doc:\n", + " text += page.get_text()\n", + " return text\n", + " except Exception as e:\n", + " return str(e)\n", + "\n", + "# Register the function as a UDF\n", + "extract_text_udf = udf(extract_text_from_binary_pdf, StringType())\n", + "\n", + "# Apply the UDF to extract text from the binary content\n", + "analyzed_df = df.withColumn(\"output_content\", extract_text_udf(df[\"content\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d26e4217-ac87-4583-9500-af65d969c199", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "We can split Spark DataFrame named ```analyzed_df``` in chunks to make book analysed context smaller (3000 - 4000 char) using the following code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1b471060-8175-492e-bbb3-5b3529480b33", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "ps = (\n", + " PageSplitter()\n", + " .setInputCol(\"output_content\")\n", + " .setMaximumPageLength(4000)\n", + " .setMinimumPageLength(3000)\n", + " .setOutputCol(\"chunks\")\n", + ")\n", + "\n", + "splitted_df = ps.transform(analyzed_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d51caf1d-322e-480b-8391-d266aed6401e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Each column contains many chunks for the same document as a vector.\n", + "# Explode will distribute and replicate the content of a vector across multiple rows\n", + "# Add id column\n", + "\n", + "exploded_df = splitted_df.select(\"path\", explode(col(\"chunks\")).alias(\"chunk\")).select(\"path\", \"chunk\").withColumn(\"id\", monotonically_increasing_id())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1e5b0f56-0a64-4e4a-86f2-b647e82b41ce", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 4: Generate Embeddings." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ebba439c-9503-46d7-bafb-f7fa790974a8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "To produce embeddings for each chunk, we utilize NVIDIA NV-Embed-V1 embedder from Hugging Face" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1f41cd67-1a27-4e69-959a-e5002b4fbbaf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Define a function to create the encode_udf with a custom query_prefix\n", + "def create_encode_udf(query_prefix):\n", + " # Define a function to encode text in batches\n", + " # def encode_text_batch(texts):\n", + " def encode_text_batch():\n", + " # Load the model inside the function\n", + " model = SentenceTransformer('bzantium/NV-Embed-v1', trust_remote_code=True)\n", + " model.max_seq_length = 4096\n", + " model.tokenizer.padding_side = \"right\"\n", + " \n", + " def predict(inputs):\n", + " \n", + " output = model.encode(\n", + " inputs.tolist(), prompt=query_prefix, normalize_embeddings=True)\n", + " return output\n", + "\n", + " return predict\n", + "\n", + " # # Encode the texts in batch\n", + " # embeddings = model.encode(inputs.tolist(), normalize_embeddings=True)\n", + " # return [embedding.tolist() for embedding in embeddings]\n", + "\n", + " # Define the predict_batch_udf with the above function\n", + " return predict_batch_udf(\n", + " encode_text_batch,\n", + " return_type=ArrayType(FloatType()),\n", + " batch_size=1\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + 
"inputWidgets": {}, + "nuid": "defe1c52-1637-4b55-aae5-00174057f1e4", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Use it withhout query_prefix in this case\n", + "query_prefix = \"\"\n", + "encode_udf = create_encode_udf(query_prefix)\n", + "\n", + "# Applying the UDF to a DataFrame chunk column\n", + "embeddings = exploded_df.withColumn(\"embeddings\", encode_udf(col(\"chunk\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "183c4c45-03bf-42d0-9c10-24e5fe9842da", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 5: Use chunk embeddings to create KNN search model to find chunks related to user query " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5d725803-3475-4b97-aebc-24ae909eebbc", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "rapids_knn_model = (\n", + " ApproximateNearestNeighbors(k=2)\n", + " .setInputCol(\"embeddings\")\n", + " .setIdCol(\"id\")\n", + " .fit(embeddings)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "17b3890f-4163-443c-929b-252d62a6c736", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 6: Compose a Question." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8826a0fb-7b41-47a9-8d65-8885dcb1248d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n", + "\n", + "task_name_to_instruct = {\"example\": \"Given a question, retrieve passages from the provided context that answer the question\",}\n", + "\n", + "query_prefix = \"Instruct: \"+task_name_to_instruct[\"example\"]+\"\\nQuery: \"\n", + "\n", + "encode_udf = create_encode_udf(query_prefix)\n", + "\n", + "user_question = \"What did the astronaut Edgar Mitchell call Earth?\"\n", + "# Define schema explicitly\n", + "schema = StructType([\n", + " StructField(\"id\", IntegerType(), True),\n", + " StructField(\"query\", StringType(), True)\n", + "])\n", + "\n", + "# Create DataFrame with id = 1 and the user query\n", + "temp_df = spark.createDataFrame([(1, user_question)], schema).cache()\n", + "\n", + "# Apply the UDF to generate the embeddings\n", + "query_embeddings = temp_df.withColumn(\"embeddings\", encode_udf(col(\"query\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "45f79485-be0f-4b89-9c11-79f9102436e7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 7: Find chunks with the closest context to the question using embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "45f7b558-4c32-4e08-807b-9e568dcde8df", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "(_, _, knn_df) = 
rapids_knn_model.kneighbors(query_embeddings.select(\"id\", \"embeddings\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "da023b2f-d7d7-4937-8139-6ec999a77cc6", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Add text to the results\n", + "result_df = (\n", + " knn_df.withColumn(\n", + " \"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\")))\n", + " )\n", + " .select(\n", + " F.col(\"query_id\"),\n", + " F.col(\"zipped.indices\").alias(\"id\"),\n", + " F.col(\"zipped.distances\").alias(\"distance\"),\n", + " )\n", + " .join(embeddings, on=\"id\", how=\"inner\")\n", + " .select(\"query_id\", \"id\", \"chunk\", \"distance\")\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0180ef1c-3d59-4922-b918-80eaf7badd9d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Concatenate all strings in the 'combined_text' column across all question related chunks\n", + "concatenated_text = result_df.agg(concat_ws(\" \", collect_list(\"chunk\")).alias(\"concatenated_text\")).collect()[0][\"concatenated_text\"]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "79356cff-a236-4ef3-91f7-a601ee38d5f9", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 8: Respond to a User’s Question using microsoft/Phi-3-mini-4k-instruct LLM from Hugging Face" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0b8b0fda-bca7-4cd1-ae0f-8438ca2cbf3b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from tensorrt_llm import LLM, SamplingParams, BuildConfig\n", + "\n", + "# Put model in global if we want to reuse it\n", + "global llm\n", + "\n", + "if 'llm' in globals() and llm is not None:\n", + " print(\"Model is already loaded.\")\n", + "else:\n", + " print(\"Model is not loaded.\")\n", + " \n", + " # Extend model input sizes\n", + " build_config = BuildConfig()\n", + " build_config.plugin_config.context_fmha = True\n", + " build_config.max_input_len = 5120\n", + " build_config.max_seq_len = 5632\n", + "\n", + " llm = LLM(model=\"microsoft/Phi-3-mini-4k-instruct\", build_config=build_config) \n", + " \n", + "sampling_params = SamplingParams(temperature=0.8, top_p=0.95)\n", + "\n", + "context = concatenated_text\n", + "query = \"What did the astronaut Edgar Mitchell call Earth?\"\n", + "\n", + "prompt = f\"\"\"\n", + "context: {context}\n", + "Answer the question based only on the context above. Without multiple choices. If the\n", + "information to answer the question is not present in the given context then reply \"I don't know\".\n", + "My Question: {query}\n", + "What is your Answer? 
\"\"\"\n", + "\n", + "outputs = llm.generate(prompt, sampling_params)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2c65275c-17dc-4a30-83eb-ee5b6695a540", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 9: Print LLM results" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b7fb3394-289f-4949-835e-3520323a770d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Mocking the custom classes if they are not already defined\n", + "class CompletionOutput:\n", + " def __init__(self, index, text, token_ids, cumulative_logprob=None, logprobs=None):\n", + " self.index = index\n", + " self.text = text\n", + " self.token_ids = token_ids\n", + " self.cumulative_logprob = cumulative_logprob\n", + " self.logprobs = logprobs\n", + "\n", + "class RequestOutput:\n", + " def __init__(self, request_id, prompt, prompt_token_ids, outputs, finished):\n", + " self.request_id = request_id\n", + " self.prompt = prompt\n", + " self.prompt_token_ids = prompt_token_ids\n", + " self.outputs = outputs\n", + " self.finished = finished\n", + "\n", + "output_text = outputs.outputs[0].text\n", + "\n", + "# Split the text by '\\n'\n", + "split_text = output_text.split('\\n')\n", + "\n", + "for item in split_text:\n", + " if len(item) > 10:\n", + " # Split the item at the colon and take the part after it\n", + " result = item.split(':', 1)[-1].strip()\n", + " print(\"Answer: \" + result)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "412d83cc-4fe9-455e-ad3d-7780ed262dac", + "showTitle": false, + "title": "" + } 
+ }, + "source": [ + "We can now wrap up the Q&A journey by asking a question and checking the answer. You will see that Edgar Mitchell called Earth \"a sparkling blue and white jewel\"!" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "client": "1" + }, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "QuickStart - Distributed Question - Answering with LLM", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From a0fabc000602ef45ccd4598c0fb52f761355a64b Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 23 Aug 2024 22:03:31 +0000 Subject: [PATCH 52/83] Changes for demo3 --- tools/init_scripts/init-rapidsml-cuda-11.8.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/init_scripts/init-rapidsml-cuda-11.8.sh b/tools/init_scripts/init-rapidsml-cuda-11.8.sh index bcb8fdc93e..24e7aaaa70 100644 --- a/tools/init_scripts/init-rapidsml-cuda-11.8.sh +++ b/tools/init_scripts/init-rapidsml-cuda-11.8.sh @@ -16,7 +16,7 @@ # IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10 # also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 
23.08.2 and not 23.8.2) -RAPIDS_VERSION=24.4.0 +RAPIDS_VERSION=24.6.0 SPARK_RAPIDS_VERSION=23.10.0 SPARK_RAPIDSML_VERSION=24.6.0 @@ -46,3 +46,15 @@ ln -s /usr/local/cuda-11.8 /usr/local/cuda # install spark-rapids-ml /databricks/python/bin/pip install spark-rapids-ml~=${SPARK_RAPIDSML_VERSION} + +# install TRT-LLM +/databricks/python/bin/pip install --upgrade cython +/databricks/python/bin/pip install --pre --no-build-isolation --extra-index-url https://pypi.nvidia.com mpi4py +# /databricks/python/bin/pip install --pre --extra-index-url https://pypi.nvidia.com tensorrt-llm==0.12.0.dev2024073000 +/databricks/python/bin/pip install --pre --extra-index-url https://pypi.nvidia.com tensorrt-llm + +# Required by NV-Embed +/databricks/python/bin/pip install --upgrade sentence-transformers +/databricks/python/bin/pip install transformers + +/databricks/python/bin/pip install PyMuPDF \ No newline at end of file From 164d8e5135515615c15594df1ca2642dc94c7b8f Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 23 Aug 2024 22:12:25 +0000 Subject: [PATCH 53/83] Corrections in demo description --- ...ckstart - Custom Embeddings and Approximate KNN on GPU.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb index e979110a30..2d7ed67cb1 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb @@ -17,7 +17,7 @@ "source": [ "# Embedding Text with local (per node) NVIDIA TensorRT accelerator and GPU based Aproximate Nearest Neighbor (ANN)\n", "\n", - "The demo extending existing [Azure OpenAI based 
demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding%20and%20GPU%20based%20KNN.ipynb) when encoding is processed by OpenAI requests and KNN was using GPU based brute force search. This tutorial shows how to perform fast local embeddings using [multilingual E5 text embeddings](https://arxiv.org/abs/2402.05672) and fast aproximate Nearest Neighbor search using IVFFlat alcorithm. All tutorial stages accelerated by NVIDIA GPU using [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) and [Spark Rapids ML](https://github.com/NVIDIA/spark-rapids-ml). The tutorial folder contains two benchmark notebooks to demonstrate advantages of the presented GPU based approach compare to [previos CPU based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding.ipynb)\n", + "The demo extending existing [Azure OpenAI based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding%20and%20GPU%20based%20KNN.ipynb). Now encoding is processed by local embedders from Hugging Face and KNN is using GPU accelerated aproximate method using IVFFlat alcorithm. All tutorial stages accelerated by NVIDIA GPU using [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) and [Spark Rapids ML](https://github.com/NVIDIA/spark-rapids-ml).\n", "\n", "The key prerequisites for this quickstart include a working Azure OpenAI resource, and an Apache Spark cluster with SynapseML installed. We suggest creating a Synapse workspace, but currently the notebook was running on Databricks GPU based cluster using Standard_NC24ads_A100_v4 with 6 workers. 
Databricks Runtime was 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) with related [init_script](https://github.com/microsoft/SynapseML/tree/master/tools/init_scripts) to install all required packages.\n" ] From 5dcd252217f6f7f2d75c7e5ca7640a3483733bc6 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Mon, 26 Aug 2024 19:48:54 +0000 Subject: [PATCH 54/83] added pdf comments --- tools/init_scripts/init-rapidsml-cuda-11.8.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/init_scripts/init-rapidsml-cuda-11.8.sh b/tools/init_scripts/init-rapidsml-cuda-11.8.sh index 24e7aaaa70..d3fe55d0fc 100644 --- a/tools/init_scripts/init-rapidsml-cuda-11.8.sh +++ b/tools/init_scripts/init-rapidsml-cuda-11.8.sh @@ -57,4 +57,5 @@ ln -s /usr/local/cuda-11.8 /usr/local/cuda /databricks/python/bin/pip install --upgrade sentence-transformers /databricks/python/bin/pip install transformers +# To work with PDF /databricks/python/bin/pip install PyMuPDF \ No newline at end of file From c59e262d13b1c6f4dbfdb3e9d7a9d557151421d0 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Mon, 26 Aug 2024 20:11:58 +0000 Subject: [PATCH 55/83] Added demo3 and demo2 text corrections --- ...tion - Answering with LLM on GPU (1).ipynb | 807 ++++++++++++++++++ ...mbeddings and Approximate KNN on GPU.ipynb | 7 +- 2 files changed, 811 insertions(+), 3 deletions(-) create mode 100644 docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU (1).ipynb diff --git a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU (1).ipynb b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU (1).ipynb new file mode 100644 index 0000000000..43fd031a77 --- /dev/null +++ b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU (1).ipynb @@ -0,0 +1,807 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6b31dee8-67e3-4bb7-a501-269c69c80d3f", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "# A Guide to Q&A using Retrieval-Augmented Generation (RAG) with distributed local LLM embedding and generation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b4000620-9ea1-45aa-be4f-ddb971cc708e", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Introduction\n", + "In this notebook, we'll demonstrate how to develop a context-aware question answering framework using distributed local LLM embedding and answer generation using Hugging Face models: [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) and [NV-Embed-v1](https://huggingface.co/bzantium/NV-Embed-v1). This notebook extending document Question and Answering demo to use only local models for scalability and acceleration. Question and Answering contect is based on NASA's Earth and Earth at Night e-books. \n", + "\n", + "We’ll cover the following key stages:\n", + "\n", + "1. Load PDF documents using PyMUPDF library.\n", + "2. Use SynapseML to split the documents into chunks.\n", + "3. Generate chunk and user question embeddings using NV-Embed-V1 embedder\n", + "4. Using NVIDIA Rapids KNN find chunks related to user questions to define context for LLM answers\n", + "5. 
Using LLM Phi-3 from Microsoft and Tensor-RT GPU accelerator answer user questions using provided context\n", + "\n", + "The demo was tested on NVIDIA A100 based Databricks Azure cluster with two workers based on Standard_NC24ads_A100_v4 using 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) Databricks Runtime.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "db0faebe-2cca-4bd8-ae28-645e69a21bb7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 1: Define the notebook environment" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "530e6ef4-b620-443e-a051-4164aedc43cd", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import StringType\n", + "import fitz # PyMuPDF\n", + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import StringType\n", + "from synapse.ml.featurize.text import PageSplitter\n", + "from pyspark.sql.functions import explode, col\n", + "from pyspark.sql.functions import monotonically_increasing_id\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import col, lit, explode\n", + "from pyspark.sql.types import ArrayType, FloatType\n", + "from pyspark.ml.functions import predict_batch_udf\n", + "from sentence_transformers import SentenceTransformer\n", + "import numpy as np\n", + "from spark_rapids_ml.knn import (\n", + " ApproximateNearestNeighbors,\n", + " ApproximateNearestNeighborsModel,\n", + ")\n", + "import pyspark.sql.functions as F\n", + "from pyspark.sql.functions import concat_ws, collect_list" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + 
"rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "97f056e7-9f88-45b9-b6b2-95be8c7fccac", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 2: Load the documents into a Spark DataFrame." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "eb6519d4-f03a-4359-8a6f-4922bfeedbf5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "For this tutorial, we will be using NASA's [Earth](https://www.nasa.gov/sites/default/files/atoms/files/earth_book_2019_tagged.pdf) and [Earth at Night](https://www.nasa.gov/sites/default/files/atoms/files/earth_at_night_508.pdf) e-books. To load PDF documents into a Spark DataFrame, you can use the ```spark.read.format(\"binaryFile\")``` method provided by Apache Spark." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fb39f605-39f8-46d1-a9d3-28b854586852", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "document_path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/NASAEarth\" # path to your document\n", + "df = spark.read.format(\"binaryFile\").load(document_path).cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "34e06daf-e9e7-4144-b956-e57bde8fab77", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 3: Read the document context and convert it from PDF to text using PyMUPDF library." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "304ed77d-a032-4620-a74d-65a277caeaf7", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "We utilize PyMUPDF library (fitz) to do PDF to Text conversion" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "04b58ec9-8a8e-4575-9df9-c8e84c6c4a64", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Define the function to extract text from binary PDF data\n", + "def extract_text_from_binary_pdf(binary_content):\n", + " try:\n", + " # Create a PyMuPDF document from the binary data\n", + " doc = fitz.open(stream=binary_content, filetype=\"pdf\")\n", + " text = \"\"\n", + " for page in doc:\n", + " text += page.get_text()\n", + " return text\n", + " except Exception as e:\n", + " return str(e)\n", + "\n", + "# Register the function as a UDF\n", + "extract_text_udf = udf(extract_text_from_binary_pdf, StringType())\n", + "\n", + "# Apply the UDF to extract text from the binary content\n", + "analyzed_df = df.withColumn(\"output_content\", extract_text_udf(df[\"content\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d26e4217-ac87-4583-9500-af65d969c199", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "We can split Spark DataFrame named ```analyzed_df``` in chunks to make book analysed context smaller (3000 - 4000 char) using the following code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1b471060-8175-492e-bbb3-5b3529480b33", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "ps = (\n", + " PageSplitter()\n", + " .setInputCol(\"output_content\")\n", + " .setMaximumPageLength(4000)\n", + " .setMinimumPageLength(3000)\n", + " .setOutputCol(\"chunks\")\n", + ")\n", + "\n", + "splitted_df = ps.transform(analyzed_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d51caf1d-322e-480b-8391-d266aed6401e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Each column contains many chunks for the same document as a vector.\n", + "# Explode will distribute and replicate the content of a vector across multiple rows\n", + "# Add id column\n", + "\n", + "exploded_df = splitted_df.select(\"path\", explode(col(\"chunks\")).alias(\"chunk\")).select(\"path\", \"chunk\").withColumn(\"id\", monotonically_increasing_id())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1e5b0f56-0a64-4e4a-86f2-b647e82b41ce", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 4: Generate Embeddings."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ebba439c-9503-46d7-bafb-f7fa790974a8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "To produce embeddings for each chunk, we utilize NVIDIA NV-Embed-V1 embedder from Hugging Face" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1f41cd67-1a27-4e69-959a-e5002b4fbbaf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Define a function to create the encode_udf with a custom query_prefix\n", + "def create_encode_udf(query_prefix):\n", + " # Define a function to encode text in batches\n", + " # def encode_text_batch(texts):\n", + " def encode_text_batch():\n", + " # Load the model inside the function\n", + " model = SentenceTransformer('bzantium/NV-Embed-v1', trust_remote_code=True)\n", + " model.max_seq_length = 4096\n", + " model.tokenizer.padding_side = \"right\"\n", + " \n", + " def predict(inputs):\n", + " \n", + " output = model.encode(\n", + " inputs.tolist(), prompt=query_prefix, normalize_embeddings=True)\n", + " return output\n", + "\n", + " return predict\n", + "\n", + " # # Encode the texts in batch\n", + " # embeddings = model.encode(inputs.tolist(), normalize_embeddings=True)\n", + " # return [embedding.tolist() for embedding in embeddings]\n", + "\n", + " # Define the predict_batch_udf with the above function\n", + " return predict_batch_udf(\n", + " encode_text_batch,\n", + " return_type=ArrayType(FloatType()),\n", + " batch_size=1\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + 
"inputWidgets": {}, + "nuid": "defe1c52-1637-4b55-aae5-00174057f1e4", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Use it without query_prefix in this case\n", + "query_prefix = \"\"\n", + "encode_udf = create_encode_udf(query_prefix)\n", + "\n", + "# Applying the UDF to a DataFrame chunk column\n", + "embeddings = exploded_df.withColumn(\"embeddings\", encode_udf(col(\"chunk\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "183c4c45-03bf-42d0-9c10-24e5fe9842da", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 5: Use chunk embeddings to create KNN search model to find chunks related to user query " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5d725803-3475-4b97-aebc-24ae909eebbc", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "rapids_knn_model = (\n", + " ApproximateNearestNeighbors(k=2)\n", + " .setInputCol(\"embeddings\")\n", + " .setIdCol(\"id\")\n", + " .fit(embeddings)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "17b3890f-4163-443c-929b-252d62a6c736", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 6: Compose a Question."
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8826a0fb-7b41-47a9-8d65-8885dcb1248d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n", + "\n", + "task_name_to_instruct = {\"example\": \"Given a question, retrieve passages from the provided context that answer the question\",}\n", + "\n", + "query_prefix = \"Instruct: \"+task_name_to_instruct[\"example\"]+\"\\nQuery: \"\n", + "\n", + "encode_udf = create_encode_udf(query_prefix)\n", + "\n", + "user_question = \"What did the astronaut Edgar Mitchell call Earth?\"\n", + "# Define schema explicitly\n", + "schema = StructType([\n", + " StructField(\"id\", IntegerType(), True),\n", + " StructField(\"query\", StringType(), True)\n", + "])\n", + "\n", + "# Create DataFrame with id = 1 and the user query\n", + "temp_df = spark.createDataFrame([(1, user_question)], schema).cache()\n", + "\n", + "# Apply the UDF to generate the embeddings\n", + "query_embeddings = temp_df.withColumn(\"embeddings\", encode_udf(col(\"query\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "45f79485-be0f-4b89-9c11-79f9102436e7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 7: Find chunks with the closest context to the question using embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "45f7b558-4c32-4e08-807b-9e568dcde8df", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "(_, _, knn_df) = 
rapids_knn_model.kneighbors(query_embeddings.select(\"id\", \"embeddings\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "da023b2f-d7d7-4937-8139-6ec999a77cc6", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Add text to the results\n", + "result_df = (\n", + " knn_df.withColumn(\n", + " \"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\")))\n", + " )\n", + " .select(\n", + " F.col(\"query_id\"),\n", + " F.col(\"zipped.indices\").alias(\"id\"),\n", + " F.col(\"zipped.distances\").alias(\"distance\"),\n", + " )\n", + " .join(embeddings, on=\"id\", how=\"inner\")\n", + " .select(\"query_id\", \"id\", \"chunk\", \"distance\")\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0180ef1c-3d59-4922-b918-80eaf7badd9d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Concatenate the 'chunk' column across all question-related chunks into a single context string\n", + "concatenated_text = result_df.agg(concat_ws(\" \", collect_list(\"chunk\")).alias(\"concatenated_text\")).collect()[0][\"concatenated_text\"]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "79356cff-a236-4ef3-91f7-a601ee38d5f9", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 8: Respond to a User’s Question using microsoft/Phi-3-mini-4k-instruct LLM from Hugging Face" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { +
"application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0b8b0fda-bca7-4cd1-ae0f-8438ca2cbf3b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from tensorrt_llm import LLM, SamplingParams, BuildConfig\n", + "\n", + "# Put model in global if we want to reuse it\n", + "global llm\n", + "\n", + "if 'llm' in globals() and llm is not None:\n", + " print(\"Model is already loaded.\")\n", + "else:\n", + " print(\"Model is not loaded.\")\n", + " \n", + " # Extend model input sizes\n", + " build_config = BuildConfig()\n", + " build_config.plugin_config.context_fmha = True\n", + " build_config.max_input_len = 5120\n", + " build_config.max_seq_len = 5632\n", + "\n", + " llm = LLM(model=\"microsoft/Phi-3-mini-4k-instruct\", build_config=build_config) \n", + " \n", + "sampling_params = SamplingParams(temperature=0.8, top_p=0.95)\n", + "\n", + "context = concatenated_text\n", + "query = \"What did the astronaut Edgar Mitchell call Earth?\"\n", + "\n", + "prompt = f\"\"\"\n", + "context: {context}\n", + "Answer the question based only on the context above. Without multiple choices. If the\n", + "information to answer the question is not present in the given context then reply \"I don't know\".\n", + "My Question: {query}\n", + "What is your Answer? 
\"\"\"\n", + "\n", + "outputs = llm.generate(prompt, sampling_params)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2c65275c-17dc-4a30-83eb-ee5b6695a540", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 9: Print LLM results" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b7fb3394-289f-4949-835e-3520323a770d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Mocking the custom classes if they are not already defined\n", + "class CompletionOutput:\n", + " def __init__(self, index, text, token_ids, cumulative_logprob=None, logprobs=None):\n", + " self.index = index\n", + " self.text = text\n", + " self.token_ids = token_ids\n", + " self.cumulative_logprob = cumulative_logprob\n", + " self.logprobs = logprobs\n", + "\n", + "class RequestOutput:\n", + " def __init__(self, request_id, prompt, prompt_token_ids, outputs, finished):\n", + " self.request_id = request_id\n", + " self.prompt = prompt\n", + " self.prompt_token_ids = prompt_token_ids\n", + " self.outputs = outputs\n", + " self.finished = finished\n", + "\n", + "output_text = outputs.outputs[0].text\n", + "\n", + "# Split the text by '\\n'\n", + "split_text = output_text.split('\\n')\n", + "\n", + "for item in split_text:\n", + " if len(item) > 10:\n", + " # Split the item at the colon and take the part after it\n", + " result = item.split(':', 1)[-1].strip()\n", + " print(\"Answer: \" + result)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "412d83cc-4fe9-455e-ad3d-7780ed262dac", + "showTitle": false, + "title": "" + } 
+ }, + "source": [ + "We can now wrap up the Q&A journey by asking a question and checking the answer. You will see that Edgar Mitchell called Earth \"a sparkling blue and white jewel\"!" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "client": "1" + }, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "QuickStart - Distributed Question - Answering with LLM on GPU (1)", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb index 2fbcb3ee7a..1d499027d5 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb @@ -12,9 +12,10 @@ } }, "source": [ - "# Embedding with local (per node) NVIDIA TensorRT accelerator and GPU based Aproximate Nearest Neighbor (ANN)\n", + "# Embedding with local (per node) NVIDIA TensorRT accelerator and GPU based Approximate Nearest Neighbor (ANN)\n", "\n", - "The demo extending existing [Azure OpenAI based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding%20and%20GPU%20based%20KNN.ipynb) when encoding is processed by OpenAI requests and KNN was using GPU based brute force search. This tutorial shows how to perform fast local embeddings using [multilingual E5 text embeddings](https://arxiv.org/abs/2402.05672) and fast aproximate Nearest Neighbor search using IVFFlat alcorithm. 
All tutorial stages accelerated by NVIDIA GPU using [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) and [Spark Rapids ML](https://github.com/NVIDIA/spark-rapids-ml). The tutorial folder contains two benchmark notebooks to demonstrate advantages of the presented GPU based approach compare to [previos CPU based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding.ipynb)\n", + "This demo extends the existing [Azure OpenAI based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding%20and%20GPU%20based%20KNN.ipynb). Encoding is now performed by local embedders from Hugging Face, and KNN uses a GPU-accelerated approximate method based on the IVFFlat algorithm.\n", + "All tutorial stages are accelerated by NVIDIA GPU using [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) and [Spark Rapids ML](https://github.com/NVIDIA/spark-rapids-ml). The tutorial folder contains two benchmark notebooks to demonstrate the advantages of the presented GPU based approach compared to the [previous CPU based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding.ipynb)\n", "\n", "The key prerequisites for this quickstart include a working Azure OpenAI resource, and an Apache Spark cluster with SynapseML installed. We suggest creating a Synapse workspace, but currently the notebook was running on Databricks GPU based cluster using Standard_NC24ads_A100_v4 with 6 workers.
Databricks Runtime was 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) with related [init_script](https://github.com/microsoft/SynapseML/tree/master/tools/init_scripts) to install all required packages.\n" ] @@ -439,7 +440,7 @@ "\n", "The goal of this demo is to showcase two acceleration techniques: local (per node) embedding generation and approximate KNN. Compared to the original method, which relies on HTTP requests to the OpenAI model and CPU-based KNN. The new approach is significantly more scalable and provides substantial acceleration, especially for large input datasets.\n", "\n", - "This is the comparison dureation results on 10 T4 GPU nodes for both approaches:\n", + "This is the comparison duration results on 10 T4 GPU nodes for both approaches:\n", "\n", "![KNN Comparison](https://mmlspark.blob.core.windows.net/graphics/Documentation/knn_comparison.png)\n", "\n", From 5b3a0a9d6f4013e24faf37c90457396153bcec6c Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Mon, 26 Aug 2024 13:40:06 -0700 Subject: [PATCH 56/83] Delete docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM.ipynb --- ...ibuted Question - Answering with LLM.ipynb | 807 ------------------ 1 file changed, 807 deletions(-) delete mode 100644 docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM.ipynb diff --git a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM.ipynb b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM.ipynb deleted file mode 100644 index e7e20ad90f..0000000000 --- a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM.ipynb +++ /dev/null @@ -1,807 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - 
"inputWidgets": {}, - "nuid": "6b31dee8-67e3-4bb7-a501-269c69c80d3f", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "# A Guide to Q&A using Retrieval-Augmented Generation (RAG) with distributed local LLM embedding and generation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "b4000620-9ea1-45aa-be4f-ddb971cc708e", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Introduction\n", - "In this notebook, we'll demonstrate how to develop a context-aware question answering framework using distributed local LLM embedding and answer generatin using Hugging Face models: [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) and [NV-Embed-v1](https://huggingface.co/bzantium/NV-Embed-v1). This notebook extending document Question and Answering demo to use only local models for scalability and acceleration. Question and Answering contect is based on NASA's Earth and Earth at Night e-books. \n", - "\n", - "We’ll cover the following key stages:\n", - "\n", - "1. Load PDF documents using PyMUPDF library.\n", - "2. Use SynapseML to split the documents into chunks.\n", - "3. Generate chunk and user question embeddings using NV-Embed-V1 embedder\n", - "4. Using NVIDIA Rapids KNN find chunks related to user questions to define context for LLM answers\n", - "5. 
Using LLM Phi-3 from Microsoft and Tensor-RT GPU accelerator answer user questions using provided context\n", - "\n", - "The demo was tested on NVIDIA A100 based databricks Azure cluster with two workers based on Standard_NC24ads_A100_v4 using 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) Databricks Runtime.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "db0faebe-2cca-4bd8-ae28-645e69a21bb7", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Step 1: Define the notebook environment" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "530e6ef4-b620-443e-a051-4164aedc43cd", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from pyspark.sql.functions import udf\n", - "from pyspark.sql.types import StringType\n", - "import fitz # PyMuPDF\n", - "from pyspark.sql.functions import udf\n", - "from pyspark.sql.types import StringType\n", - "from synapse.ml.featurize.text import PageSplitter\n", - "from pyspark.sql.functions import explode, col\n", - "from pyspark.sql.functions import monotonically_increasing_id\n", - "from pyspark.sql import SparkSession\n", - "from pyspark.sql.functions import col, lit, explode\n", - "from pyspark.sql.types import ArrayType, FloatType\n", - "from pyspark.ml.functions import predict_batch_udf\n", - "from sentence_transformers import SentenceTransformer\n", - "import numpy as np\n", - "from spark_rapids_ml.knn import (\n", - " ApproximateNearestNeighbors,\n", - " ApproximateNearestNeighborsModel,\n", - ")\n", - "import pyspark.sql.functions as F\n", - "from pyspark.sql.functions import concat_ws, collect_list" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - 
"rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "97f056e7-9f88-45b9-b6b2-95be8c7fccac", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 2: Load the documents into a Spark DataFrame." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "eb6519d4-f03a-4359-8a6f-4922bfeedbf5", - "showTitle": false, - "title": "" - } - }, - "source": [ - "For this tutorial, we will be using NASA's [Earth](https://www.nasa.gov/sites/default/files/atoms/files/earth_book_2019_tagged.pdf) and [Earth at Night](https://www.nasa.gov/sites/default/files/atoms/files/earth_at_night_508.pdf) e-books. To load PDF documents into a Spark DataFrame, you can use the ```spark.read.format(\"binaryFile\")``` method provided by Apache Spark." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "fb39f605-39f8-46d1-a9d3-28b854586852", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "document_path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/NASAEarth\" # path to your document\n", - "df = spark.read.format(\"binaryFile\").load(document_path).cache()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "34e06daf-e9e7-4144-b956-e57bde8fab77", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 3: Read the document context and convert it from PDF to text using PyMUPDF library." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "304ed77d-a032-4620-a74d-65a277caeaf7", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "We utilize PyMUPDF library (fitz) to do PDF to Text conversion" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "04b58ec9-8a8e-4575-9df9-c8e84c6c4a64", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Define the function to extract text from binary PDF data\n", - "def extract_text_from_binary_pdf(binary_content):\n", - " try:\n", - " # Create a PyMuPDF document from the binary data\n", - " doc = fitz.open(stream=binary_content, filetype=\"pdf\")\n", - " text = \"\"\n", - " for page in doc:\n", - " text += page.get_text()\n", - " return text\n", - " except Exception as e:\n", - " return str(e)\n", - "\n", - "# Register the function as a UDF\n", - "extract_text_udf = udf(extract_text_from_binary_pdf, StringType())\n", - "\n", - "# Apply the UDF to extract text from the binary content\n", - "analyzed_df = df.withColumn(\"output_content\", extract_text_udf(df[\"content\"]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "d26e4217-ac87-4583-9500-af65d969c199", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "We can split Spark DataFrame named ```analyzed_df``` in chunks to make book analysed context smaller (3000 - 4000 char) using the following code." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "1b471060-8175-492e-bbb3-5b3529480b33", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "ps = (\n", - " PageSplitter()\n", - " .setInputCol(\"output_content\")\n", - " .setMaximumPageLength(4000)\n", - " .setMinimumPageLength(3000)\n", - " .setOutputCol(\"chunks\")\n", - ")\n", - "\n", - "splitted_df = ps.transform(analyzed_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "d51caf1d-322e-480b-8391-d266aed6401e", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Each column contains many chunks for the same document as a vector.\n", - "# Explode will distribute and replicate the content of a vecor across multple rows\n", - "# Add id column\n", - "\n", - "exploded_df = splitted_df.select(\"path\", explode(col(\"chunks\")).alias(\"chunk\")).select(\"path\", \"chunk\").withColumn(\"id\", monotonically_increasing_id())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "1e5b0f56-0a64-4e4a-86f2-b647e82b41ce", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 4: Generate Embeddings." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "ebba439c-9503-46d7-bafb-f7fa790974a8", - "showTitle": false, - "title": "" - } - }, - "source": [ - "To produce embeddings for each chunk, we utilize NVIDIA NV-Embed-V1 embedder from Hugging Face" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "1f41cd67-1a27-4e69-959a-e5002b4fbbaf", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Define a function to create the encode_udf with a custom query_prefix\n", - "def create_encode_udf(query_prefix):\n", - " # Define a function to encode text in batches\n", - " # def encode_text_batch(texts):\n", - " def encode_text_batch():\n", - " # Load the model inside the function\n", - " model = SentenceTransformer('bzantium/NV-Embed-v1', trust_remote_code=True)\n", - " model.max_seq_length = 4096\n", - " model.tokenizer.padding_side = \"right\"\n", - " \n", - " def predict(inputs):\n", - " \n", - " output = model.encode(\n", - " inputs.tolist(), prompt=query_prefix, normalize_embeddings=True)\n", - " return output\n", - "\n", - " return predict\n", - "\n", - " # # Encode the texts in batch\n", - " # embeddings = model.encode(inputs.tolist(), normalize_embeddings=True)\n", - " # return [embedding.tolist() for embedding in embeddings]\n", - "\n", - " # Define the predict_batch_udf with the above function\n", - " return predict_batch_udf(\n", - " encode_text_batch,\n", - " return_type=ArrayType(FloatType()),\n", - " batch_size=1\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - 
"inputWidgets": {}, - "nuid": "defe1c52-1637-4b55-aae5-00174057f1e4", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Use it withhout query_prefix in this case\n", - "query_prefix = \"\"\n", - "encode_udf = create_encode_udf(query_prefix)\n", - "\n", - "# Applying the UDF to a DataFrame chunk column\n", - "embeddings = exploded_df.withColumn(\"embeddings\", encode_udf(col(\"chunk\")))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "183c4c45-03bf-42d0-9c10-24e5fe9842da", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Step 5: Use chunk embeddings to create KNN search model to find chunks related to user query " - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "5d725803-3475-4b97-aebc-24ae909eebbc", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "rapids_knn_model = (\n", - " ApproximateNearestNeighbors(k=2)\n", - " .setInputCol(\"embeddings\")\n", - " .setIdCol(\"id\")\n", - " .fit(embeddings)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "17b3890f-4163-443c-929b-252d62a6c736", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 6: Compose a Question." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "8826a0fb-7b41-47a9-8d65-8885dcb1248d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n", - "\n", - "task_name_to_instruct = {\"example\": \"Given a question, retrieve passages from the provided context that answer the question\",}\n", - "\n", - "query_prefix = \"Instruct: \"+task_name_to_instruct[\"example\"]+\"\\nQuery: \"\n", - "\n", - "encode_udf = create_encode_udf(query_prefix)\n", - "\n", - "user_question = \"What did the astronaut Edgar Mitchell call Earth?\"\n", - "# Define schema explicitly\n", - "schema = StructType([\n", - " StructField(\"id\", IntegerType(), True),\n", - " StructField(\"query\", StringType(), True)\n", - "])\n", - "\n", - "# Create DataFrame with id = 1 and the user query\n", - "temp_df = spark.createDataFrame([(1, user_question)], schema).cache()\n", - "\n", - "# Apply the UDF to generate the embeddings\n", - "query_embeddings = temp_df.withColumn(\"embeddings\", encode_udf(col(\"query\")))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "45f79485-be0f-4b89-9c11-79f9102436e7", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Step 7: Find chunks with the closest context to the question using embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "45f7b558-4c32-4e08-807b-9e568dcde8df", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "(_, _, knn_df) = 
rapids_knn_model.kneighbors(query_embeddings.select(\"id\", \"embeddings\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "da023b2f-d7d7-4937-8139-6ec999a77cc6", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Add text to the results\n", - "result_df = (\n", - " knn_df.withColumn(\n", - " \"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\")))\n", - " )\n", - " .select(\n", - " F.col(\"query_id\"),\n", - " F.col(\"zipped.indices\").alias(\"id\"),\n", - " F.col(\"zipped.distances\").alias(\"distance\"),\n", - " )\n", - " .join(embeddings, on=\"id\", how=\"inner\")\n", - " .select(\"query_id\", \"id\", \"chunk\", \"distance\")\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "0180ef1c-3d59-4922-b918-80eaf7badd9d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Concatenate all strings in the 'combined_text' column across all question related chunks\n", - "concatenated_text = result_df.agg(concat_ws(\" \", collect_list(\"chunk\")).alias(\"concatenated_text\")).collect()[0][\"concatenated_text\"]\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "79356cff-a236-4ef3-91f7-a601ee38d5f9", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 8: Respond to a User’s Question using microsoft/Phi-3-mini-4k-instruct LLM from Hugging Face" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - 
"application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "0b8b0fda-bca7-4cd1-ae0f-8438ca2cbf3b", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from tensorrt_llm import LLM, SamplingParams, BuildConfig\n", - "\n", - "# Put model in global if we want to reuse it\n", - "global llm\n", - "\n", - "if 'llm' in globals() and llm is not None:\n", - " print(\"Model is already loaded.\")\n", - "else:\n", - " print(\"Model is not loaded.\")\n", - " \n", - " # Extend model input sizes\n", - " build_config = BuildConfig()\n", - " build_config.plugin_config.context_fmha = True\n", - " build_config.max_input_len = 5120\n", - " build_config.max_seq_len = 5632\n", - "\n", - " llm = LLM(model=\"microsoft/Phi-3-mini-4k-instruct\", build_config=build_config) \n", - " \n", - "sampling_params = SamplingParams(temperature=0.8, top_p=0.95)\n", - "\n", - "context = concatenated_text\n", - "query = \"What did the astronaut Edgar Mitchell call Earth?\"\n", - "\n", - "prompt = f\"\"\"\n", - "context: {context}\n", - "Answer the question based only on the context above. Without multiple choices. If the\n", - "information to answer the question is not present in the given context then reply \"I don't know\".\n", - "My Question: {query}\n", - "What is your Answer? 
\"\"\"\n", - "\n", - "outputs = llm.generate(prompt, sampling_params)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "2c65275c-17dc-4a30-83eb-ee5b6695a540", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Step 9: Print LLM results" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "b7fb3394-289f-4949-835e-3520323a770d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Mocking the custom classes if they are not already defined\n", - "class CompletionOutput:\n", - " def __init__(self, index, text, token_ids, cumulative_logprob=None, logprobs=None):\n", - " self.index = index\n", - " self.text = text\n", - " self.token_ids = token_ids\n", - " self.cumulative_logprob = cumulative_logprob\n", - " self.logprobs = logprobs\n", - "\n", - "class RequestOutput:\n", - " def __init__(self, request_id, prompt, prompt_token_ids, outputs, finished):\n", - " self.request_id = request_id\n", - " self.prompt = prompt\n", - " self.prompt_token_ids = prompt_token_ids\n", - " self.outputs = outputs\n", - " self.finished = finished\n", - "\n", - "output_text = outputs.outputs[0].text\n", - "\n", - "# Split the text by '\\n'\n", - "split_text = output_text.split('\\n')\n", - "\n", - "for item in split_text:\n", - " if len(item) > 10:\n", - " # Split the item at the colon and take the part after it\n", - " result = item.split(':', 1)[-1].strip()\n", - " print(\"Answer: \" + result)\n", - " break" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "412d83cc-4fe9-455e-ad3d-7780ed262dac", - "showTitle": false, - "title": "" - } 
- }, - "source": [ - "We can now wrap up the Q&A journey by asking a question and checking the answer. You will see that Edgar Mitchell called Earth \"a sparkling blue and white jewel\"!" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "environmentMetadata": { - "base_environment": "", - "client": "1" - }, - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 4 - }, - "notebookName": "QuickStart - Distributed Question - Answering with LLM", - "widgets": {} - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From 63096a944ce41f57035740dc81f64b6a85418fad Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Mon, 26 Aug 2024 20:43:22 +0000 Subject: [PATCH 57/83] Demo 3 File rename --- ...rt - Distributed Question - Answering with LLM on GPU.ipynb} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/Explore Algorithms/AI Services/{QuickStart - Distributed Question - Answering with LLM on GPU (1).ipynb => QuickStart - Distributed Question - Answering with LLM on GPU.ipynb} (99%) diff --git a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU (1).ipynb b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb similarity index 99% rename from docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU (1).ipynb rename to docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb index 43fd031a77..3c802b2360 100644 --- a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU (1).ipynb +++ b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb @@ -795,7 +795,7 @@ "notebookMetadata": { "pythonIndentUnit": 4 }, - "notebookName": "QuickStart - Distributed Question - Answering with LLM on GPU 
(1)", + "notebookName": "QuickStart - Distributed Question - Answering with LLM on GPU", "widgets": {} }, "language_info": { From 81b0705386edd51300e7d768caff9a2e7dee01fc Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Mon, 26 Aug 2024 23:11:08 +0000 Subject: [PATCH 58/83] Clean imports --- ...Question - Answering with LLM on GPU.ipynb | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb index 3c802b2360..b518a64cf6 100644 --- a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb +++ b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb @@ -61,7 +61,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "db0faebe-2cca-4bd8-ae28-645e69a21bb7", "showTitle": false, @@ -77,7 +80,10 @@ "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "530e6ef4-b620-443e-a051-4164aedc43cd", "showTitle": false, @@ -86,26 +92,17 @@ }, "outputs": [], "source": [ - "from pyspark.sql.functions import udf\n", - "from pyspark.sql.types import StringType\n", - "import fitz # PyMuPDF\n", - "from pyspark.sql.functions import udf\n", - "from pyspark.sql.types import StringType\n", - "from synapse.ml.featurize.text import PageSplitter\n", - "from pyspark.sql.functions import explode, col\n", - "from pyspark.sql.functions import monotonically_increasing_id\n", - "from pyspark.sql import SparkSession\n", - "from pyspark.sql.functions import col, lit, explode\n", - 
"from pyspark.sql.types import ArrayType, FloatType\n", + "import fitz \n", + "import pyspark.sql.functions as F\n", + "from pyspark.sql.types import ArrayType, FloatType, StringType\n", + "from pyspark.sql.functions import explode, col, monotonically_increasing_id, concat_ws, collect_list\n", "from pyspark.ml.functions import predict_batch_udf\n", "from sentence_transformers import SentenceTransformer\n", - "import numpy as np\n", + "from synapse.ml.featurize.text import PageSplitter\n", "from spark_rapids_ml.knn import (\n", " ApproximateNearestNeighbors,\n", " ApproximateNearestNeighborsModel,\n", - ")\n", - "import pyspark.sql.functions as F\n", - "from pyspark.sql.functions import concat_ws, collect_list" + ")" ] }, { @@ -444,7 +441,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "183c4c45-03bf-42d0-9c10-24e5fe9842da", "showTitle": false, @@ -546,7 +546,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "45f79485-be0f-4b89-9c11-79f9102436e7", "showTitle": false, @@ -707,7 +710,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "2c65275c-17dc-4a30-83eb-ee5b6695a540", "showTitle": false, From 50df271ba6bd8b74d930228afd60309627ed545d Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Tue, 27 Aug 2024 03:28:22 +0000 Subject: [PATCH 59/83] style changes --- ...Question - Answering with LLM on GPU.ipynb | 94 +++++++++---------- 1 file changed, 44 insertions(+), 50 deletions(-) diff --git a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb b/docs/Explore 
Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb index b518a64cf6..7ae41e1949 100644 --- a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb +++ b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb @@ -92,10 +92,16 @@ }, "outputs": [], "source": [ - "import fitz \n", + "import fitz\n", "import pyspark.sql.functions as F\n", "from pyspark.sql.types import ArrayType, FloatType, StringType\n", - "from pyspark.sql.functions import explode, col, monotonically_increasing_id, concat_ws, collect_list\n", + "from pyspark.sql.functions import (\n", + " explode,\n", + " col,\n", + " monotonically_increasing_id,\n", + " concat_ws,\n", + " collect_list,\n", + ")\n", "from pyspark.ml.functions import predict_batch_udf\n", "from sentence_transformers import SentenceTransformer\n", "from synapse.ml.featurize.text import PageSplitter\n", @@ -245,6 +251,7 @@ "# Register the function as a UDF\n", "extract_text_udf = udf(extract_text_from_binary_pdf, StringType())\n", "\n", + "\n", "# Apply the UDF to extract text from the binary content\n", "analyzed_df = df.withColumn(\"output_content\", extract_text_udf(df[\"content\"]))" ] @@ -388,14 +395,15 @@ " # def encode_text_batch(texts):\n", " def encode_text_batch():\n", " # Load the model inside the function\n", - " model = SentenceTransformer('bzantium/NV-Embed-v1', trust_remote_code=True)\n", + " model = SentenceTransformer(\"bzantium/NV-Embed-v1\", trust_remote_code=True)\n", " model.max_seq_length = 4096\n", " model.tokenizer.padding_side = \"right\"\n", - " \n", + "\n", " def predict(inputs):\n", - " \n", + "\n", " output = model.encode(\n", - " inputs.tolist(), prompt=query_prefix, normalize_embeddings=True)\n", + " inputs.tolist(), prompt=query_prefix, normalize_embeddings=True\n", + " )\n", " return output\n", "\n", " return predict\n", @@ -406,9 +414,7 @@ "\n", " # Define the 
predict_batch_udf with the above function\n", " return predict_batch_udf(\n", - " encode_text_batch,\n", - " return_type=ArrayType(FloatType()),\n", - " batch_size=1\n", + " encode_text_batch, return_type=ArrayType(FloatType()), batch_size=1\n", " )" ] }, @@ -522,18 +528,19 @@ "source": [ "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n", "\n", - "task_name_to_instruct = {\"example\": \"Given a question, retrieve passages from the provided context that answer the question\",}\n", + "task_name_to_instruct = {\n", + " \"example\": \"Given a question, retrieve passages from the provided context that answer the question\",\n", + "}\n", "\n", - "query_prefix = \"Instruct: \"+task_name_to_instruct[\"example\"]+\"\\nQuery: \"\n", + "query_prefix = \"Instruct: \" + task_name_to_instruct[\"example\"] + \"\\nQuery: \"\n", "\n", "encode_udf = create_encode_udf(query_prefix)\n", "\n", "user_question = \"What did the astronaut Edgar Mitchell call Earth?\"\n", "# Define schema explicitly\n", - "schema = StructType([\n", - " StructField(\"id\", IntegerType(), True),\n", - " StructField(\"query\", StringType(), True)\n", - "])\n", + "schema = StructType(\n", + " [StructField(\"id\", IntegerType(), True), StructField(\"query\", StringType(), True)]\n", + ")\n", "\n", "# Create DataFrame with id = 1 and the user query\n", "temp_df = spark.createDataFrame([(1, user_question)], schema).cache()\n", @@ -577,7 +584,9 @@ }, "outputs": [], "source": [ - "(_, _, knn_df) = rapids_knn_model.kneighbors(query_embeddings.select(\"id\", \"embeddings\"))" + "(_, _, knn_df) = rapids_knn_model.kneighbors(\n", + " query_embeddings.select(\"id\", \"embeddings\")\n", + ")" ] }, { @@ -599,17 +608,17 @@ "source": [ "# Add text to the results\n", "result_df = (\n", - " knn_df.withColumn(\n", - " \"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\")))\n", - " )\n", - " .select(\n", - " F.col(\"query_id\"),\n", - " 
F.col(\"zipped.indices\").alias(\"id\"),\n", - " F.col(\"zipped.distances\").alias(\"distance\"),\n", - " )\n", - " .join(embeddings, on=\"id\", how=\"inner\")\n", - " .select(\"query_id\", \"id\", \"chunk\", \"distance\")\n", - " )" + " knn_df.withColumn(\n", + " \"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\")))\n", + " )\n", + " .select(\n", + " F.col(\"query_id\"),\n", + " F.col(\"zipped.indices\").alias(\"id\"),\n", + " F.col(\"zipped.distances\").alias(\"distance\"),\n", + " )\n", + " .join(embeddings, on=\"id\", how=\"inner\")\n", + " .select(\"query_id\", \"id\", \"chunk\", \"distance\")\n", + ")" ] }, { @@ -630,7 +639,9 @@ "outputs": [], "source": [ "# Concatenate all strings in the 'combined_text' column across all question related chunks\n", - "concatenated_text = result_df.agg(concat_ws(\" \", collect_list(\"chunk\")).alias(\"concatenated_text\")).collect()[0][\"concatenated_text\"]\n" + "concatenated_text = result_df.agg(\n", + " concat_ws(\" \", collect_list(\"chunk\")).alias(\"concatenated_text\")\n", + ").collect()[0][\"concatenated_text\"]" ] }, { @@ -678,7 +689,7 @@ "# Put model in global if we want to reuse it\n", "global llm\n", "\n", - "if 'llm' in globals() and llm is not None:\n", + "if \"llm\" in globals() and llm is not None:\n", " print(\"Model is already loaded.\")\n", "else:\n", " print(\"Model is not loaded.\")\n", @@ -689,8 +700,8 @@ " build_config.max_input_len = 5120\n", " build_config.max_seq_len = 5632\n", "\n", - " llm = LLM(model=\"microsoft/Phi-3-mini-4k-instruct\", build_config=build_config) \n", - " \n", + " llm = LLM(model=\"microsoft/Phi-3-mini-4k-instruct\", build_config=build_config)\n", + "\n", "sampling_params = SamplingParams(temperature=0.8, top_p=0.95)\n", "\n", "context = concatenated_text\n", @@ -741,32 +752,15 @@ }, "outputs": [], "source": [ - "# Mocking the custom classes if they are not already defined\n", - "class CompletionOutput:\n", - " def __init__(self, index, text, token_ids, 
cumulative_logprob=None, logprobs=None):\n", - " self.index = index\n", - " self.text = text\n", - " self.token_ids = token_ids\n", - " self.cumulative_logprob = cumulative_logprob\n", - " self.logprobs = logprobs\n", - "\n", - "class RequestOutput:\n", - " def __init__(self, request_id, prompt, prompt_token_ids, outputs, finished):\n", - " self.request_id = request_id\n", - " self.prompt = prompt\n", - " self.prompt_token_ids = prompt_token_ids\n", - " self.outputs = outputs\n", - " self.finished = finished\n", - "\n", "output_text = outputs.outputs[0].text\n", "\n", "# Split the text by '\\n'\n", - "split_text = output_text.split('\\n')\n", + "split_text = output_text.split(\"\\n\")\n", "\n", "for item in split_text:\n", " if len(item) > 10:\n", " # Split the item at the colon and take the part after it\n", - " result = item.split(':', 1)[-1].strip()\n", + " result = item.split(\":\", 1)[-1].strip()\n", " print(\"Answer: \" + result)\n", " break" ] From b3820dc6e670d89837719c4fd7d304f0f9a0e65e Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Tue, 27 Aug 2024 03:55:29 +0000 Subject: [PATCH 60/83] More style changes --- ...ributed Question - Answering with LLM on GPU.ipynb | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb index 7ae41e1949..bb9575d250 100644 --- a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb +++ b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb @@ -248,6 +248,7 @@ " except Exception as e:\n", " return str(e)\n", "\n", + "\n", "# Register the function as a UDF\n", "extract_text_udf = udf(extract_text_from_binary_pdf, StringType())\n", "\n", @@ -328,7 +329,11 @@ "# Explode will distribute and replicate 
the content of a vecor across multple rows\n", "# Add id column\n", "\n", - "exploded_df = splitted_df.select(\"path\", explode(col(\"chunks\")).alias(\"chunk\")).select(\"path\", \"chunk\").withColumn(\"id\", monotonically_increasing_id())" + "exploded_df = (\n", + " splitted_df.select(\"path\", explode(col(\"chunks\")).alias(\"chunk\"))\n", + " .select(\"path\", \"chunk\")\n", + " .withColumn(\"id\", monotonically_increasing_id())\n", + ")" ] }, { @@ -610,7 +615,7 @@ "result_df = (\n", " knn_df.withColumn(\n", " \"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\")))\n", - " )\n", + " )\n", " .select(\n", " F.col(\"query_id\"),\n", " F.col(\"zipped.indices\").alias(\"id\"),\n", @@ -693,7 +698,7 @@ " print(\"Model is already loaded.\")\n", "else:\n", " print(\"Model is not loaded.\")\n", - " \n", + "\n", " # Extend model input sizes\n", " build_config = BuildConfig()\n", " build_config.plugin_config.context_fmha = True\n", From 5cda4632d55ac9769b67845500dd2dbc1a2964b2 Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Tue, 27 Aug 2024 05:49:41 +0000 Subject: [PATCH 61/83] Added to the web --- website/sidebars.js | 1 + 1 file changed, 1 insertion(+) diff --git a/website/sidebars.js b/website/sidebars.js index 5ef56a0f78..fec9339954 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -40,6 +40,7 @@ module.exports = { "Explore Algorithms/AI Services/Quickstart - Create a Visual Search Engine", "Explore Algorithms/AI Services/Quickstart - Create Audiobooks", "Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs", + "Explore Algorithms/AI Services/Quickstart - Distributed Question - Answering with LLM on GPU", "Explore Algorithms/AI Services/Quickstart - Flooding Risk", "Explore Algorithms/AI Services/Quickstart - Predictive Maintenance", ], From 8b72f0da8af6cee59276c0f191bc6e969382c8bb Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Tue, 27 Aug 2024 
16:18:10 -0700 Subject: [PATCH 62/83] Initial Demo 3 (RAG on GPU) --- ...Question - Answering with LLM on GPU.ipynb | 812 ++++++++++++++++++ 1 file changed, 812 insertions(+) create mode 100644 docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb diff --git a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb new file mode 100644 index 0000000000..bb9575d250 --- /dev/null +++ b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb @@ -0,0 +1,812 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6b31dee8-67e3-4bb7-a501-269c69c80d3f", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "# A Guide to Q&A using Retrieval-Augmented Generation (RAG) with distributed local LLM embedding and generation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b4000620-9ea1-45aa-be4f-ddb971cc708e", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Introduction\n", + "In this notebook, we'll demonstrate how to develop a context-aware question answering framework using distributed local LLM embedding and answer generation using Hugging Face models: [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) and [NV-Embed-v1](https://huggingface.co/bzantium/NV-Embed-v1). 
This notebook extending document Question and Answering demo to use only local models for scalability and acceleration. Question and Answering contect is based on NASA's Earth and Earth at Night e-books. \n", + "\n", + "We’ll cover the following key stages:\n", + "\n", + "1. Load PDF documents using PyMUPDF library.\n", + "2. Use SynapseML to split the documents into chunks.\n", + "3. Generate chunk and user question embeddings using NV-Embed-V1 embedder\n", + "4. Using NVIDIA Rapids KNN find chunks related to user questions to define context for LLM answers\n", + "5. Using LLM Phi-3 from Microsoft and Tensor-RT GPU accelerator answer user questions using provided context\n", + "\n", + "The demo was tested on NVIDIA A100 based Databricks Azure cluster with two workers based on Standard_NC24ads_A100_v4 using 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) Databricks Runtime.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "db0faebe-2cca-4bd8-ae28-645e69a21bb7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 1: Define the notebook environment" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "530e6ef4-b620-443e-a051-4164aedc43cd", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import fitz\n", + "import pyspark.sql.functions as F\n", + "from pyspark.sql.types import ArrayType, FloatType, StringType\n", + "from pyspark.sql.functions import (\n", + " explode,\n", + " col,\n", + " monotonically_increasing_id,\n", + " concat_ws,\n", + " collect_list,\n", + ")\n", + "from pyspark.ml.functions import predict_batch_udf\n", + "from sentence_transformers import SentenceTransformer\n", + "from 
synapse.ml.featurize.text import PageSplitter\n", + "from spark_rapids_ml.knn import (\n", + " ApproximateNearestNeighbors,\n", + " ApproximateNearestNeighborsModel,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "97f056e7-9f88-45b9-b6b2-95be8c7fccac", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 2: Load the documents into a Spark DataFrame." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "eb6519d4-f03a-4359-8a6f-4922bfeedbf5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "For this tutorial, we will be using NASA's [Earth](https://www.nasa.gov/sites/default/files/atoms/files/earth_book_2019_tagged.pdf) and [Earth at Night](https://www.nasa.gov/sites/default/files/atoms/files/earth_at_night_508.pdf) e-books. To load PDF documents into a Spark DataFrame, you can use the ```spark.read.format(\"binaryFile\")``` method provided by Apache Spark." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fb39f605-39f8-46d1-a9d3-28b854586852", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "document_path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/NASAEarth\" # path to your document\n", + "df = spark.read.format(\"binaryFile\").load(document_path).cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "34e06daf-e9e7-4144-b956-e57bde8fab77", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 3: Read the document context and convert it from PDF to text using PyMUPDF library." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "304ed77d-a032-4620-a74d-65a277caeaf7", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "We utilize PyMUPDF library (fitz) to do PDF to Text conversion" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "04b58ec9-8a8e-4575-9df9-c8e84c6c4a64", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Define the function to extract text from binary PDF data\n", + "def extract_text_from_binary_pdf(binary_content):\n", + " try:\n", + " # Create a PyMuPDF document from the binary data\n", + " doc = fitz.open(stream=binary_content, filetype=\"pdf\")\n", + " 
text = \"\"\n", + " for page in doc:\n", + " text += page.get_text()\n", + " return text\n", + " except Exception as e:\n", + " return str(e)\n", + "\n", + "\n", + "# Register the function as a UDF\n", + "extract_text_udf = udf(extract_text_from_binary_pdf, StringType())\n", + "\n", + "\n", + "# Apply the UDF to extract text from the binary content\n", + "analyzed_df = df.withColumn(\"output_content\", extract_text_udf(df[\"content\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d26e4217-ac87-4583-9500-af65d969c199", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "We can split Spark DataFrame named ```analyzed_df``` in chunks to make book analysed context smaller (3000 - 4000 char) using the following code." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1b471060-8175-492e-bbb3-5b3529480b33", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "ps = (\n", + " PageSplitter()\n", + " .setInputCol(\"output_content\")\n", + " .setMaximumPageLength(4000)\n", + " .setMinimumPageLength(3000)\n", + " .setOutputCol(\"chunks\")\n", + ")\n", + "\n", + "splitted_df = ps.transform(analyzed_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d51caf1d-322e-480b-8391-d266aed6401e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Each column contains many chunks for the same document as a vector.\n", + "# Explode will distribute and replicate the content 
of a vector across multiple rows\n", + "# Add id column\n", + "\n", + "exploded_df = (\n", + " splitted_df.select(\"path\", explode(col(\"chunks\")).alias(\"chunk\"))\n", + " .select(\"path\", \"chunk\")\n", + " .withColumn(\"id\", monotonically_increasing_id())\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1e5b0f56-0a64-4e4a-86f2-b647e82b41ce", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 4: Generate Embeddings." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ebba439c-9503-46d7-bafb-f7fa790974a8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "To produce embeddings for each chunk, we utilize NVIDIA NV-Embed-V1 embedder from Hugging Face" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1f41cd67-1a27-4e69-959a-e5002b4fbbaf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Define a function to create the encode_udf with a custom query_prefix\n", + "def create_encode_udf(query_prefix):\n", + " # Define a function to encode text in batches\n", + " # def encode_text_batch(texts):\n", + " def encode_text_batch():\n", + " # Load the model inside the function\n", + " model = SentenceTransformer(\"bzantium/NV-Embed-v1\", trust_remote_code=True)\n", + " model.max_seq_length = 4096\n", + " model.tokenizer.padding_side = \"right\"\n", + "\n", + " def predict(inputs):\n", + "\n", + " output = model.encode(\n", + " inputs.tolist(), prompt=query_prefix, 
normalize_embeddings=True\n", + " )\n", + " return output\n", + "\n", + " return predict\n", + "\n", + " # # Encode the texts in batch\n", + " # embeddings = model.encode(inputs.tolist(), normalize_embeddings=True)\n", + " # return [embedding.tolist() for embedding in embeddings]\n", + "\n", + " # Define the predict_batch_udf with the above function\n", + " return predict_batch_udf(\n", + " encode_text_batch, return_type=ArrayType(FloatType()), batch_size=1\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "defe1c52-1637-4b55-aae5-00174057f1e4", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Use it without query_prefix in this case\n", + "query_prefix = \"\"\n", + "encode_udf = create_encode_udf(query_prefix)\n", + "\n", + "# Applying the UDF to a DataFrame chunk column\n", + "embeddings = exploded_df.withColumn(\"embeddings\", encode_udf(col(\"chunk\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "183c4c45-03bf-42d0-9c10-24e5fe9842da", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 5: Use chunk embeddings to create KNN search model to find chunks related to user query " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5d725803-3475-4b97-aebc-24ae909eebbc", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "rapids_knn_model = (\n", + " ApproximateNearestNeighbors(k=2)\n", + " .setInputCol(\"embeddings\")\n", + " .setIdCol(\"id\")\n", + " .fit(embeddings)\n", + ")" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "17b3890f-4163-443c-929b-252d62a6c736", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 6: Compose a Question." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8826a0fb-7b41-47a9-8d65-8885dcb1248d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n", + "\n", + "task_name_to_instruct = {\n", + " \"example\": \"Given a question, retrieve passages from the provided context that answer the question\",\n", + "}\n", + "\n", + "query_prefix = \"Instruct: \" + task_name_to_instruct[\"example\"] + \"\\nQuery: \"\n", + "\n", + "encode_udf = create_encode_udf(query_prefix)\n", + "\n", + "user_question = \"What did the astronaut Edgar Mitchell call Earth?\"\n", + "# Define schema explicitly\n", + "schema = StructType(\n", + " [StructField(\"id\", IntegerType(), True), StructField(\"query\", StringType(), True)]\n", + ")\n", + "\n", + "# Create DataFrame with id = 1 and the user query\n", + "temp_df = spark.createDataFrame([(1, user_question)], schema).cache()\n", + "\n", + "# Apply the UDF to generate the embeddings\n", + "query_embeddings = temp_df.withColumn(\"embeddings\", encode_udf(col(\"query\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "45f79485-be0f-4b89-9c11-79f9102436e7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 7: Find chunks with the 
closest context to the question using embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "45f7b558-4c32-4e08-807b-9e568dcde8df", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "(_, _, knn_df) = rapids_knn_model.kneighbors(\n", + " query_embeddings.select(\"id\", \"embeddings\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "da023b2f-d7d7-4937-8139-6ec999a77cc6", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Add text to the results\n", + "result_df = (\n", + " knn_df.withColumn(\n", + " \"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\")))\n", + " )\n", + " .select(\n", + " F.col(\"query_id\"),\n", + " F.col(\"zipped.indices\").alias(\"id\"),\n", + " F.col(\"zipped.distances\").alias(\"distance\"),\n", + " )\n", + " .join(embeddings, on=\"id\", how=\"inner\")\n", + " .select(\"query_id\", \"id\", \"chunk\", \"distance\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0180ef1c-3d59-4922-b918-80eaf7badd9d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Concatenate all strings in the 'combined_text' column across all question related chunks\n", + "concatenated_text = result_df.agg(\n", + " concat_ws(\" \", collect_list(\"chunk\")).alias(\"concatenated_text\")\n", + ").collect()[0][\"concatenated_text\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + 
"cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "79356cff-a236-4ef3-91f7-a601ee38d5f9", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 8: Respond to a User’s Question using microsoft/Phi-3-mini-4k-instruct LLM from Hugging Face" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0b8b0fda-bca7-4cd1-ae0f-8438ca2cbf3b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from tensorrt_llm import LLM, SamplingParams, BuildConfig\n", + "\n", + "# Put model in global if we want to reuse it\n", + "global llm\n", + "\n", + "if \"llm\" in globals() and llm is not None:\n", + " print(\"Model is already loaded.\")\n", + "else:\n", + " print(\"Model is not loaded.\")\n", + "\n", + " # Extend model input sizes\n", + " build_config = BuildConfig()\n", + " build_config.plugin_config.context_fmha = True\n", + " build_config.max_input_len = 5120\n", + " build_config.max_seq_len = 5632\n", + "\n", + " llm = LLM(model=\"microsoft/Phi-3-mini-4k-instruct\", build_config=build_config)\n", + "\n", + "sampling_params = SamplingParams(temperature=0.8, top_p=0.95)\n", + "\n", + "context = concatenated_text\n", + "query = \"What did the astronaut Edgar Mitchell call Earth?\"\n", + "\n", + "prompt = f\"\"\"\n", + "context: {context}\n", + "Answer the question based only on the context above. Without multiple choices. If the\n", + "information to answer the question is not present in the given context then reply \"I don't know\".\n", + "My Question: {query}\n", + "What is your Answer? 
\"\"\"\n", + "\n", + "outputs = llm.generate(prompt, sampling_params)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2c65275c-17dc-4a30-83eb-ee5b6695a540", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 9: Print LLM results" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b7fb3394-289f-4949-835e-3520323a770d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "output_text = outputs.outputs[0].text\n", + "\n", + "# Split the text by '\\n'\n", + "split_text = output_text.split(\"\\n\")\n", + "\n", + "for item in split_text:\n", + " if len(item) > 10:\n", + " # Split the item at the colon and take the part after it\n", + " result = item.split(\":\", 1)[-1].strip()\n", + " print(\"Answer: \" + result)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "412d83cc-4fe9-455e-ad3d-7780ed262dac", + "showTitle": false, + "title": "" + } + }, + "source": [ + "We can now wrap up the Q&A journey by asking a question and checking the answer. You will see that Edgar Mitchell called Earth \"a sparkling blue and white jewel\"!" 
+ ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "client": "1" + }, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "QuickStart - Distributed Question - Answering with LLM on GPU", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From c2e0646c48ef49a22de6c56fe3737195577b951a Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:23:00 -0700 Subject: [PATCH 63/83] Update init-rapidsml-cuda-11.8.sh Added demo3 changes --- tools/init_scripts/init-rapidsml-cuda-11.8.sh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/init_scripts/init-rapidsml-cuda-11.8.sh b/tools/init_scripts/init-rapidsml-cuda-11.8.sh index bcb8fdc93e..f8dd710ce1 100644 --- a/tools/init_scripts/init-rapidsml-cuda-11.8.sh +++ b/tools/init_scripts/init-rapidsml-cuda-11.8.sh @@ -16,7 +16,7 @@ # IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10 # also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 
23.08.2 and not 23.8.2) -RAPIDS_VERSION=24.4.0 +RAPIDS_VERSION=24.6.0 SPARK_RAPIDS_VERSION=23.10.0 SPARK_RAPIDSML_VERSION=24.6.0 @@ -46,3 +46,16 @@ ln -s /usr/local/cuda-11.8 /usr/local/cuda # install spark-rapids-ml /databricks/python/bin/pip install spark-rapids-ml~=${SPARK_RAPIDSML_VERSION} + +# install TRT-LLM +/databricks/python/bin/pip install --upgrade cython +/databricks/python/bin/pip install --pre --no-build-isolation --extra-index-url https://pypi.nvidia.com mpi4py +# /databricks/python/bin/pip install --pre --extra-index-url https://pypi.nvidia.com tensorrt-llm==0.12.0.dev2024073000 +/databricks/python/bin/pip install --pre --extra-index-url https://pypi.nvidia.com tensorrt-llm + +# Required by NV-Embed +/databricks/python/bin/pip install --upgrade sentence-transformers +/databricks/python/bin/pip install transformers + +# To work with PDF +/databricks/python/bin/pip install PyMuPDF From c17be8f9e6ada4a4b58685e6d581493a363ad5f9 Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:25:44 -0700 Subject: [PATCH 64/83] Update sidebars.js Adde demo3 (QuickStart - Distributed Question - Answering with LLM on GPU) --- website/sidebars.js | 1 + 1 file changed, 1 insertion(+) diff --git a/website/sidebars.js b/website/sidebars.js index 5ef56a0f78..fec9339954 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -40,6 +40,7 @@ module.exports = { "Explore Algorithms/AI Services/Quickstart - Create a Visual Search Engine", "Explore Algorithms/AI Services/Quickstart - Create Audiobooks", "Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs", + "Explore Algorithms/AI Services/Quickstart - Distributed Question - Answering with LLM on GPU", "Explore Algorithms/AI Services/Quickstart - Flooding Risk", "Explore Algorithms/AI Services/Quickstart - Predictive Maintenance", ], From 2f408f4ee8b1714af15352e76d5b88e239cb7fed Mon Sep 17 00:00:00 2001 From: Alexander 
<157773158+bvonodiripsa@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:33:18 -0700 Subject: [PATCH 65/83] change text in Embeddings and Approximate KNN on GPU.ipynb --- ...rt - Custom Embeddings and Approximate KNN on GPU.ipynb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb index e979110a30..ec3fb37b53 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Custom Embeddings and Approximate KNN on GPU.ipynb @@ -15,9 +15,10 @@ } }, "source": [ - "# Embedding Text with local (per node) NVIDIA TensorRT accelerator and GPU based Aproximate Nearest Neighbor (ANN)\n", + "# Embedding with local (per node) NVIDIA TensorRT accelerator and GPU based Approximate Nearest Neighbor (ANN)\n", "\n", - "The demo extending existing [Azure OpenAI based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding%20and%20GPU%20based%20KNN.ipynb) when encoding is processed by OpenAI requests and KNN was using GPU based brute force search. This tutorial shows how to perform fast local embeddings using [multilingual E5 text embeddings](https://arxiv.org/abs/2402.05672) and fast aproximate Nearest Neighbor search using IVFFlat alcorithm. All tutorial stages accelerated by NVIDIA GPU using [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) and [Spark Rapids ML](https://github.com/NVIDIA/spark-rapids-ml). 
The tutorial folder contains two benchmark notebooks to demonstrate advantages of the presented GPU based approach compare to [previos CPU based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding.ipynb)\n", + "The demo extending existing [Azure OpenAI based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding%20and%20GPU%20based%20KNN.ipynb). Now encoding is processed by local embedders from Hugging Face and KNN is using GPU accelerated approximate method using IVFFlat algorithm. All tutorial stages accelerated by NVIDIA GPU using NVIDIA TensorRT and Spark Rapids ML.\n", + " All tutorial stages accelerated by NVIDIA GPU using [NVIDIA TensorRT](https://developer.nvidia.com/tensorrt) and [Spark Rapids ML](https://github.com/NVIDIA/spark-rapids-ml). The tutorial folder contains two benchmark notebooks to demonstrate advantages of the presented GPU based approach compare to [previous CPU based demo](https://github.com/microsoft/SynapseML/blob/master/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding.ipynb)\n", "\n", "The key prerequisites for this quickstart include a working Azure OpenAI resource, and an Apache Spark cluster with SynapseML installed. We suggest creating a Synapse workspace, but currently the notebook was running on Databricks GPU based cluster using Standard_NC24ads_A100_v4 with 6 workers. Databricks Runtime was 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) with related [init_script](https://github.com/microsoft/SynapseML/tree/master/tools/init_scripts) to install all required packages.\n" ] @@ -466,7 +467,7 @@ "\n", "The goal of this demo is to showcase two acceleration techniques: local (per node) embedding generation and approximate KNN. Compared to the original method, which relies on HTTP requests to the OpenAI model and CPU-based KNN. 
The new approach is significantly more scalable and provides substantial acceleration, especially for large input datasets.\n", "\n", - "This is the comparison dureation results on 10 T4 GPU nodes for both approaches:\n", + "This is the comparison duration results on 10 T4 GPU nodes for both approaches:\n", "\n", "![KNN Comparison](https://mmlspark.blob.core.windows.net/graphics/Documentation/knn_comparison.png)\n", "\n", From 3d13751996b33e1f9f76bca42b7139551db47fea Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Tue, 27 Aug 2024 17:20:19 -0700 Subject: [PATCH 66/83] Demo3 with correct name --- ...Question - Answering with LLM on GPU.ipynb | 812 ++++++++++++++++++ 1 file changed, 812 insertions(+) create mode 100644 docs/Explore Algorithms/AI Services/Quickstart - Distributed Question - Answering with LLM on GPU.ipynb diff --git a/docs/Explore Algorithms/AI Services/Quickstart - Distributed Question - Answering with LLM on GPU.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Distributed Question - Answering with LLM on GPU.ipynb new file mode 100644 index 0000000000..bb9575d250 --- /dev/null +++ b/docs/Explore Algorithms/AI Services/Quickstart - Distributed Question - Answering with LLM on GPU.ipynb @@ -0,0 +1,812 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": 
"b4000620-9ea1-45aa-be4f-ddb971cc708e", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Introduction\n", + "In this notebook, we'll demonstrate how to develop a context-aware question answering framework using distributed local LLM embedding and answer generation using Hugging Face models: [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) and [NV-Embed-v1](https://huggingface.co/bzantium/NV-Embed-v1). This notebook extends the document Question and Answering demo to use only local models for scalability and acceleration. The Question and Answering context is based on NASA's Earth and Earth at Night e-books. \n", + "\n", + "We’ll cover the following key stages:\n", + "\n", + "1. Load PDF documents using PyMUPDF library.\n", + "2. Use SynapseML to split the documents into chunks.\n", + "3. Generate chunk and user question embeddings using NV-Embed-V1 embedder\n", + "4. Using NVIDIA Rapids KNN find chunks related to user questions to define context for LLM answers\n", + "5. 
Using LLM Phi-3 from Microsoft and Tensor-RT GPU accelerator answer user questions using provided context\n", + "\n", + "The demo was tested on NVIDIA A100 based Databricks Azure cluster with two workers based on Standard_NC24ads_A100_v4 using 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) Databricks Runtime.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "db0faebe-2cca-4bd8-ae28-645e69a21bb7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 1: Define the notebook environment" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "530e6ef4-b620-443e-a051-4164aedc43cd", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import fitz\n", + "import pyspark.sql.functions as F\n", + "from pyspark.sql.types import ArrayType, FloatType, StringType\n", + "from pyspark.sql.functions import (\n", + " explode,\n", + " col,\n", + " monotonically_increasing_id,\n", + " concat_ws,\n", + " collect_list,\n", + ")\n", + "from pyspark.ml.functions import predict_batch_udf\n", + "from sentence_transformers import SentenceTransformer\n", + "from synapse.ml.featurize.text import PageSplitter\n", + "from spark_rapids_ml.knn import (\n", + " ApproximateNearestNeighbors,\n", + " ApproximateNearestNeighborsModel,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "97f056e7-9f88-45b9-b6b2-95be8c7fccac", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 2: Load the documents into a 
Spark DataFrame." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "eb6519d4-f03a-4359-8a6f-4922bfeedbf5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "For this tutorial, we will be using NASA's [Earth](https://www.nasa.gov/sites/default/files/atoms/files/earth_book_2019_tagged.pdf) and [Earth at Night](https://www.nasa.gov/sites/default/files/atoms/files/earth_at_night_508.pdf) e-books. To load PDF documents into a Spark DataFrame, you can use the ```spark.read.format(\"binaryFile\")``` method provided by Apache Spark." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fb39f605-39f8-46d1-a9d3-28b854586852", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "document_path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/NASAEarth\" # path to your document\n", + "df = spark.read.format(\"binaryFile\").load(document_path).cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "34e06daf-e9e7-4144-b956-e57bde8fab77", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 3: Read the document context and convert it from PDF to text using PyMUPDF library." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "304ed77d-a032-4620-a74d-65a277caeaf7", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "We utilize PyMUPDF library (fitz) to do PDF to Text conversion" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "04b58ec9-8a8e-4575-9df9-c8e84c6c4a64", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Define the function to extract text from binary PDF data\n", + "def extract_text_from_binary_pdf(binary_content):\n", + " try:\n", + " # Create a PyMuPDF document from the binary data\n", + " doc = fitz.open(stream=binary_content, filetype=\"pdf\")\n", + " text = \"\"\n", + " for page in doc:\n", + " text += page.get_text()\n", + " return text\n", + " except Exception as e:\n", + " return str(e)\n", + "\n", + "\n", + "# Register the function as a UDF\n", + "extract_text_udf = udf(extract_text_from_binary_pdf, StringType())\n", + "\n", + "\n", + "# Apply the UDF to extract text from the binary content\n", + "analyzed_df = df.withColumn(\"output_content\", extract_text_udf(df[\"content\"]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d26e4217-ac87-4583-9500-af65d969c199", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "We can split Spark DataFrame named ```analyzed_df``` in chunks to make book analysed context smaller (3000 - 4000 char) using the following code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1b471060-8175-492e-bbb3-5b3529480b33", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "ps = (\n", + " PageSplitter()\n", + " .setInputCol(\"output_content\")\n", + " .setMaximumPageLength(4000)\n", + " .setMinimumPageLength(3000)\n", + " .setOutputCol(\"chunks\")\n", + ")\n", + "\n", + "splitted_df = ps.transform(analyzed_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d51caf1d-322e-480b-8391-d266aed6401e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Each column contains many chunks for the same document as a vector.\n", + "# Explode will distribute and replicate the content of a vector across multiple rows\n", + "# Add id column\n", + "\n", + "exploded_df = (\n", + " splitted_df.select(\"path\", explode(col(\"chunks\")).alias(\"chunk\"))\n", + " .select(\"path\", \"chunk\")\n", + " .withColumn(\"id\", monotonically_increasing_id())\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1e5b0f56-0a64-4e4a-86f2-b647e82b41ce", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 4: Generate Embeddings." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ebba439c-9503-46d7-bafb-f7fa790974a8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "To produce embeddings for each chunk, we utilize NVIDIA NV-Embed-V1 embedder from Hugging Face" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1f41cd67-1a27-4e69-959a-e5002b4fbbaf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Define a function to create the encode_udf with a custom query_prefix\n", + "def create_encode_udf(query_prefix):\n", + " # Define a function to encode text in batches\n", + " # def encode_text_batch(texts):\n", + " def encode_text_batch():\n", + " # Load the model inside the function\n", + " model = SentenceTransformer(\"bzantium/NV-Embed-v1\", trust_remote_code=True)\n", + " model.max_seq_length = 4096\n", + " model.tokenizer.padding_side = \"right\"\n", + "\n", + " def predict(inputs):\n", + "\n", + " output = model.encode(\n", + " inputs.tolist(), prompt=query_prefix, normalize_embeddings=True\n", + " )\n", + " return output\n", + "\n", + " return predict\n", + "\n", + " # # Encode the texts in batch\n", + " # embeddings = model.encode(inputs.tolist(), normalize_embeddings=True)\n", + " # return [embedding.tolist() for embedding in embeddings]\n", + "\n", + " # Define the predict_batch_udf with the above function\n", + " return predict_batch_udf(\n", + " encode_text_batch, return_type=ArrayType(FloatType()), batch_size=1\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + 
"nuid": "defe1c52-1637-4b55-aae5-00174057f1e4", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Use it without query_prefix in this case\n", + "query_prefix = \"\"\n", + "encode_udf = create_encode_udf(query_prefix)\n", + "\n", + "# Applying the UDF to a DataFrame chunk column\n", + "embeddings = exploded_df.withColumn(\"embeddings\", encode_udf(col(\"chunk\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "183c4c45-03bf-42d0-9c10-24e5fe9842da", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 5: Use chunk embeddings to create KNN search model to find chunks related to user query " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5d725803-3475-4b97-aebc-24ae909eebbc", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "rapids_knn_model = (\n", + " ApproximateNearestNeighbors(k=2)\n", + " .setInputCol(\"embeddings\")\n", + " .setIdCol(\"id\")\n", + " .fit(embeddings)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "17b3890f-4163-443c-929b-252d62a6c736", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 6: Compose a Question." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8826a0fb-7b41-47a9-8d65-8885dcb1248d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n", + "\n", + "task_name_to_instruct = {\n", + " \"example\": \"Given a question, retrieve passages from the provided context that answer the question\",\n", + "}\n", + "\n", + "query_prefix = \"Instruct: \" + task_name_to_instruct[\"example\"] + \"\\nQuery: \"\n", + "\n", + "encode_udf = create_encode_udf(query_prefix)\n", + "\n", + "user_question = \"What did the astronaut Edgar Mitchell call Earth?\"\n", + "# Define schema explicitly\n", + "schema = StructType(\n", + " [StructField(\"id\", IntegerType(), True), StructField(\"query\", StringType(), True)]\n", + ")\n", + "\n", + "# Create DataFrame with id = 1 and the user query\n", + "temp_df = spark.createDataFrame([(1, user_question)], schema).cache()\n", + "\n", + "# Apply the UDF to generate the embeddings\n", + "query_embeddings = temp_df.withColumn(\"embeddings\", encode_udf(col(\"query\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "45f79485-be0f-4b89-9c11-79f9102436e7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 7: Find chunks with the closest context to the question using embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "45f7b558-4c32-4e08-807b-9e568dcde8df", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ 
+ "(_, _, knn_df) = rapids_knn_model.kneighbors(\n", + " query_embeddings.select(\"id\", \"embeddings\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "da023b2f-d7d7-4937-8139-6ec999a77cc6", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Add text to the results\n", + "result_df = (\n", + " knn_df.withColumn(\n", + " \"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\")))\n", + " )\n", + " .select(\n", + " F.col(\"query_id\"),\n", + " F.col(\"zipped.indices\").alias(\"id\"),\n", + " F.col(\"zipped.distances\").alias(\"distance\"),\n", + " )\n", + " .join(embeddings, on=\"id\", how=\"inner\")\n", + " .select(\"query_id\", \"id\", \"chunk\", \"distance\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0180ef1c-3d59-4922-b918-80eaf7badd9d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Concatenate all strings in the 'combined_text' column across all question related chunks\n", + "concatenated_text = result_df.agg(\n", + " concat_ws(\" \", collect_list(\"chunk\")).alias(\"concatenated_text\")\n", + ").collect()[0][\"concatenated_text\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "79356cff-a236-4ef3-91f7-a601ee38d5f9", + "showTitle": false, + "title": "" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "### Step 8: Respond to a User’s Question using microsoft/Phi-3-mini-4k-instruct LLM from Hugging Face" + ] + }, + { + "cell_type": 
"code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0b8b0fda-bca7-4cd1-ae0f-8438ca2cbf3b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from tensorrt_llm import LLM, SamplingParams, BuildConfig\n", + "\n", + "# Put model in global if we want to reuse it\n", + "global llm\n", + "\n", + "if \"llm\" in globals() and llm is not None:\n", + " print(\"Model is already loaded.\")\n", + "else:\n", + " print(\"Model is not loaded.\")\n", + "\n", + " # Extend model input sizes\n", + " build_config = BuildConfig()\n", + " build_config.plugin_config.context_fmha = True\n", + " build_config.max_input_len = 5120\n", + " build_config.max_seq_len = 5632\n", + "\n", + " llm = LLM(model=\"microsoft/Phi-3-mini-4k-instruct\", build_config=build_config)\n", + "\n", + "sampling_params = SamplingParams(temperature=0.8, top_p=0.95)\n", + "\n", + "context = concatenated_text\n", + "query = \"What did the astronaut Edgar Mitchell call Earth?\"\n", + "\n", + "prompt = f\"\"\"\n", + "context: {context}\n", + "Answer the question based only on the context above. Without multiple choices. If the\n", + "information to answer the question is not present in the given context then reply \"I don't know\".\n", + "My Question: {query}\n", + "What is your Answer? 
\"\"\"\n", + "\n", + "outputs = llm.generate(prompt, sampling_params)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2c65275c-17dc-4a30-83eb-ee5b6695a540", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Step 9: Print LLM results" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b7fb3394-289f-4949-835e-3520323a770d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "output_text = outputs.outputs[0].text\n", + "\n", + "# Split the text by '\\n'\n", + "split_text = output_text.split(\"\\n\")\n", + "\n", + "for item in split_text:\n", + " if len(item) > 10:\n", + " # Split the item at the colon and take the part after it\n", + " result = item.split(\":\", 1)[-1].strip()\n", + " print(\"Answer: \" + result)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "412d83cc-4fe9-455e-ad3d-7780ed262dac", + "showTitle": false, + "title": "" + } + }, + "source": [ + "We can now wrap up the Q&A journey by asking a question and checking the answer. You will see that Edgar Mitchell called Earth \"a sparkling blue and white jewel\"!" 
+ ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "client": "1" + }, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "QuickStart - Distributed Question - Answering with LLM on GPU", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 18fd9b3fc115b70d67da3de9db054687a6cdd717 Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Tue, 27 Aug 2024 17:21:08 -0700 Subject: [PATCH 67/83] Delete docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb --- ...Question - Answering with LLM on GPU.ipynb | 812 ------------------ 1 file changed, 812 deletions(-) delete mode 100644 docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb diff --git a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb deleted file mode 100644 index bb9575d250..0000000000 --- a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb +++ /dev/null @@ -1,812 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "6b31dee8-67e3-4bb7-a501-269c69c80d3f", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "# A Guide to Q&A using Retrieval-Augmented Generation (RAG) with distributed local LLM embedding and generation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { 
- "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "b4000620-9ea1-45aa-be4f-ddb971cc708e", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Introduction\n", - "In this notebook, we'll demonstrate how to develop a context-aware question answering framework using distributed local LLM embedding and answer generation using Hugging Face models: [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) and [NV-Embed-v1](https://huggingface.co/bzantium/NV-Embed-v1). This notebook extending document Question and Answering demo to use only local models for scalability and acceleration. Question and Answering contect is based on NASA's Earth and Earth at Night e-books. \n", - "\n", - "We’ll cover the following key stages:\n", - "\n", - "1. Load PDF documents using PyMUPDF library.\n", - "2. Use SynapseML to split the documents into chunks.\n", - "3. Generate chunk and user question embeddings using NV-Embed-V1 embedder\n", - "4. Using NVIDIA Rapids KNN find chunks related to user questions to define context for LLM answers\n", - "5. 
Using LLM Phi-3 from Microsoft and Tensor-RT GPU accelerator answer user questions using provided context\n", - "\n", - "The demo was tested on NVIDIA A100 based Databricks Azure cluster with two workers based on Standard_NC24ads_A100_v4 using 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) Databricks Runtime.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "db0faebe-2cca-4bd8-ae28-645e69a21bb7", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Step 1: Define the notebook environment" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "530e6ef4-b620-443e-a051-4164aedc43cd", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import fitz\n", - "import pyspark.sql.functions as F\n", - "from pyspark.sql.types import ArrayType, FloatType, StringType\n", - "from pyspark.sql.functions import (\n", - " explode,\n", - " col,\n", - " monotonically_increasing_id,\n", - " concat_ws,\n", - " collect_list,\n", - ")\n", - "from pyspark.ml.functions import predict_batch_udf\n", - "from sentence_transformers import SentenceTransformer\n", - "from synapse.ml.featurize.text import PageSplitter\n", - "from spark_rapids_ml.knn import (\n", - " ApproximateNearestNeighbors,\n", - " ApproximateNearestNeighborsModel,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "97f056e7-9f88-45b9-b6b2-95be8c7fccac", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 2: Load the documents into a 
Spark DataFrame." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "eb6519d4-f03a-4359-8a6f-4922bfeedbf5", - "showTitle": false, - "title": "" - } - }, - "source": [ - "For this tutorial, we will be using NASA's [Earth](https://www.nasa.gov/sites/default/files/atoms/files/earth_book_2019_tagged.pdf) and [Earth at Night](https://www.nasa.gov/sites/default/files/atoms/files/earth_at_night_508.pdf) e-books. To load PDF documents into a Spark DataFrame, you can use the ```spark.read.format(\"binaryFile\")``` method provided by Apache Spark." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "fb39f605-39f8-46d1-a9d3-28b854586852", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "document_path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/NASAEarth\" # path to your document\n", - "df = spark.read.format(\"binaryFile\").load(document_path).cache()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "34e06daf-e9e7-4144-b956-e57bde8fab77", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 3: Read the document context and convert it from PDF to text using PyMUPDF library." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "304ed77d-a032-4620-a74d-65a277caeaf7", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "We utilize PyMUPDF library (fitz) to do PDF to Text conversion" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "04b58ec9-8a8e-4575-9df9-c8e84c6c4a64", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Define the function to extract text from binary PDF data\n", - "def extract_text_from_binary_pdf(binary_content):\n", - " try:\n", - " # Create a PyMuPDF document from the binary data\n", - " doc = fitz.open(stream=binary_content, filetype=\"pdf\")\n", - " text = \"\"\n", - " for page in doc:\n", - " text += page.get_text()\n", - " return text\n", - " except Exception as e:\n", - " return str(e)\n", - "\n", - "\n", - "# Register the function as a UDF\n", - "extract_text_udf = udf(extract_text_from_binary_pdf, StringType())\n", - "\n", - "\n", - "# Apply the UDF to extract text from the binary content\n", - "analyzed_df = df.withColumn(\"output_content\", extract_text_udf(df[\"content\"]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "d26e4217-ac87-4583-9500-af65d969c199", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "We can split Spark DataFrame named ```analyzed_df``` in chunks to make book analysed context smaller (3000 - 4000 char) using the following code." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "1b471060-8175-492e-bbb3-5b3529480b33", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "ps = (\n", - " PageSplitter()\n", - " .setInputCol(\"output_content\")\n", - " .setMaximumPageLength(4000)\n", - " .setMinimumPageLength(3000)\n", - " .setOutputCol(\"chunks\")\n", - ")\n", - "\n", - "splitted_df = ps.transform(analyzed_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "d51caf1d-322e-480b-8391-d266aed6401e", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Each column contains many chunks for the same document as a vector.\n", - "# Explode will distribute and replicate the content of a vecor across multple rows\n", - "# Add id column\n", - "\n", - "exploded_df = (\n", - " splitted_df.select(\"path\", explode(col(\"chunks\")).alias(\"chunk\"))\n", - " .select(\"path\", \"chunk\")\n", - " .withColumn(\"id\", monotonically_increasing_id())\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "1e5b0f56-0a64-4e4a-86f2-b647e82b41ce", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 4: Generate Embeddings." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "ebba439c-9503-46d7-bafb-f7fa790974a8", - "showTitle": false, - "title": "" - } - }, - "source": [ - "To produce embeddings for each chunk, we utilize NVIDIA NV-Embed-V1 embedder from Hugging Face" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "1f41cd67-1a27-4e69-959a-e5002b4fbbaf", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Define a function to create the encode_udf with a custom query_prefix\n", - "def create_encode_udf(query_prefix):\n", - " # Define a function to encode text in batches\n", - " # def encode_text_batch(texts):\n", - " def encode_text_batch():\n", - " # Load the model inside the function\n", - " model = SentenceTransformer(\"bzantium/NV-Embed-v1\", trust_remote_code=True)\n", - " model.max_seq_length = 4096\n", - " model.tokenizer.padding_side = \"right\"\n", - "\n", - " def predict(inputs):\n", - "\n", - " output = model.encode(\n", - " inputs.tolist(), prompt=query_prefix, normalize_embeddings=True\n", - " )\n", - " return output\n", - "\n", - " return predict\n", - "\n", - " # # Encode the texts in batch\n", - " # embeddings = model.encode(inputs.tolist(), normalize_embeddings=True)\n", - " # return [embedding.tolist() for embedding in embeddings]\n", - "\n", - " # Define the predict_batch_udf with the above function\n", - " return predict_batch_udf(\n", - " encode_text_batch, return_type=ArrayType(FloatType()), batch_size=1\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - 
"nuid": "defe1c52-1637-4b55-aae5-00174057f1e4", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Use it withhout query_prefix in this case\n", - "query_prefix = \"\"\n", - "encode_udf = create_encode_udf(query_prefix)\n", - "\n", - "# Applying the UDF to a DataFrame chunk column\n", - "embeddings = exploded_df.withColumn(\"embeddings\", encode_udf(col(\"chunk\")))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "183c4c45-03bf-42d0-9c10-24e5fe9842da", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Step 5: Use chunk embeddings to create KNN search model to find chunks related to user query " - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "5d725803-3475-4b97-aebc-24ae909eebbc", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "rapids_knn_model = (\n", - " ApproximateNearestNeighbors(k=2)\n", - " .setInputCol(\"embeddings\")\n", - " .setIdCol(\"id\")\n", - " .fit(embeddings)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "17b3890f-4163-443c-929b-252d62a6c736", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 6: Compose a Question." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "8826a0fb-7b41-47a9-8d65-8885dcb1248d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n", - "\n", - "task_name_to_instruct = {\n", - " \"example\": \"Given a question, retrieve passages from the provided context that answer the question\",\n", - "}\n", - "\n", - "query_prefix = \"Instruct: \" + task_name_to_instruct[\"example\"] + \"\\nQuery: \"\n", - "\n", - "encode_udf = create_encode_udf(query_prefix)\n", - "\n", - "user_question = \"What did the astronaut Edgar Mitchell call Earth?\"\n", - "# Define schema explicitly\n", - "schema = StructType(\n", - " [StructField(\"id\", IntegerType(), True), StructField(\"query\", StringType(), True)]\n", - ")\n", - "\n", - "# Create DataFrame with id = 1 and the user query\n", - "temp_df = spark.createDataFrame([(1, user_question)], schema).cache()\n", - "\n", - "# Apply the UDF to generate the embeddings\n", - "query_embeddings = temp_df.withColumn(\"embeddings\", encode_udf(col(\"query\")))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "45f79485-be0f-4b89-9c11-79f9102436e7", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Step 7: Find chunks with the closest context to the question using embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "45f7b558-4c32-4e08-807b-9e568dcde8df", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ 
- "(_, _, knn_df) = rapids_knn_model.kneighbors(\n", - " query_embeddings.select(\"id\", \"embeddings\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "da023b2f-d7d7-4937-8139-6ec999a77cc6", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Add text to the results\n", - "result_df = (\n", - " knn_df.withColumn(\n", - " \"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\")))\n", - " )\n", - " .select(\n", - " F.col(\"query_id\"),\n", - " F.col(\"zipped.indices\").alias(\"id\"),\n", - " F.col(\"zipped.distances\").alias(\"distance\"),\n", - " )\n", - " .join(embeddings, on=\"id\", how=\"inner\")\n", - " .select(\"query_id\", \"id\", \"chunk\", \"distance\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "0180ef1c-3d59-4922-b918-80eaf7badd9d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Concatenate all strings in the 'combined_text' column across all question related chunks\n", - "concatenated_text = result_df.agg(\n", - " concat_ws(\" \", collect_list(\"chunk\")).alias(\"concatenated_text\")\n", - ").collect()[0][\"concatenated_text\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "79356cff-a236-4ef3-91f7-a601ee38d5f9", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 8: Respond to a User’s Question using microsoft/Phi-3-mini-4k-instruct LLM from Hugging Face" - ] - }, - { - "cell_type": 
"code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "0b8b0fda-bca7-4cd1-ae0f-8438ca2cbf3b", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from tensorrt_llm import LLM, SamplingParams, BuildConfig\n", - "\n", - "# Put model in global if we want to reuse it\n", - "global llm\n", - "\n", - "if \"llm\" in globals() and llm is not None:\n", - " print(\"Model is already loaded.\")\n", - "else:\n", - " print(\"Model is not loaded.\")\n", - "\n", - " # Extend model input sizes\n", - " build_config = BuildConfig()\n", - " build_config.plugin_config.context_fmha = True\n", - " build_config.max_input_len = 5120\n", - " build_config.max_seq_len = 5632\n", - "\n", - " llm = LLM(model=\"microsoft/Phi-3-mini-4k-instruct\", build_config=build_config)\n", - "\n", - "sampling_params = SamplingParams(temperature=0.8, top_p=0.95)\n", - "\n", - "context = concatenated_text\n", - "query = \"What did the astronaut Edgar Mitchell call Earth?\"\n", - "\n", - "prompt = f\"\"\"\n", - "context: {context}\n", - "Answer the question based only on the context above. Without multiple choices. If the\n", - "information to answer the question is not present in the given context then reply \"I don't know\".\n", - "My Question: {query}\n", - "What is your Answer? 
\"\"\"\n", - "\n", - "outputs = llm.generate(prompt, sampling_params)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "2c65275c-17dc-4a30-83eb-ee5b6695a540", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Step 9: Print LLM results" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "b7fb3394-289f-4949-835e-3520323a770d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "output_text = outputs.outputs[0].text\n", - "\n", - "# Split the text by '\\n'\n", - "split_text = output_text.split(\"\\n\")\n", - "\n", - "for item in split_text:\n", - " if len(item) > 10:\n", - " # Split the item at the colon and take the part after it\n", - " result = item.split(\":\", 1)[-1].strip()\n", - " print(\"Answer: \" + result)\n", - " break" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "412d83cc-4fe9-455e-ad3d-7780ed262dac", - "showTitle": false, - "title": "" - } - }, - "source": [ - "We can now wrap up the Q&A journey by asking a question and checking the answer. You will see that Edgar Mitchell called Earth \"a sparkling blue and white jewel\"!" 
- ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "environmentMetadata": { - "base_environment": "", - "client": "1" - }, - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 4 - }, - "notebookName": "QuickStart - Distributed Question - Answering with LLM on GPU", - "widgets": {} - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From 59568a0e7494440aa87829a12c43b33fb19918ce Mon Sep 17 00:00:00 2001 From: bvonodiripsa Date: Fri, 18 Oct 2024 20:42:23 +0000 Subject: [PATCH 68/83] Fix batch size --- .../main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py index 69eca7add1..16ffe45642 100644 --- a/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py +++ b/deep-learning/src/main/python/synapse/ml/hf/HuggingFaceSentenceEmbedder.py @@ -144,7 +144,7 @@ def _get_dataloader(): input_data = self.optData return [ ( - 0, + BATCH_SIZE_DEFAULT, ( input_data, {"show_progress_bar": False, "batch_size": self.getBatchSize()}, From 9cf90b117a00e1f38625addb2f1510764d5b5696 Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:28:41 -0700 Subject: [PATCH 69/83] Update OpenAIPrompt.scala to match main --- .../synapse/ml/services/openai/OpenAIPrompt.scala | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPrompt.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPrompt.scala index c307f3bf35..a43f3ffe3a 100644 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPrompt.scala +++ 
b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPrompt.scala @@ -16,7 +16,7 @@ import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transf import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{DataType, StructType} -import org.apache.spark.sql.{Column, DataFrame, Dataset, functions => F, types => T} +import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, functions => F, types => T} import scala.collection.JavaConverters._ @@ -200,6 +200,16 @@ class OpenAIPrompt(override val uid: String) extends Transformer completion } + override protected def prepareEntity: Row => Option[AbstractHttpEntity] = { + r => + openAICompletion match { + case chatCompletion: OpenAIChatCompletion => + chatCompletion.prepareEntity(r) + case completion: OpenAICompletion => + completion.prepareEntity(r) + } + } + private def getParser: OutputParser = { val opts = getPostProcessingOptions From 9567a7b6f3943ddb487473c1718b8a602dc378e4 Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:31:08 -0700 Subject: [PATCH 70/83] Update OpenAIChatCompletion.scala to sync with main --- .../synapse/ml/services/openai/OpenAIChatCompletion.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala index 5643deb8f8..cfc6af9eec 100644 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala +++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala @@ -37,7 +37,7 @@ class OpenAIChatCompletion(override val uid: String) extends OpenAIServicesBase( s"${getUrl}openai/deployments/${getValue(row, 
deploymentName)}/chat/completions" } - override protected def prepareEntity: Row => Option[AbstractHttpEntity] = { + override protected[openai] def prepareEntity: Row => Option[AbstractHttpEntity] = { r => lazy val optionalParams: Map[String, Any] = getOptionalParams(r) val messages = r.getAs[Seq[Row]](getMessagesCol) @@ -65,6 +65,3 @@ class OpenAIChatCompletion(override val uid: String) extends OpenAIServicesBase( } } - - - From 129c510af4b874149d647122bcceab2089ee3612 Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:32:47 -0700 Subject: [PATCH 71/83] Update OpenAICompletion.scala to match main --- .../azure/synapse/ml/services/openai/OpenAICompletion.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala index 0e7554f9b6..9773dea58b 100644 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala +++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala @@ -37,7 +37,7 @@ class OpenAICompletion(override val uid: String) extends OpenAIServicesBase(uid) s"${getUrl}openai/deployments/${getValue(row, deploymentName)}/completions" } - override protected def prepareEntity: Row => Option[AbstractHttpEntity] = { + override protected[openai] def prepareEntity: Row => Option[AbstractHttpEntity] = { r => lazy val optionalParams: Map[String, Any] = getOptionalParams(r) getValueOpt(r, prompt) @@ -61,4 +61,3 @@ class OpenAICompletion(override val uid: String) extends OpenAIServicesBase(uid) new StringEntity(fullPayload.toJson.compactPrint, ContentType.APPLICATION_JSON) } } - From d2c6446b172e377fca6527a8f79de4eedf99c1bb Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 
19 Oct 2024 15:33:59 -0700 Subject: [PATCH 72/83] Update OpenAICompletion.scala --- .../azure/synapse/ml/services/openai/OpenAICompletion.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala index 9773dea58b..edf0566142 100644 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala +++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala @@ -61,3 +61,6 @@ class OpenAICompletion(override val uid: String) extends OpenAIServicesBase(uid) new StringEntity(fullPayload.toJson.compactPrint, ContentType.APPLICATION_JSON) } } + + + From c1121822f3bdfaa9f2cd8109b40d3610154d7d14 Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:34:38 -0700 Subject: [PATCH 73/83] Update OpenAICompletion.scala --- .../azure/synapse/ml/services/openai/OpenAICompletion.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala index edf0566142..219bc34d87 100644 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala +++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletion.scala @@ -62,5 +62,3 @@ class OpenAICompletion(override val uid: String) extends OpenAIServicesBase(uid) } } - - From 597151a4e82ac724f841ab2aaf148931ece26ce7 Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:35:28 -0700 Subject: [PATCH 74/83] Update OpenAIChatCompletion.scala --- .../azure/synapse/ml/services/openai/OpenAIChatCompletion.scala | 2 
++ 1 file changed, 2 insertions(+) diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala index cfc6af9eec..80afc463d7 100644 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala +++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala @@ -65,3 +65,5 @@ class OpenAIChatCompletion(override val uid: String) extends OpenAIServicesBase( } } + + From f2a21a43156e7d1ea71f6f5946ccdb00943734dc Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:35:55 -0700 Subject: [PATCH 75/83] Update OpenAIChatCompletion.scala --- .../azure/synapse/ml/services/openai/OpenAIChatCompletion.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala index 80afc463d7..c6a5d87e7b 100644 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala +++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala @@ -66,4 +66,3 @@ class OpenAIChatCompletion(override val uid: String) extends OpenAIServicesBase( } - From 711ae070b00c2138497c233c26fd4bb51d88df52 Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:36:43 -0700 Subject: [PATCH 76/83] Update OpenAIChatCompletion.scala --- .../azure/synapse/ml/services/openai/OpenAIChatCompletion.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala 
b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala index c6a5d87e7b..379d797766 100644 --- a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala +++ b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletion.scala @@ -66,3 +66,5 @@ class OpenAIChatCompletion(override val uid: String) extends OpenAIServicesBase( } + + From 232a743883a1a2c91f15a953443c1a11acad827c Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:39:26 -0700 Subject: [PATCH 77/83] Update OpenAIPromptSuite.scala --- .../services/openai/OpenAIPromptSuite.scala | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPromptSuite.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPromptSuite.scala index f14554b9e8..9409d2f358 100644 --- a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPromptSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIPromptSuite.scala @@ -122,6 +122,33 @@ class OpenAIPromptSuite extends TransformerFuzzing[OpenAIPrompt] with OpenAIAPIK .foreach(r => assert(r.get(0) != null)) } + ignore("Custom EndPoint") { + lazy val accessToken: String = sys.env.getOrElse("CUSTOM_ACCESS_TOKEN", "") + lazy val customRootUrlValue: String = sys.env.getOrElse("CUSTOM_ROOT_URL", "") + lazy val customHeadersValues: Map[String, String] = Map("X-ModelType" -> "gpt-4-turbo-chat-completions") + + lazy val customPromptGpt4: OpenAIPrompt = new OpenAIPrompt() + .setCustomUrlRoot(customRootUrlValue) + .setOutputCol("outParsed") + .setTemperature(0) + + if (accessToken.isEmpty) { + customPromptGpt4.setSubscriptionKey(openAIAPIKey) + .setDeploymentName(deploymentNameGpt4) + .setCustomServiceName(openAIServiceName) + } 
else { + customPromptGpt4.setAADToken(accessToken) + .setCustomHeaders(customHeadersValues) + } + + customPromptGpt4.setPromptTemplate("here is a comma separated list of 5 {category}: {text}, ") + .setPostProcessing("csv") + .transform(df) + .select("outParsed") + .collect() + .count(r => Option(r.getSeq[String](0)).isDefined) + } + override def assertDFEq(df1: DataFrame, df2: DataFrame)(implicit eq: Equality[DataFrame]): Unit = { super.assertDFEq(df1.drop("out", "outParsed"), df2.drop("out", "outParsed"))(eq) } From 51bab746cc55bb237d434826e6c78cce912d4b84 Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:41:30 -0700 Subject: [PATCH 78/83] Update SynapseMLLogger.py --- .../ml/core/logging/SynapseMLLogger.py | 73 ++++++++++++++++++- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/core/src/main/python/synapse/ml/core/logging/SynapseMLLogger.py b/core/src/main/python/synapse/ml/core/logging/SynapseMLLogger.py index f1dd6d79bc..37fed9e9da 100644 --- a/core/src/main/python/synapse/ml/core/logging/SynapseMLLogger.py +++ b/core/src/main/python/synapse/ml/core/logging/SynapseMLLogger.py @@ -126,9 +126,18 @@ def _log_base( num_cols: int, execution_sec: float, feature_name: Optional[str] = None, + custom_log_dict: Optional[Dict[str, str]] = None, ): + payload_dict = self._get_payload( + class_name, method_name, num_cols, execution_sec, None + ) + if custom_log_dict: + if shared_keys := set(custom_log_dict.keys()) & set(payload_dict.keys()): + raise ValueError( + f"Shared keys found in custom logger dictionary: {shared_keys}" + ) self._log_base_dict( - self._get_payload(class_name, method_name, num_cols, execution_sec, None), + payload_dict | (custom_log_dict if custom_log_dict else {}), feature_name=feature_name, ) @@ -213,6 +222,66 @@ def log_decorator_wrapper(self, *args, **kwargs): return get_wrapper + @staticmethod + def log_verb_static( + method_name: Optional[str] = None, + feature_name: 
Optional[str] = None, + custom_log_function=None, + ): + def get_wrapper(func): + @functools.wraps(func) + def log_decorator_wrapper(self, *args, **kwargs): + # Validate that self._logger is set + if not hasattr(self, "_logger"): + raise AttributeError( + f"{self.__class__.__name__} does not have a '_logger' attribute. " + "Ensure a _logger instance is initialized in the constructor." + ) + + # Validate custom_log_function for proper definition + if custom_log_function: + if not callable(custom_log_function): + raise ValueError("custom_log_function must be callable") + + logger = self._logger + start_time = time.perf_counter() + try: + result = func(self, *args, **kwargs) + execution_time = logger._round_significant( + time.perf_counter() - start_time, 3 + ) + # Create custom logs if necessary + custom_log_dict = None + if custom_log_function: + custom_log_dict = custom_log_function( + self, result, *args, **kwargs + ) + if not isinstance(custom_log_dict, dict): + raise TypeError( + "custom_log_function must return a Dict[str, str]" + ) + + logger._log_base( + func.__module__, + method_name if method_name else func.__name__, + logger.get_column_number(args, kwargs), + execution_time, + feature_name, + custom_log_dict, + ) + return result + except Exception as e: + logger._log_error_base( + func.__module__, + method_name if method_name else func.__name__, + e, + ) + raise + + return log_decorator_wrapper + + return get_wrapper + def log_class(self, feature_name: str): return self._log_base("constructor", None, None, None, feature_name) @@ -224,7 +293,7 @@ def log_fit(): @classmethod def get_column_number(cls, args, kwargs): - if kwargs and kwargs["df"] and isinstance(kwargs["df"], DataFrame): + if kwargs and kwargs.get("df") and isinstance(kwargs["df"], DataFrame): return len(kwargs["df"].columns) elif args and len(args) > 0 and isinstance(args[0], DataFrame): return len(args[0].columns) From 70d2dcae2eba8d2b199790181386271c7443a866 Mon Sep 17 00:00:00 2001 From: 
Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:42:52 -0700 Subject: [PATCH 79/83] Update test_logging.py --- .../python/synapsemltest/core/test_logging.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/core/src/test/python/synapsemltest/core/test_logging.py b/core/src/test/python/synapsemltest/core/test_logging.py index 83a4b9b1fd..e419fd4965 100644 --- a/core/src/test/python/synapsemltest/core/test_logging.py +++ b/core/src/test/python/synapsemltest/core/test_logging.py @@ -35,6 +35,47 @@ def test_feature_name(self): return 0 +class NoInheritTransformer: + def __init__(self): + self._logger = SynapseMLLogger(log_level=logging.DEBUG) + + @SynapseMLLogger.log_verb_static(method_name="transform") + def transform(self, df): + return True + + @SynapseMLLogger.log_verb_static(method_name="fit") + def fit(self, df): + return True + + @SynapseMLLogger.log_verb_static() + def test_throw(self): + raise Exception("test exception") + + @SynapseMLLogger.log_verb_static(feature_name="test_logging") + def test_feature_name(self): + return 0 + + def custom_logging_function(self, results, *args, **kwargs): + return {"args": f"Arguments: {args}", "result": str(results)} + + @SynapseMLLogger.log_verb_static(custom_log_function=custom_logging_function) + def test_custom_function(self, df): + return 0 + + def custom_logging_function_w_collision(self, results, *args, **kwargs): + return { + "args": f"Arguments: {args}", + "result": str(results), + "className": "this is the collision key", + } + + @SynapseMLLogger.log_verb_static( + custom_log_function=custom_logging_function_w_collision + ) + def test_custom_function_w_collision(self, df): + return 0 + + class LoggingTest(unittest.TestCase): def test_logging_smoke(self): t = SampleTransformer() @@ -49,6 +90,26 @@ def test_logging_smoke(self): assert f"{e}" == "test exception" t.test_feature_name() + def test_log_verb_static(self): + t = NoInheritTransformer() + data = 
[("Alice", 25), ("Bob", 30), ("Charlie", 35)] + columns = ["name", "age"] + df = sc.createDataFrame(data, columns) + t.transform(df) + t.fit(df) + t.test_feature_name() + t.test_custom_function(df) + try: + t.test_throw() + except Exception as e: + assert f"{e}" == "test exception" + try: + t.test_custom_function_w_collision(df) + except Exception as e: + assert ( + f"{e}" == "Shared keys found in custom logger dictionary: {'className'}" + ) + if __name__ == "__main__": result = unittest.main() From 0159897978c3480b9f09f66f86fcc74ce944ce48 Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:45:15 -0700 Subject: [PATCH 80/83] Update DatabricksUtilities.scala --- .../microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala index 7076b6b087..55e0fbdfce 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala @@ -443,7 +443,7 @@ abstract class DatabricksTestHelper extends TestBase { assert(notebooks.nonEmpty) - val maxConcurrency = 10 + val maxConcurrency = 8 val executorService = Executors.newFixedThreadPool(maxConcurrency) implicit val executionContext: ExecutionContext = ExecutionContext.fromExecutor(executorService) From 9dab2d6633cc163adf0a7cb47c543d0bfb77a953 Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:51:00 -0700 Subject: [PATCH 81/83] Update environment.yml --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 477edb9c84..e9361ad5de 100644 --- a/environment.yml +++ b/environment.yml @@ -28,8 
+28,8 @@ dependencies: - ipython - pytest-codeblocks - azure-storage-blob - - twine - jupyter + - twine - mlflow - numpy - torch==2.0.0 From dd6cffa16ba12f9a7a115441b1e609765cd57f85 Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:51:49 -0700 Subject: [PATCH 82/83] Update conda.yml --- templates/conda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/conda.yml b/templates/conda.yml index d89499de97..8c36f95929 100644 --- a/templates/conda.yml +++ b/templates/conda.yml @@ -16,7 +16,7 @@ steps: timeoutInMinutes: 20 retryCountOnTaskFailure: 1 - bash: | - (timeout 30m conda env create --yes -f environment.yml -v) || (timeout 30m conda env create --force -f environment.yml -v) + (timeout 30m conda env create --yes -f environment.yml -v) || (timeout 30m conda env create --yes -f environment.yml -v) displayName: Create Anaconda environment retryCountOnTaskFailure: 1 condition: eq(variables.CONDA_CACHE_RESTORED, 'false') From ebb2976c1cf112efa9e6ebfdc9b71c770c852770 Mon Sep 17 00:00:00 2001 From: Alexander <157773158+bvonodiripsa@users.noreply.github.com> Date: Sun, 20 Oct 2024 13:09:05 -0700 Subject: [PATCH 83/83] Delete docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb --- ...Question - Answering with LLM on GPU.ipynb | 812 ------------------ 1 file changed, 812 deletions(-) delete mode 100644 docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb diff --git a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb b/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb deleted file mode 100644 index bb9575d250..0000000000 --- a/docs/Explore Algorithms/AI Services/QuickStart - Distributed Question - Answering with LLM on GPU.ipynb +++ /dev/null @@ -1,812 +0,0 @@ -{ - "cells": [ 
- { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "6b31dee8-67e3-4bb7-a501-269c69c80d3f", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "# A Guide to Q&A using Retrieval-Augmented Generation (RAG) with distributed local LLM embedding and generation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "b4000620-9ea1-45aa-be4f-ddb971cc708e", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Introduction\n", - "In this notebook, we'll demonstrate how to develop a context-aware question answering framework using distributed local LLM embedding and answer generation using Hugging Face models: [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) and [NV-Embed-v1](https://huggingface.co/bzantium/NV-Embed-v1). This notebook extending document Question and Answering demo to use only local models for scalability and acceleration. Question and Answering contect is based on NASA's Earth and Earth at Night e-books. \n", - "\n", - "We’ll cover the following key stages:\n", - "\n", - "1. Load PDF documents using PyMUPDF library.\n", - "2. Use SynapseML to split the documents into chunks.\n", - "3. Generate chunk and user question embeddings using NV-Embed-V1 embedder\n", - "4. Using NVIDIA Rapids KNN find chunks related to user questions to define context for LLM answers\n", - "5. 
Using LLM Phi-3 from Microsoft and Tensor-RT GPU accelerator answer user questions using provided context\n", - "\n", - "The demo was tested on NVIDIA A100 based Databricks Azure cluster with two workers based on Standard_NC24ads_A100_v4 using 13.3 LTS ML (includes Apache Spark 3.4.1, GPU, Scala 2.12) Databricks Runtime.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "db0faebe-2cca-4bd8-ae28-645e69a21bb7", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Step 1: Define the notebook environment" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "530e6ef4-b620-443e-a051-4164aedc43cd", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import fitz\n", - "import pyspark.sql.functions as F\n", - "from pyspark.sql.types import ArrayType, FloatType, StringType\n", - "from pyspark.sql.functions import (\n", - " explode,\n", - " col,\n", - " monotonically_increasing_id,\n", - " concat_ws,\n", - " collect_list,\n", - ")\n", - "from pyspark.ml.functions import predict_batch_udf\n", - "from sentence_transformers import SentenceTransformer\n", - "from synapse.ml.featurize.text import PageSplitter\n", - "from spark_rapids_ml.knn import (\n", - " ApproximateNearestNeighbors,\n", - " ApproximateNearestNeighborsModel,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "97f056e7-9f88-45b9-b6b2-95be8c7fccac", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 2: Load the documents into a 
Spark DataFrame." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "eb6519d4-f03a-4359-8a6f-4922bfeedbf5", - "showTitle": false, - "title": "" - } - }, - "source": [ - "For this tutorial, we will be using NASA's [Earth](https://www.nasa.gov/sites/default/files/atoms/files/earth_book_2019_tagged.pdf) and [Earth at Night](https://www.nasa.gov/sites/default/files/atoms/files/earth_at_night_508.pdf) e-books. To load PDF documents into a Spark DataFrame, you can use the ```spark.read.format(\"binaryFile\")``` method provided by Apache Spark." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "fb39f605-39f8-46d1-a9d3-28b854586852", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "document_path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/NASAEarth\" # path to your document\n", - "df = spark.read.format(\"binaryFile\").load(document_path).cache()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "34e06daf-e9e7-4144-b956-e57bde8fab77", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 3: Read the document context and convert it from PDF to text using PyMUPDF library." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "304ed77d-a032-4620-a74d-65a277caeaf7", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "We utilize PyMUPDF library (fitz) to do PDF to Text conversion" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "04b58ec9-8a8e-4575-9df9-c8e84c6c4a64", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Define the function to extract text from binary PDF data\n", - "def extract_text_from_binary_pdf(binary_content):\n", - " try:\n", - " # Create a PyMuPDF document from the binary data\n", - " doc = fitz.open(stream=binary_content, filetype=\"pdf\")\n", - " text = \"\"\n", - " for page in doc:\n", - " text += page.get_text()\n", - " return text\n", - " except Exception as e:\n", - " return str(e)\n", - "\n", - "\n", - "# Register the function as a UDF\n", - "extract_text_udf = udf(extract_text_from_binary_pdf, StringType())\n", - "\n", - "\n", - "# Apply the UDF to extract text from the binary content\n", - "analyzed_df = df.withColumn(\"output_content\", extract_text_udf(df[\"content\"]))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "d26e4217-ac87-4583-9500-af65d969c199", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "We can split Spark DataFrame named ```analyzed_df``` in chunks to make book analysed context smaller (3000 - 4000 char) using the following code." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "1b471060-8175-492e-bbb3-5b3529480b33", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "ps = (\n", - " PageSplitter()\n", - " .setInputCol(\"output_content\")\n", - " .setMaximumPageLength(4000)\n", - " .setMinimumPageLength(3000)\n", - " .setOutputCol(\"chunks\")\n", - ")\n", - "\n", - "splitted_df = ps.transform(analyzed_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "d51caf1d-322e-480b-8391-d266aed6401e", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Each column contains many chunks for the same document as a vector.\n", - "# Explode will distribute and replicate the content of a vecor across multple rows\n", - "# Add id column\n", - "\n", - "exploded_df = (\n", - " splitted_df.select(\"path\", explode(col(\"chunks\")).alias(\"chunk\"))\n", - " .select(\"path\", \"chunk\")\n", - " .withColumn(\"id\", monotonically_increasing_id())\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "1e5b0f56-0a64-4e4a-86f2-b647e82b41ce", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 4: Generate Embeddings." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "ebba439c-9503-46d7-bafb-f7fa790974a8", - "showTitle": false, - "title": "" - } - }, - "source": [ - "To produce embeddings for each chunk, we utilize NVIDIA NV-Embed-V1 embedder from Hugging Face" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "1f41cd67-1a27-4e69-959a-e5002b4fbbaf", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Define a function to create the encode_udf with a custom query_prefix\n", - "def create_encode_udf(query_prefix):\n", - " # Define a function to encode text in batches\n", - " # def encode_text_batch(texts):\n", - " def encode_text_batch():\n", - " # Load the model inside the function\n", - " model = SentenceTransformer(\"bzantium/NV-Embed-v1\", trust_remote_code=True)\n", - " model.max_seq_length = 4096\n", - " model.tokenizer.padding_side = \"right\"\n", - "\n", - " def predict(inputs):\n", - "\n", - " output = model.encode(\n", - " inputs.tolist(), prompt=query_prefix, normalize_embeddings=True\n", - " )\n", - " return output\n", - "\n", - " return predict\n", - "\n", - " # # Encode the texts in batch\n", - " # embeddings = model.encode(inputs.tolist(), normalize_embeddings=True)\n", - " # return [embedding.tolist() for embedding in embeddings]\n", - "\n", - " # Define the predict_batch_udf with the above function\n", - " return predict_batch_udf(\n", - " encode_text_batch, return_type=ArrayType(FloatType()), batch_size=1\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - 
"nuid": "defe1c52-1637-4b55-aae5-00174057f1e4", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Use it withhout query_prefix in this case\n", - "query_prefix = \"\"\n", - "encode_udf = create_encode_udf(query_prefix)\n", - "\n", - "# Applying the UDF to a DataFrame chunk column\n", - "embeddings = exploded_df.withColumn(\"embeddings\", encode_udf(col(\"chunk\")))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "183c4c45-03bf-42d0-9c10-24e5fe9842da", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Step 5: Use chunk embeddings to create KNN search model to find chunks related to user query " - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "5d725803-3475-4b97-aebc-24ae909eebbc", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "rapids_knn_model = (\n", - " ApproximateNearestNeighbors(k=2)\n", - " .setInputCol(\"embeddings\")\n", - " .setIdCol(\"id\")\n", - " .fit(embeddings)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "17b3890f-4163-443c-929b-252d62a6c736", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 6: Compose a Question." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "8826a0fb-7b41-47a9-8d65-8885dcb1248d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n", - "\n", - "task_name_to_instruct = {\n", - " \"example\": \"Given a question, retrieve passages from the provided context that answer the question\",\n", - "}\n", - "\n", - "query_prefix = \"Instruct: \" + task_name_to_instruct[\"example\"] + \"\\nQuery: \"\n", - "\n", - "encode_udf = create_encode_udf(query_prefix)\n", - "\n", - "user_question = \"What did the astronaut Edgar Mitchell call Earth?\"\n", - "# Define schema explicitly\n", - "schema = StructType(\n", - " [StructField(\"id\", IntegerType(), True), StructField(\"query\", StringType(), True)]\n", - ")\n", - "\n", - "# Create DataFrame with id = 1 and the user query\n", - "temp_df = spark.createDataFrame([(1, user_question)], schema).cache()\n", - "\n", - "# Apply the UDF to generate the embeddings\n", - "query_embeddings = temp_df.withColumn(\"embeddings\", encode_udf(col(\"query\")))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "45f79485-be0f-4b89-9c11-79f9102436e7", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Step 7: Find chunks with the closest context to the question using embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "45f7b558-4c32-4e08-807b-9e568dcde8df", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ 
- "(_, _, knn_df) = rapids_knn_model.kneighbors(\n", - " query_embeddings.select(\"id\", \"embeddings\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "da023b2f-d7d7-4937-8139-6ec999a77cc6", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Add text to the results\n", - "result_df = (\n", - " knn_df.withColumn(\n", - " \"zipped\", F.explode(F.arrays_zip(F.col(\"indices\"), F.col(\"distances\")))\n", - " )\n", - " .select(\n", - " F.col(\"query_id\"),\n", - " F.col(\"zipped.indices\").alias(\"id\"),\n", - " F.col(\"zipped.distances\").alias(\"distance\"),\n", - " )\n", - " .join(embeddings, on=\"id\", how=\"inner\")\n", - " .select(\"query_id\", \"id\", \"chunk\", \"distance\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "0180ef1c-3d59-4922-b918-80eaf7badd9d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Concatenate all strings in the 'combined_text' column across all question related chunks\n", - "concatenated_text = result_df.agg(\n", - " concat_ws(\" \", collect_list(\"chunk\")).alias(\"concatenated_text\")\n", - ").collect()[0][\"concatenated_text\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "79356cff-a236-4ef3-91f7-a601ee38d5f9", - "showTitle": false, - "title": "" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "### Step 8: Respond to a User’s Question using microsoft/Phi-3-mini-4k-instruct LLM from Hugging Face" - ] - }, - { - "cell_type": 
"code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "0b8b0fda-bca7-4cd1-ae0f-8438ca2cbf3b", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from tensorrt_llm import LLM, SamplingParams, BuildConfig\n", - "\n", - "# Put model in global if we want to reuse it\n", - "global llm\n", - "\n", - "if \"llm\" in globals() and llm is not None:\n", - " print(\"Model is already loaded.\")\n", - "else:\n", - " print(\"Model is not loaded.\")\n", - "\n", - " # Extend model input sizes\n", - " build_config = BuildConfig()\n", - " build_config.plugin_config.context_fmha = True\n", - " build_config.max_input_len = 5120\n", - " build_config.max_seq_len = 5632\n", - "\n", - " llm = LLM(model=\"microsoft/Phi-3-mini-4k-instruct\", build_config=build_config)\n", - "\n", - "sampling_params = SamplingParams(temperature=0.8, top_p=0.95)\n", - "\n", - "context = concatenated_text\n", - "query = \"What did the astronaut Edgar Mitchell call Earth?\"\n", - "\n", - "prompt = f\"\"\"\n", - "context: {context}\n", - "Answer the question based only on the context above. Without multiple choices. If the\n", - "information to answer the question is not present in the given context then reply \"I don't know\".\n", - "My Question: {query}\n", - "What is your Answer? 
\"\"\"\n", - "\n", - "outputs = llm.generate(prompt, sampling_params)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "2c65275c-17dc-4a30-83eb-ee5b6695a540", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Step 9: Print LLM results" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "b7fb3394-289f-4949-835e-3520323a770d", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "output_text = outputs.outputs[0].text\n", - "\n", - "# Split the text by '\\n'\n", - "split_text = output_text.split(\"\\n\")\n", - "\n", - "for item in split_text:\n", - " if len(item) > 10:\n", - " # Split the item at the colon and take the part after it\n", - " result = item.split(\":\", 1)[-1].strip()\n", - " print(\"Answer: \" + result)\n", - " break" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "412d83cc-4fe9-455e-ad3d-7780ed262dac", - "showTitle": false, - "title": "" - } - }, - "source": [ - "We can now wrap up the Q&A journey by asking a question and checking the answer. You will see that Edgar Mitchell called Earth \"a sparkling blue and white jewel\"!" 
- ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "environmentMetadata": { - "base_environment": "", - "client": "1" - }, - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 4 - }, - "notebookName": "QuickStart - Distributed Question - Answering with LLM on GPU", - "widgets": {} - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -}