test: Spark3.3 #2074

Closed
wants to merge 36 commits

Commits (36)
5ed7a23  chore: bump to v0.11.2 (#2011)  (mhamilton723, Jul 10, 2023)
22279bc  chore: no-op PR to avoid double publishing  (mhamilton723, Jul 10, 2023)
7ff4c59  docs: remove pre-commit from docs  (mhamilton723, Jul 12, 2023)
54f0b51  fix: modified the search engine in the demo notebook to bing (#2013)  (sherylZhaoCode, Jul 12, 2023)
2d1ee53  build: bump semver from 5.7.1 to 5.7.2 in /website (#2012)  (dependabot[bot], Jul 13, 2023)
e17756e  docs: add prerequisites - openai and cognitive services resources (#2…  (JessicaXYWang, Jul 13, 2023)
c2ce58e  docs: update notebooks - bring back fabric reviewers changes. (#1979)  (JessicaXYWang, Jul 17, 2023)
9e1da76  docs: fix docker link (#2019)  (niehaus59, Jul 18, 2023)
19f898c  docs: Refactor docs and docgen framework (#2021)  (mhamilton723, Jul 19, 2023)
272003b  chore: bump databricks e2e timeout (#2024)  (mhamilton723, Jul 20, 2023)
37d474a  docs: add dead link checker (#2022)  (mhamilton723, Jul 20, 2023)
6c9dfb3  docs: fix broken links (#2025)  (mhamilton723, Jul 20, 2023)
8d56053  docs: continue fixing broken links (#2026)  (mhamilton723, Jul 20, 2023)
3285331  docs: fix broken links (#2027)  (mhamilton723, Jul 20, 2023)
c5294c6  docs: fix broken link (#2032)  (mhamilton723, Jul 24, 2023)
8196541  docs: add QandA notebook. (#2029)  (aydan-at-microsoft, Jul 24, 2023)
bab2bde  build: bump actions/checkout from 2 to 3 (#2030)  (dependabot[bot], Jul 24, 2023)
875b635  chore: remove build exclusions from pipeline.yaml  (mhamilton723, Jul 24, 2023)
6f6ef6e  chore: remove exclusions from pipeline.yml  (mhamilton723, Jul 24, 2023)
e8d865a  docs: fix variable formatting for QandA nb (#2033)  (aydan-at-microsoft, Jul 24, 2023)
8e94d1a  fix: Fix ONNX link (#2035)  (iemejia, Jul 27, 2023)
b009871  fix: Improve LGBM exception and logging (#2037)  (svotaw, Aug 2, 2023)
4cd6a3a  docs: fix broken links (#2042)  (JessicaXYWang, Aug 4, 2023)
be48e85  docs: initial POC of Jessica's fabric doc generator (#2023)  (mhamilton723, Aug 4, 2023)
a0b4ee4  docs: fix small error in docgen docs  (mhamilton723, Aug 7, 2023)
6563189  fix: improve docgen (#2043)  (eisber, Aug 7, 2023)
01640dd  docs: add badges to readme  (mhamilton723, Aug 8, 2023)
ee656e6  feat: Support langchain transformer on fabric (#2036)  (lhrotk, Aug 10, 2023)
250e895  chore: remove secret scanner (#2048)  (mhamilton723, Aug 11, 2023)
1f18559  Fix problem with empty partition assigned to validation data (#2059)  (svotaw, Aug 31, 2023)
d3fe930  chore: fix daily midnight build chronjob  (mhamilton723, Sep 5, 2023)
d4c6028  fix: fixed broken link to developer readme (#2049)  (BrendanWalsh, Sep 5, 2023)
3e2d380  build: bump actions/checkout from 3 to 4 (#2065)  (dependabot[bot], Sep 5, 2023)
b935a54  feat: add Azure Cognitive Search vector store (#2041)  (aydan-at-microsoft, Sep 7, 2023)
212828e  fix: updated gpt-review to version 0.9.5 to fix break (#2069)  (BrendanWalsh, Sep 11, 2023)
cc29c90  chore: fix some of the failing build issues (#2071)  (mhamilton723, Sep 13, 2023)
2 changes: 1 addition & 1 deletion .acrolinx-config.edn
@@ -1,2 +1,2 @@
{:allowed-branchname-matches ["master" "release-.*"]
:allowed-filename-matches ["notebooks" "website"]}
:allowed-filename-matches ["docs" "website"]}
26 changes: 26 additions & 0 deletions .github/workflows/check-dead-links.yml
@@ -0,0 +1,26 @@
name: "Check Dead Links"

on:
workflow_dispatch:
push:
branches: [ "master" ]
pull_request:
# The branches below must be a subset of the branches above
branches: [ "master" ]

jobs:
scan_links:
name: Scan Website for Dead Links
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y wget
- name: Scan for dead links
run: |
wget --spider --recursive --no-verbose --tries=3 --retry-connrefused --no-clobber --directory-prefix=site-check https://microsoft.github.io/SynapseML/
2 changes: 1 addition & 1 deletion .github/workflows/clean-acr.yml
@@ -31,7 +31,7 @@ jobs:
with:
creds: ${{ secrets.clean_acr }}
- name: checkout repo content
uses: actions/checkout@v3 # checkout the repo
uses: actions/checkout@v4 # checkout the repo
- name: setup python
uses: actions/setup-python@v4
with:
2 changes: 1 addition & 1 deletion .github/workflows/codeql.yml
@@ -42,7 +42,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4

# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
2 changes: 1 addition & 1 deletion .github/workflows/on-pull-request-target-review.yml
@@ -10,7 +10,7 @@ jobs:
name: Azure OpenAI PR Comment
steps:
- id: review
uses: microsoft/gpt-review@0.9.4
uses: microsoft/gpt-review@0.9.5
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
AZURE_OPENAI_API: ${{ secrets.AZURE_OPENAI_API }}
2 changes: 1 addition & 1 deletion .github/workflows/scorecards.yml
@@ -32,7 +32,7 @@ jobs:

steps:
- name: "Checkout code"
uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0
uses: actions/checkout@v4 # v3.1.0
with:
persist-credentials: false

1 change: 1 addition & 0 deletions .gitignore
@@ -86,3 +86,4 @@ metastore_db/
**/build/*
**/dist/*
**/*.egg-info/*

4 changes: 2 additions & 2 deletions CONTRIBUTING.md
@@ -45,14 +45,14 @@ this process:

#### Implement tests

- Set up build environment using the [developer guide](https://microsoft.github.io/SynapseML/docs/reference/developer-readme/)
- Set up build environment using the [developer guide](https://microsoft.github.io/SynapseML/docs/Reference/Developer%20Setup/)
- Test your code locally.
- Add tests using ScalaTests — unit tests are required.
- A sample notebook is required as an end-to-end test.

#### Implement documentation

- Add a [sample Jupyter notebook](notebooks/) that shows the intended use
- Add a [sample Jupyter notebook](docs/) that shows the intended use
case of your algorithm, with instructions in step-by-step manner. (The same
notebook could be used for testing the code.)
- Add in-line ScalaDoc comments to your source code, to generate the [API
53 changes: 27 additions & 26 deletions README.md

Large diffs are not rendered by default.

13 changes: 7 additions & 6 deletions build.sbt
@@ -25,6 +25,7 @@ val coreDependencies = Seq(
"org.apache.spark" %% "spark-mllib" % sparkVersion % "compile",
"org.apache.spark" %% "spark-avro" % sparkVersion % "provided",
"org.apache.spark" %% "spark-tags" % sparkVersion % "test",
"com.globalmentor" % "hadoop-bare-naked-local-fs" % "0.1.0" % "test",
"org.scalatest" %% "scalatest" % "3.2.14" % "test")
val extraDependencies = Seq(
"org.scalactic" %% "scalactic" % "3.2.14",
@@ -220,7 +221,7 @@ publishDotnetBase := {
packDotnetAssemblyCmd(join(dotnetBaseDir, "target").getAbsolutePath, dotnetBaseDir)
val packagePath = join(dotnetBaseDir,
// Update the version whenever there's a new release
"target", s"SynapseML.DotnetBase.${dotnetedVersion("0.11.1")}.nupkg").getAbsolutePath
"target", s"SynapseML.DotnetBase.${dotnetedVersion("0.11.2")}.nupkg").getAbsolutePath
publishDotnetAssemblyCmd(packagePath, genSleetConfig.value)
}

@@ -381,11 +382,11 @@ publishBadges := {
uploadBadge("master version", version.value, "blue", "master_version3.svg")
}

val uploadNotebooks = TaskKey[Unit]("uploadNotebooks", "upload notebooks to blob storage")
val uploadNotebooks = TaskKey[Unit]("uploadNotebooks", "upload docs to blob storage")
uploadNotebooks := {
val localNotebooksFolder = join(baseDirectory.value.toString, "notebooks").toString
val localNotebooksFolder = join(baseDirectory.value.toString, "docs").toString
val blobNotebooksFolder = version.value
uploadToBlob(localNotebooksFolder, blobNotebooksFolder, "notebooks")
uploadToBlob(localNotebooksFolder, blobNotebooksFolder, "docs")
}

val settings = Seq(
@@ -493,8 +494,8 @@ setupTask := {

val convertNotebooks = TaskKey[Unit]("convertNotebooks", "convert notebooks to markdown for website display")
convertNotebooks := {
runCmdStr("python -m docs.python.documentprojection " +
"--customchannels docs/python/synapseml_channels -c website . docs/manifest.yaml -p")
runCmd(Seq("pip", "install", "-e", "."), wd=join(baseDirectory.value, "tools/docgen"))
runCmd(Seq("python", "__main__.py"), wd=join(baseDirectory.value, "tools/docgen/docgen"))
}

val testWebsiteDocs = TaskKey[Unit]("testWebsiteDocs",
Original file line number Diff line number Diff line change
@@ -44,6 +44,7 @@
)
from pyspark.sql.functions import udf
from typing import cast, Optional, TypeVar, Type
from synapse.ml.core.platform import running_on_synapse_internal

OPENAI_API_VERSION = "2022-12-01"
RL = TypeVar("RL", bound="MLReadable")
@@ -125,6 +126,14 @@ def __init__(
self.subscriptionKey = Param(self, "subscriptionKey", "openai api key")
self.url = Param(self, "url", "openai api base")
self.apiVersion = Param(self, "apiVersion", "openai api version")
self.running_on_synapse_internal = running_on_synapse_internal()
if running_on_synapse_internal():
from synapse.ml.fabric.service_discovery import get_fabric_env_config

self._setDefault(
url=get_fabric_env_config().fabric_env_config.ml_workload_endpoint
+ "cognitive/openai"
)
kwargs = self._input_kwargs
if subscriptionKey:
kwargs["subscriptionKey"] = subscriptionKey
@@ -196,10 +205,15 @@ def _transform(self, dataset):
def udfFunction(x):
import openai

openai.api_type = "azure"
openai.api_key = self.getSubscriptionKey()
openai.api_base = self.getUrl()
openai.api_version = self.getApiVersion()
if self.running_on_synapse_internal and not self.isSet(self.url):
from synapse.ml.fabric.prerun.openai_prerun import OpenAIPrerun

OpenAIPrerun(api_base=self.getUrl()).init_personalized_session(None)
else:
openai.api_type = "azure"
openai.api_key = self.getSubscriptionKey()
openai.api_base = self.getUrl()
openai.api_version = self.getApiVersion()
return self.getChain().run(x)

outCol = self.getOutputCol()
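The hunks above make the LangChain transformer Fabric-aware: when `running_on_synapse_internal()` is true and no `url` is set, the OpenAI base defaults to the workload's `cognitive/openai` endpoint and the call runs through a personalized session instead of an API key. A rough usage sketch in PySpark follows; the import path and the `chain`/`inputCol`/`outputCol` parameter names are assumptions (only `subscriptionKey`, `url`, and `apiVersion` are visible in this diff), and the resource names are placeholders.

# Minimal sketch, not code from this PR. Assumed: the import path and the
# chain/inputCol/outputCol parameter names. Outside Fabric you supply the key
# and endpoint explicitly; on Fabric the defaults added above kick in.
from langchain.chains import LLMChain
from langchain.llms import AzureOpenAI
from langchain.prompts import PromptTemplate
from synapse.ml.cognitive.langchain import LangchainTransformer  # assumed import path

llm = AzureOpenAI(deployment_name="text-davinci-003")  # hypothetical deployment
chain = LLMChain(llm=llm, prompt=PromptTemplate.from_template("Summarize: {text}"))

summarizer = LangchainTransformer(
    chain=chain,
    inputCol="text",
    outputCol="summary",
    subscriptionKey="<AZURE_OPENAI_KEY>",        # can be omitted on Fabric
    url="https://<resource>.openai.azure.com/",  # can be omitted on Fabric
)
summaries = summarizer.transform(df)  # df: Spark DataFrame with a "text" column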
Original file line number Diff line number Diff line change
@@ -18,6 +18,8 @@ import org.apache.spark.internal.{Logging => SLogging}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.ml.{ComplexParamsReadable, NamespaceInjections, PipelineModel}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.functions.vector_to_array
import org.apache.spark.sql.functions.{col, expr, struct, to_json}
import org.apache.spark.sql.streaming.DataStreamWriter
import org.apache.spark.sql.types._
@@ -142,7 +144,7 @@ class AddDocuments(override val uid: String) extends CognitiveServicesBase(uid)
override def responseDataType: DataType = ASResponses.schema
}

object AzureSearchWriter extends IndexParser with SLogging {
object AzureSearchWriter extends IndexParser with IndexJsonGetter with SLogging {

val Logger: Logger = LogManager.getRootLogger

@@ -166,9 +168,11 @@ object AzureSearchWriter extends IndexParser with SLogging {
private def convertFields(fields: Seq[StructField],
keyCol: String,
searchActionCol: String,
vectorCols: Option[Seq[VectorColParams]],
prefix: Option[String]): Seq[IndexField] = {
fields.filterNot(_.name == searchActionCol).map { sf =>
val fullName = prefix.map(_ + sf.name).getOrElse(sf.name)
val isVector = vectorCols.exists(_.exists(_.name == fullName))
val (innerType, _) = sparkTypeToEdmType(sf.dataType)
IndexField(
sf.name,
@@ -177,31 +181,44 @@
if (keyCol == fullName) Some(true) else None,
None, None, None, None,
structFieldToSearchFields(sf.dataType,
keyCol, searchActionCol, prefix = Some(prefix.getOrElse("") + sf.name + "."))
keyCol, searchActionCol, None, prefix = Some(prefix.getOrElse("") + sf.name + ".")),
if (isVector) vectorCols.get.find(_.name == fullName).map(_.dimension) else None,
if (isVector) Some(AzureSearchAPIConstants.VectorConfigName) else None
)
}
}

private def structFieldToSearchFields(schema: DataType,
keyCol: String,
searchActionCol: String,
vectorCols: Option[Seq[VectorColParams]],
prefix: Option[String] = None
): Option[Seq[IndexField]] = {
schema match {
case StructType(fields) => Some(convertFields(fields, keyCol, searchActionCol, prefix))
case ArrayType(StructType(fields), _) => Some(convertFields(fields, keyCol, searchActionCol, prefix))
case StructType(fields) => Some(convertFields(fields, keyCol, searchActionCol, vectorCols, prefix))
// TODO: Support vector search in nested fields
case ArrayType(StructType(fields), _) => Some(convertFields(fields, keyCol, searchActionCol, None, prefix))
case _ => None
}
}

private def parseVectorColsJson(str: String): Seq[VectorColParams] = {
str.parseJson.convertTo[Seq[VectorColParams]]
}

private def dfToIndexJson(schema: StructType,
indexName: String,
keyCol: String,
searchActionCol: String): String = {
searchActionCol: String,
vectorCols: Option[Seq[VectorColParams]]): String = {

val vectorConfig = Some(VectorSearch(Seq(AlgorithmConfigs(AzureSearchAPIConstants.VectorConfigName,
AzureSearchAPIConstants.VectorSearchAlgorithm))))
val is = IndexInfo(
Some(indexName),
structFieldToSearchFields(schema, keyCol, searchActionCol).get,
None, None, None, None, None, None, None, None
structFieldToSearchFields(schema, keyCol, searchActionCol, vectorCols).get,
None, None, None, None, None, None, None, None,
if (vectorCols.isEmpty) None else vectorConfig
)
is.toJson.compactPrint
}
@@ -210,7 +227,7 @@ object AzureSearchWriter extends IndexParser with SLogging {
options: Map[String, String] = Map()): DataFrame = {
val applicableOptions = Set(
"subscriptionKey", "actionCol", "serviceName", "indexName", "indexJson",
"apiVersion", "batchSize", "fatalErrors", "filterNulls", "keyCol"
"apiVersion", "batchSize", "fatalErrors", "filterNulls", "keyCol", "vectorCols"
)

options.keys.foreach(k =>
@@ -224,11 +241,12 @@ object AzureSearchWriter extends IndexParser with SLogging {
val batchSize = options.getOrElse("batchSize", "100").toInt
val fatalErrors = options.getOrElse("fatalErrors", "true").toBoolean
val filterNulls = options.getOrElse("filterNulls", "false").toBoolean
val vectorColsInfo = options.get("vectorCols")

val keyCol = options.get("keyCol")
val indexName = options.getOrElse("indexName", parseIndexJson(indexJsonOpt.get).name.get)
if (indexJsonOpt.isDefined) {
List("keyCol", "indexName").foreach(opt =>
List("keyCol", "indexName", "vectorCols").foreach(opt =>
assert(!options.contains(opt), s"Cannot set both indexJson options and $opt")
)
}
@@ -242,22 +260,41 @@ object AzureSearchWriter extends IndexParser with SLogging {
}
}

val indexJson = indexJsonOpt.getOrElse {
dfToIndexJson(df.schema, indexName, keyCol.get, actionCol)
val (indexJson, preppedDF) = if (getExisting(subscriptionKey, serviceName, apiVersion).contains(indexName)) {
if (indexJsonOpt.isDefined) {
println(f"indexJsonOpt is specified, however an index for $indexName already exists," +
f"we will use the index definition obtained from the existing index instead")
}
val existingIndexJson = getIndexJsonFromExistingIndex(subscriptionKey, serviceName, indexName)
val vectorColNameTypeTuple = getVectorColConf(existingIndexJson)
(existingIndexJson, makeColsCompatible(vectorColNameTypeTuple, df))
} else if (indexJsonOpt.isDefined) {
val vectorColNameTypeTuple = getVectorColConf(indexJsonOpt.get)
(indexJsonOpt.get, makeColsCompatible(vectorColNameTypeTuple, df))
} else {
val vectorCols = vectorColsInfo.map(parseVectorColsJson)
val vectorColNameTypeTuple = vectorCols.map(_.map(vc => (vc.name, "Collection(Edm.Single)"))).getOrElse(Seq.empty)
val newDF = makeColsCompatible(vectorColNameTypeTuple, df)
val inferredIndexJson = dfToIndexJson(newDF.schema, indexName, keyCol.getOrElse(""), actionCol, vectorCols)
(inferredIndexJson, newDF)
}

// TODO: Support vector search in nested fields
// Throws an exception if any nested field is a vector in the schema
parseIndexJson(indexJson).fields.foreach(_.fields.foreach(assertNoNestedVectors))

SearchIndex.createIfNoneExists(subscriptionKey, serviceName, indexJson, apiVersion)

logInfo("checking schema parity")
checkSchemaParity(df.schema, indexJson, actionCol)
checkSchemaParity(preppedDF.schema, indexJson, actionCol)

val df1 = if (filterNulls) {
val collectionColumns = parseIndexJson(indexJson).fields
.filter(_.`type`.startsWith("Collection"))
.map(_.name)
collectionColumns.foldLeft(df) { (ndf, c) => filterOutNulls(ndf, c) }
collectionColumns.foldLeft(preppedDF) { (ndf, c) => filterOutNulls(ndf, c) }
} else {
df
preppedDF
}

new AddDocuments()
@@ -273,6 +310,48 @@ object AzureSearchWriter extends IndexParser with SLogging {
UDFUtils.oldUdf(checkForErrors(fatalErrors) _, ErrorUtils.ErrorSchema)(col("error"), col("input")))
}

private def assertNoNestedVectors(fields: Seq[IndexField]): Unit = {
def checkVectorField(field: IndexField): Unit = {
if (field.dimensions.nonEmpty && field.vectorSearchConfiguration.nonEmpty) {
throw new IllegalArgumentException(s"Nested field ${field.name} is a vector field, vector fields in nested" +
s" fields are not supported.")
}
field.fields.foreach(_.foreach(checkVectorField))
}
fields.foreach(checkVectorField)
}

private def getVectorColConf(indexJson: String): Seq[(String, String)] = {
parseIndexJson(indexJson).fields
.filter(f => f.vectorSearchConfiguration.nonEmpty && f.dimensions.nonEmpty)
.map(f => (f.name, f.`type`))
}
private def makeColsCompatible(vectorColNameTypeTuple: Seq[(String, String)],
df: DataFrame): DataFrame = {
vectorColNameTypeTuple.foldLeft(df) { case (accDF, (colName, colType)) =>
if (!accDF.columns.contains(colName)) {
println(s"Column $colName is specified in either indexJson or vectorCols but not found in dataframe " +
s"columns ${accDF.columns.toList}")
accDF
}
else {
val colDataType = accDF.schema(colName).dataType
assert(colDataType match {
case ArrayType(elementType, _) => elementType == FloatType || elementType == DoubleType
case VectorType => true
case _ => false
}, s"Vector column $colName needs to be one of (ArrayType(FloatType), ArrayType(DoubleType), VectorType)")
if (colDataType.isInstanceOf[ArrayType]) {
accDF.withColumn(colName, accDF(colName).cast(edmTypeToSparkType(colType, None)))
} else {
// first cast vectorUDT to array<double>, then cast it to correct array type
val modifiedDF = accDF.withColumn(colName, vector_to_array(accDF(colName)))
modifiedDF.withColumn(colName, modifiedDF(colName).cast(edmTypeToSparkType(colType, None)))
}
}
}
}
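`makeColsCompatible` accepts a vector column either as a Spark ML vector (`VectorType`) or as an array of floats/doubles and casts it to the `Collection(Edm.Single)` shape the index expects, using `vector_to_array` for the ML-vector case. The same conversion can be done up front from PySpark; the snippet below is an illustrative sketch only, not code from this PR.

# Illustrative: pre-convert an ML VectorUDT column to array<float>, mirroring
# what makeColsCompatible does internally with vector_to_array plus a cast.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.functions import vector_to_array

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("doc1", Vectors.dense([0.1, 0.2, 0.3]))],
    ["id", "embedding"],
)
# dtype="float32" yields array<float>, which lines up with Edm.Single fields.
df = df.withColumn("embedding", vector_to_array("embedding", dtype="float32"))
df.printSchema()  # id: string, embedding: array<float>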

private def isEdmCollection(t: String): Boolean = {
t.startsWith("Collection(") && t.endsWith(")")
}
Expand All @@ -290,6 +369,7 @@ object AzureSearchWriter extends IndexParser with SLogging {
case "Edm.Int64" => LongType
case "Edm.Int32" => IntegerType
case "Edm.Double" => DoubleType
case "Edm.Single" => FloatType
case "Edm.DateTimeOffset" => StringType //See if there's a way to use spark datetimes
case "Edm.GeographyPoint" => StringType
case "Edm.ComplexType" => StructType(fields.get.map(f =>
@@ -310,10 +390,12 @@ object AzureSearchWriter extends IndexParser with SLogging {
case IntegerType => ("Edm.Int32", None)
case LongType => ("Edm.Int64", None)
case DoubleType => ("Edm.Double", None)
case FloatType => ("Edm.Single", None)
case DateType => ("Edm.DateTimeOffset", None)
case StructType(fields) => ("Edm.ComplexType", Some(fields.map { f =>
val (innerType, innerFields) = sparkTypeToEdmType(f.dataType)
IndexField(f.name, innerType, None, None, None, None, None, None, None, None, None, None, innerFields)
IndexField(f.name, innerType, None, None, None, None, None, None, None, None, None, None, innerFields,
None, None) // TODO: Support vector search in nested fields
}))
}
}
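Taken together, these AzureSearch changes let the writer create and populate a vector index: the new `vectorCols` option lists embedding columns with their dimensions, the inferred index JSON marks them as `Collection(Edm.Single)` fields tied to the default vector-search configuration, and an already-existing index is reused as-is. Below is a hedged end-to-end sketch in PySpark; the option names come from the `applicableOptions` set above, but `writeToAzureSearch` as the DataFrame-level entry point and the wildcard import are assumptions about the Python API, and the service name and key are placeholders.

# Sketch only: write a DataFrame with an "embedding" vector column to Azure
# Cognitive Search as a vector index. vectorCols follows the VectorColParams
# shape from the diff above: a JSON list of {"name", "dimension"} objects.
import json
from pyspark.sql.functions import lit
from synapse.ml.cognitive import *  # assumed to attach writeToAzureSearch to DataFrame

(
    df.withColumn("searchAction", lit("upload"))
      .writeToAzureSearch(
          subscriptionKey="<AZURE_SEARCH_ADMIN_KEY>",
          serviceName="<search-service>",
          indexName="my-vector-index",
          keyCol="id",
          actionCol="searchAction",
          vectorCols=json.dumps([{"name": "embedding", "dimension": 3}]),
      )
)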