chore: bump speech sdk version (#2107)

* chore: bump speech sdk version
* chore: remove ignored speech tests
* fixes
microsoft · Oct 30, 2023 · c12afc5 · c12afc5
1 parent 1af71ed
commit c12afc5
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 34 deletions.
diff --git a/build.sbt b/build.sbt
@@ -449,7 +449,7 @@ lazy val cognitive = (project in file("cognitive"))
   .dependsOn(core % "test->test;compile->compile")
   .settings(settings ++ Seq(
     libraryDependencies ++= Seq(
-      "com.microsoft.cognitiveservices.speech" % "client-jar-sdk" % "1.14.0",
+      "com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.24.1",
       "org.apache.hadoop" % "hadoop-common" % "3.3.4" % "test",
       "org.apache.hadoop" % "hadoop-azure" % "3.3.4" % "test",
     ),

diff --git a/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/speech/SpeechAPI.scala b/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/speech/SpeechAPI.scala
@@ -4,29 +4,26 @@
 package com.microsoft.azure.synapse.ml.cognitive.speech
 
 import com.microsoft.azure.synapse.ml.core.env.StreamUtilities.using
+import com.microsoft.azure.synapse.ml.io.http.RESTHelpers._
 import org.apache.commons.io.IOUtils
-import org.apache.http.client.methods.{HttpEntityEnclosingRequestBase, RequestBuilder}
-import org.apache.http.entity.mime.content.FileBody
-import org.apache.http.entity.mime.{HttpMultipartMode, MultipartEntityBuilder}
+import org.apache.http.client.methods.{HttpEntityEnclosingRequestBase, HttpPost}
+import org.apache.http.entity.ByteArrayEntity
 import spray.json._
 
 import java.io.File
+import java.nio.file.Files
 
 object SpeechAPI {
 
-  import com.microsoft.azure.synapse.ml.io.http.RESTHelpers._
-
-  def getSpeakerProfile(data: File, key: String): String = {
+  def getSpeakerProfile(data: File, key: String, region: String): String = {
     retry(List(100, 500, 1000), { () => //scalastyle:ignore magic.number
-      val request = RequestBuilder
-        .post("https://signature.eastus.cts.speech.microsoft.com" +
-          "/api/v1/Signature/GenerateVoiceSignatureFromFormData")
-        .setEntity(MultipartEntityBuilder.create()
-          .setMode(HttpMultipartMode.BROWSER_COMPATIBLE)
-          .addPart("file", new FileBody(data))
-          .build())
-        .addHeader("Ocp-Apim-Subscription-Key", key)
-        .build()
+      val httpsURL = s"https://signature.$region.cts.speech.microsoft.com" +
+        s"/api/v1/Signature/GenerateVoiceSignatureFromByteArray"
+      val voiceSampleData = Files.readAllBytes(data.toPath)
+
+      val request = new HttpPost(httpsURL)
+      request.setEntity(new ByteArrayEntity(voiceSampleData))
+      request.addHeader("Ocp-Apim-Subscription-Key", key)
 
       using(Client.execute(request)) { response =>
         if (!response.getStatusLine.getStatusCode.toString.startsWith("2")) {
@@ -45,4 +42,5 @@ object SpeechAPI {
     })
   }
 
+
 }
diff --git a/...tive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/speech/SpeechToTextSDK.scala b/...tive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/speech/SpeechToTextSDK.scala
@@ -531,6 +531,7 @@ class ConversationTranscription(override val uid: String) extends SpeechSDKBase
                        ): Iterator[TranscriptionResponse] = {
     val speechConfig = getSpeechConfig(uri, speechKey, language, profanity, wordLevelTimestamps, format)
     speechConfig.setProperty("ConversationTranscriptionInRoomAndOnline", "true")
+
     speechConfig.setServiceProperty("transcriptionMode",
       "RealTimeAndAsync", ServicePropertyChannel.UriQueryParameter)
 
@@ -544,14 +545,15 @@ class ConversationTranscription(override val uid: String) extends SpeechSDKBase
 
     val pullStream = getPullStream(stream, audioFormat, defaultAudioFormat)
     val audioConfig = AudioConfig.fromStreamInput(pullStream)
+    audioConfig.setProperty("f0f5debc-f8c9-4892-ac4b-90a7ab359fd2", "true")
 
     val transcriber = new ConversationTranscriber(audioConfig)
 
-    // TODO fix this spelling in 1.15 update
-    conversation.getProperties.setProperty("DifferenciateGuestSpeakers", "true")
+    conversation.getProperties.setProperty("DifferentiateGuestSpeakers", "true")
 
     transcriber.joinConversationAsync(conversation).get()
     val connection = Connection.fromRecognizer(transcriber)
+
     connection.setMessageProperty("speech.config", "application",
       s"""{"name":"synapseml", "version": "${BuildInfo.version}"}""")
     val queue = new LinkedBlockingQueue[Option[String]]()

diff --git a/...src/test/scala/com/microsoft/azure/synapse/ml/cognitive/speech/SpeechToTextSDKSuite.scala b/...src/test/scala/com/microsoft/azure/synapse/ml/cognitive/speech/SpeechToTextSDKSuite.scala
@@ -315,6 +315,7 @@ trait TranscriptionSecrets {
     Secrets.ConversationTranscriptionUrl)
   lazy val conversationTranscriptionKey: String = sys.env.getOrElse("CONVERSATION_TRANSCRIPTION_KEY",
     Secrets.ConversationTranscriptionKey)
+  lazy val conversationTranscriptionRegion: String = "centralus"
 }
 
 class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTranscription]
@@ -341,9 +342,11 @@ class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTran
     }
   }
 
-  ignore("dialogue with participants") {
-    val profile1 = SpeechAPI.getSpeakerProfile(audioPaths(4), conversationTranscriptionKey)
-    val profile2 = SpeechAPI.getSpeakerProfile(audioPaths(5), conversationTranscriptionKey)
+  test("dialogue with participants") {
+    val profile1 = SpeechAPI.getSpeakerProfile(
+      audioPaths(4), conversationTranscriptionKey, conversationTranscriptionRegion)
+    val profile2 = SpeechAPI.getSpeakerProfile(
+      audioPaths(5), conversationTranscriptionKey, conversationTranscriptionRegion)
     val fromRow = TranscriptionResponse.makeFromRowConverter
     val speakers = sdk
       .setParticipants(Seq(
@@ -360,9 +363,11 @@ class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTran
     assert(Seq("user1", "user2").forall(speakers.toSet))
   }
 
-  ignore("dialogue with participant col") {
-    val profile1 = SpeechAPI.getSpeakerProfile(audioPaths(4), conversationTranscriptionKey)
-    val profile2 = SpeechAPI.getSpeakerProfile(audioPaths(5), conversationTranscriptionKey)
+  test("dialogue with participant col") {
+    val profile1 = SpeechAPI.getSpeakerProfile(
+      audioPaths(4), conversationTranscriptionKey, conversationTranscriptionRegion)
+    val profile2 = SpeechAPI.getSpeakerProfile(
+      audioPaths(5), conversationTranscriptionKey, conversationTranscriptionRegion)
     val participantDf = Seq(
       (1, Seq(TranscriptionParticipant("user1", "en-US", profile1),
         TranscriptionParticipant("user2", "en-US", profile2))),
@@ -386,7 +391,7 @@ class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTran
     assert(Seq("user1", "user2").forall(speakers.toSet))
   }
 
-  ignore("dialogue without profiles") {
+  test("dialogue without profiles") {
     val fromRow = TranscriptionResponse.makeFromRowConverter
     sdk
       .setFileType("mp3").transform(dialogueDf)
@@ -396,33 +401,33 @@ class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTran
       .foreach(println)
   }
 
-  ignore("Simple SDK Usage Audio 1") {
+  test("Simple SDK Usage Audio 1") {
     dfTest("simple", audioDf1, text1)
   }
 
-  ignore("Simple SDK Usage Audio 2") {
+  test("Simple SDK Usage Audio 2") {
     dfTest("simple", audioDf2, text2)
   }
 
-  ignore("Simple SDK Usage without streaming") {
+  test("Simple SDK Usage without streaming") {
     dfTest("simple", audioDf1, text1, sdk = sdk.setStreamIntermediateResults(false))
   }
 
-  ignore("URI based access") {
+  test("URI based access") {
     val uriDf = Seq(Tuple1(audioPaths(1).toURI.toString))
       .toDF("audio")
     dfTest("simple", uriDf, text2)
   }
 
-  ignore("URL based access") {
+  test("URL based access") {
     tryWithRetries(Array(100, 500)) { () => //For handling flaky build machines
       val uriDf = Seq(Tuple1("https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav"))
         .toDF("audio")
       dfTest("simple", uriDf, text2)
     }
   }
 
-  ignore("SAS URL based access") {
+  test("SAS URL based access") {
     val sasURL = "https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav" +
       "?sv=2019-12-12&st=2021-01-25T16%3A40%3A13Z&se=2024-01-26T16%3A40%3A00Z&sr=b&sp=r" +
       "&sig=NpFm%2FJemAJOGIya1ykQ6f80YdvwpiAuJjnb2RVDtKro%3D"
@@ -434,11 +439,11 @@ class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTran
     }
   }
 
-  ignore("Detailed SDK with mp3 (Linux only)") {
+  test("Detailed SDK with mp3 (Linux only)") {
     dfTest("simple", audioDf3, text3, sdk = sdk.setFileType("mp3"), verbose = true, threshold = .6)
   }
 
-  ignore("m3u8 based access") {
+  test("m3u8 based access") {
     val sdk2 = sdk.setExtraFfmpegArgs(Array("-t", "60"))
       .setLanguage("en-US")
     // 20 seconds of streaming
@@ -451,7 +456,7 @@ class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTran
     }
   }
 
-  ignore("m3u8 file writing") {
+  test("m3u8 file writing") {
     val outputMp3 = new File(savePath, "output.mp3")
     val outputJson = new File(savePath, "output.json")