Skip to content

Commit

Permalink
chore: bump speech sdk version (#2107)
Browse files: browse the repository at this point in the history
* chore: bump speech sdk version

* chore: remove ignored speech tests

* fixes
  • Loading branch information
mhamilton723 authored Oct 30, 2023
1 parent 1af71ed commit c12afc5
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 34 deletions.
2 changes: 1 addition & 1 deletion build.sbt
Original file line number | Diff line number | Diff line change
Expand Up @@ -449,7 +449,7 @@ lazy val cognitive = (project in file("cognitive"))
.dependsOn(core % "test->test;compile->compile")
.settings(settings ++ Seq(
libraryDependencies ++= Seq(
"com.microsoft.cognitiveservices.speech" % "client-jar-sdk" % "1.14.0",
"com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.24.1",
"org.apache.hadoop" % "hadoop-common" % "3.3.4" % "test",
"org.apache.hadoop" % "hadoop-azure" % "3.3.4" % "test",
),
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,29 +4,26 @@
package com.microsoft.azure.synapse.ml.cognitive.speech

import com.microsoft.azure.synapse.ml.core.env.StreamUtilities.using
import com.microsoft.azure.synapse.ml.io.http.RESTHelpers._
import org.apache.commons.io.IOUtils
import org.apache.http.client.methods.{HttpEntityEnclosingRequestBase, RequestBuilder}
import org.apache.http.entity.mime.content.FileBody
import org.apache.http.entity.mime.{HttpMultipartMode, MultipartEntityBuilder}
import org.apache.http.client.methods.{HttpEntityEnclosingRequestBase, HttpPost}
import org.apache.http.entity.ByteArrayEntity
import spray.json._

import java.io.File
import java.nio.file.Files

object SpeechAPI {

import com.microsoft.azure.synapse.ml.io.http.RESTHelpers._

def getSpeakerProfile(data: File, key: String): String = {
def getSpeakerProfile(data: File, key: String, region: String): String = {
retry(List(100, 500, 1000), { () => //scalastyle:ignore magic.number
val request = RequestBuilder
.post("https://signature.eastus.cts.speech.microsoft.com" +
"/api/v1/Signature/GenerateVoiceSignatureFromFormData")
.setEntity(MultipartEntityBuilder.create()
.setMode(HttpMultipartMode.BROWSER_COMPATIBLE)
.addPart("file", new FileBody(data))
.build())
.addHeader("Ocp-Apim-Subscription-Key", key)
.build()
val httpsURL = s"https://signature.$region.cts.speech.microsoft.com" +
s"/api/v1/Signature/GenerateVoiceSignatureFromByteArray"
val voiceSampleData = Files.readAllBytes(data.toPath)

val request = new HttpPost(httpsURL)
request.setEntity(new ByteArrayEntity(voiceSampleData))
request.addHeader("Ocp-Apim-Subscription-Key", key)

using(Client.execute(request)) { response =>
if (!response.getStatusLine.getStatusCode.toString.startsWith("2")) {
Expand All @@ -45,4 +42,5 @@ object SpeechAPI {
})
}


}
Original file line number | Diff line number | Diff line change
Expand Up @@ -531,6 +531,7 @@ class ConversationTranscription(override val uid: String) extends SpeechSDKBase
): Iterator[TranscriptionResponse] = {
val speechConfig = getSpeechConfig(uri, speechKey, language, profanity, wordLevelTimestamps, format)
speechConfig.setProperty("ConversationTranscriptionInRoomAndOnline", "true")

speechConfig.setServiceProperty("transcriptionMode",
"RealTimeAndAsync", ServicePropertyChannel.UriQueryParameter)

Expand All @@ -544,14 +545,15 @@ class ConversationTranscription(override val uid: String) extends SpeechSDKBase

val pullStream = getPullStream(stream, audioFormat, defaultAudioFormat)
val audioConfig = AudioConfig.fromStreamInput(pullStream)
audioConfig.setProperty("f0f5debc-f8c9-4892-ac4b-90a7ab359fd2", "true")

val transcriber = new ConversationTranscriber(audioConfig)

// TODO fix this spelling in 1.15 update
conversation.getProperties.setProperty("DifferenciateGuestSpeakers", "true")
conversation.getProperties.setProperty("DifferentiateGuestSpeakers", "true")

transcriber.joinConversationAsync(conversation).get()
val connection = Connection.fromRecognizer(transcriber)

connection.setMessageProperty("speech.config", "application",
s"""{"name":"synapseml", "version": "${BuildInfo.version}"}""")
val queue = new LinkedBlockingQueue[Option[String]]()
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -315,6 +315,7 @@ trait TranscriptionSecrets {
Secrets.ConversationTranscriptionUrl)
lazy val conversationTranscriptionKey: String = sys.env.getOrElse("CONVERSATION_TRANSCRIPTION_KEY",
Secrets.ConversationTranscriptionKey)
lazy val conversationTranscriptionRegion: String = "centralus"
}

class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTranscription]
Expand All @@ -341,9 +342,11 @@ class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTran
}
}

ignore("dialogue with participants") {
val profile1 = SpeechAPI.getSpeakerProfile(audioPaths(4), conversationTranscriptionKey)
val profile2 = SpeechAPI.getSpeakerProfile(audioPaths(5), conversationTranscriptionKey)
test("dialogue with participants") {
val profile1 = SpeechAPI.getSpeakerProfile(
audioPaths(4), conversationTranscriptionKey, conversationTranscriptionRegion)
val profile2 = SpeechAPI.getSpeakerProfile(
audioPaths(5), conversationTranscriptionKey, conversationTranscriptionRegion)
val fromRow = TranscriptionResponse.makeFromRowConverter
val speakers = sdk
.setParticipants(Seq(
Expand All @@ -360,9 +363,11 @@ class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTran
assert(Seq("user1", "user2").forall(speakers.toSet))
}

ignore("dialogue with participant col") {
val profile1 = SpeechAPI.getSpeakerProfile(audioPaths(4), conversationTranscriptionKey)
val profile2 = SpeechAPI.getSpeakerProfile(audioPaths(5), conversationTranscriptionKey)
test("dialogue with participant col") {
val profile1 = SpeechAPI.getSpeakerProfile(
audioPaths(4), conversationTranscriptionKey, conversationTranscriptionRegion)
val profile2 = SpeechAPI.getSpeakerProfile(
audioPaths(5), conversationTranscriptionKey, conversationTranscriptionRegion)
val participantDf = Seq(
(1, Seq(TranscriptionParticipant("user1", "en-US", profile1),
TranscriptionParticipant("user2", "en-US", profile2))),
Expand All @@ -386,7 +391,7 @@ class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTran
assert(Seq("user1", "user2").forall(speakers.toSet))
}

ignore("dialogue without profiles") {
test("dialogue without profiles") {
val fromRow = TranscriptionResponse.makeFromRowConverter
sdk
.setFileType("mp3").transform(dialogueDf)
Expand All @@ -396,33 +401,33 @@ class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTran
.foreach(println)
}

ignore("Simple SDK Usage Audio 1") {
test("Simple SDK Usage Audio 1") {
dfTest("simple", audioDf1, text1)
}

ignore("Simple SDK Usage Audio 2") {
test("Simple SDK Usage Audio 2") {
dfTest("simple", audioDf2, text2)
}

ignore("Simple SDK Usage without streaming") {
test("Simple SDK Usage without streaming") {
dfTest("simple", audioDf1, text1, sdk = sdk.setStreamIntermediateResults(false))
}

ignore("URI based access") {
test("URI based access") {
val uriDf = Seq(Tuple1(audioPaths(1).toURI.toString))
.toDF("audio")
dfTest("simple", uriDf, text2)
}

ignore("URL based access") {
test("URL based access") {
tryWithRetries(Array(100, 500)) { () => //For handling flaky build machines
val uriDf = Seq(Tuple1("https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav"))
.toDF("audio")
dfTest("simple", uriDf, text2)
}
}

ignore("SAS URL based access") {
test("SAS URL based access") {
val sasURL = "https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav" +
"?sv=2019-12-12&st=2021-01-25T16%3A40%3A13Z&se=2024-01-26T16%3A40%3A00Z&sr=b&sp=r" +
"&sig=NpFm%2FJemAJOGIya1ykQ6f80YdvwpiAuJjnb2RVDtKro%3D"
Expand All @@ -434,11 +439,11 @@ class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTran
}
}

ignore("Detailed SDK with mp3 (Linux only)") {
test("Detailed SDK with mp3 (Linux only)") {
dfTest("simple", audioDf3, text3, sdk = sdk.setFileType("mp3"), verbose = true, threshold = .6)
}

ignore("m3u8 based access") {
test("m3u8 based access") {
val sdk2 = sdk.setExtraFfmpegArgs(Array("-t", "60"))
.setLanguage("en-US")
// 20 seconds of streaming
Expand All @@ -451,7 +456,7 @@ class ConversationTranscriptionSuite extends TransformerFuzzing[ConversationTran
}
}

ignore("m3u8 file writing") {
test("m3u8 file writing") {
val outputMp3 = new File(savePath, "output.mp3")
val outputJson = new File(savePath, "output.json")

Expand Down

0 comments on commit c12afc5

Please sign in to comment.