From 4ae2df7ce8cd0a0c8272308345bcd900ccddadd6 Mon Sep 17 00:00:00 2001 From: Angelo Paparazzi Date: Wed, 15 May 2024 13:35:46 -0500 Subject: [PATCH] feat(stt): remove interimResults and lowLatency wss params --- lib/recognize-stream.ts | 8 ++------ package-lock.json | 34 +++++++++++++++++----------------- speech-to-text/v1-generated.ts | 4 ++-- speech-to-text/v1.ts | 2 -- 4 files changed, 21 insertions(+), 27 deletions(-) diff --git a/lib/recognize-stream.ts b/lib/recognize-stream.ts index 11eee76d96..e8a71e5517 100644 --- a/lib/recognize-stream.ts +++ b/lib/recognize-stream.ts @@ -1,5 +1,5 @@ /** - * (C) Copyright IBM Corp. 2014, 2020. + * (C) Copyright IBM Corp. 2014, 2024. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,7 +62,7 @@ class RecognizeStream extends Duplex { * * Uses WebSockets under the hood. For audio with no recognizable speech, no `data` events are emitted. * - * By default, only finalized text is emitted in the data events, however when `objectMode`/`readableObjectMode` and `interim_results` are enabled, both interim and final results objects are emitted. + * By default, only finalized text is emitted in the data events, however when `objectMode`/`readableObjectMode` is enabled, both interim and final results objects are emitted. * WriteableElementStream uses this, for example, to live-update the DOM with word-by-word transcriptions. * * Note that the WebSocket connection is not established until the first chunk of data is recieved. This allows for auto-detection of content type (for wav/flac/opus audio). @@ -86,7 +86,6 @@ class RecognizeStream extends Duplex { * @param {string} [options.contentType] - The format (MIME type) of the audio * @param {number} [options.customizationWeight] - Tell the service how much weight to give to words from the custom language model compared to those from the base model for the current request * @param {number} [options.inactivityTimeout] - The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed (default=30) - * @param {boolean} [options.interimResults] - If true, the service returns interim results as a stream of JSON SpeechRecognitionResults objects (default=false) * @param {string[]} [options.keywords] - An array of keyword strings to spot in the audio * @param {number} [options.keywordsThreshold] - A confidence value that is the lower bound for spotting a keyword * @param {number} [options.maxAlternatives] - The maximum number of alternative transcripts that the service is to return (default=1) @@ -105,7 +104,6 @@ class RecognizeStream extends Duplex { * @param {boolean} [options.splitTranscriptAtPhraseEnd] - If `true`, directs the service to split the transcript into multiple final results based on semantic features of the input * @param {number} [options.speechDetectorSensitivity] - The sensitivity of speech activity detection that the service is to perform * @param {number} [options.backgroundAudioSuppression] - The level to which the service is to suppress background audio based on its volume to prevent it from being transcribed as speech - * @param {boolean} [params.lowLatency] - If `true` for next-generation `Multimedia` and `Telephony` models that support low latency, directs the service to produce results even more quickly than it usually does * @constructor */ constructor(options: RecognizeStream.Options) { @@ -168,7 +166,6 @@ class RecognizeStream extends Duplex { 'timestamps', 'word_confidence', 'content-type', - 'interim_results', 'keywords', 'keywords_threshold', 'max_alternatives', @@ -182,7 +179,6 @@ class RecognizeStream extends Duplex { 'split_transcript_at_phrase_end', 'speech_detector_sensitivity', 'background_audio_suppression', - 'low_latency', ]; const openingMessage = processUserParameters(options, openingMessageParamsAllowed); openingMessage.action = 'start'; diff --git a/package-lock.json b/package-lock.json index ddb735f930..fa027eb529 100644 --- a/package-lock.json +++ b/package-lock.json @@ -7952,6 +7952,18 @@ "node": ">=8" } }, + "node_modules/jsdoc/node_modules/marked": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/marked/-/marked-4.3.0.tgz", + "integrity": "sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==", + "dev": true, + "bin": { + "marked": "bin/marked.js" + }, + "engines": { + "node": ">= 12" + } + }, "node_modules/jsdoc/node_modules/mkdirp": { "version": "1.0.4", "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-1.0.4.tgz", @@ -8615,15 +8627,15 @@ "dev": true }, "node_modules/marked": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/marked/-/marked-4.3.0.tgz", - "integrity": "sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==", + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/marked/-/marked-2.1.3.tgz", + "integrity": "sha512-/Q+7MGzaETqifOMWYEA7HVMaZb4XbcRfaOzcSsHZEith83KGlvaSG33u0SKu89Mj5h+T8V2hM+8O45Qc5XTgwA==", "dev": true, "bin": { - "marked": "bin/marked.js" + "marked": "bin/marked" }, "engines": { - "node": ">= 12" + "node": ">= 10" } }, "node_modules/marked-terminal": { @@ -13069,18 +13081,6 @@ "node": ">=10" } }, - "node_modules/semantic-release/node_modules/marked": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/marked/-/marked-2.1.3.tgz", - "integrity": "sha512-/Q+7MGzaETqifOMWYEA7HVMaZb4XbcRfaOzcSsHZEith83KGlvaSG33u0SKu89Mj5h+T8V2hM+8O45Qc5XTgwA==", - "dev": true, - "bin": { - "marked": "bin/marked" - }, - "engines": { - "node": ">= 10" - } - }, "node_modules/semantic-release/node_modules/yargs": { "version": "16.2.0", "resolved": "https://registry.npmjs.org/yargs/-/yargs-16.2.0.tgz", diff --git a/speech-to-text/v1-generated.ts b/speech-to-text/v1-generated.ts index 5752f3a281..b8d6cf9474 100644 --- a/speech-to-text/v1-generated.ts +++ b/speech-to-text/v1-generated.ts @@ -6602,13 +6602,13 @@ namespace SpeechToTextV1 { * elements: the word followed by its start and end time in seconds, for example: * `[["hello",0.0,1.2],["world",1.2,2.5]]`. Timestamps are returned only for the best alternative. */ - timestamps?: string[]; + timestamps?: [string, number, number][]; /** A confidence score for each word of the transcript as a list of lists. Each inner list consists of two * elements: the word and its confidence score in the range of 0.0 to 1.0, for example: * `[["hello",0.95],["world",0.86]]`. Confidence scores are returned only for the best alternative and only with * results marked as final. */ - word_confidence?: string[]; + word_confidence?: [string, number][]; } /** Component results for a speech recognition request. */ diff --git a/speech-to-text/v1.ts b/speech-to-text/v1.ts index 380726a585..6fbc167a38 100644 --- a/speech-to-text/v1.ts +++ b/speech-to-text/v1.ts @@ -266,7 +266,6 @@ namespace SpeechToTextV1 { contentType?: string; customizationWeight?: number; inactivityTimeout?: number; - interimResults?: boolean; keywords?: string[]; keywordsThreshold?: number; maxAlternatives?: number; @@ -286,7 +285,6 @@ namespace SpeechToTextV1 { splitTranscriptAtPhraseEnd?: boolean; speechDetectorSensitivity?: number; backgroundAudioSuppression?: number; - lowLatency?: boolean; characterInsertionBias?: number; } }