From 3217d3dab2c1129094f8f124719ac9e06da41ead Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Sun, 26 May 2024 18:58:49 +0800
Subject: [PATCH 01/11] Fix OpenAI Whisper model error

When using the OpenAI Whisper model, an error occurs because the words
field is not an Array and therefore has no forEach method.
---
 client/utils.js | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/client/utils.js b/client/utils.js
index 0cbae17..977ef54 100644
--- a/client/utils.js
+++ b/client/utils.js
@@ -51,7 +51,7 @@ function updateTranscription(transcript_data) {
     const transcriptionDiv = document.getElementById('transcription');
     const languageDiv = document.getElementById('detected_language');
 
-    if (transcript_data.words && transcript_data.words.length > 0) {
+    if (Array.isArray(transcript_data.words) && transcript_data.words.length > 0) {
         // Append words with color based on their probability
         transcript_data.words.forEach(wordData => {
             const span = document.createElement('span');
@@ -80,6 +80,8 @@ function updateTranscription(transcript_data) {
     // Update the language information
     if (transcript_data.language && transcript_data.language_probability) {
         languageDiv.textContent = transcript_data.language + ' (' + transcript_data.language_probability.toFixed(2) + ')';
+    } else {
+        languageDiv.textContent = 'Not Supported';
    }
 
     // Update the processing time, if available
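Aside: the two ASR backends return differently shaped results, which is why
the Array.isArray guard above matters. A sketch of the shapes the client code
assumes (field names are inferred from the rendering code in this series, not
an authoritative schema):

    // Result with word-level detail, handled by the forEach branch.
    const wordLevelResult = {
        words: [{ word: 'hello', probability: 0.93 }],  // per-word confidence
        language: 'en',
        language_probability: 0.98,
        processing_time: 0.42
    };

    // OpenAI Whisper (transformers pipeline) result: plain text and no
    // words array, which is what made the forEach call throw.
    const pipelineResult = { text: 'hello world' };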
From 8a28ea402639954f538cce13a66d3fed864543ce Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Sun, 26 May 2024 20:28:32 +0800
Subject: [PATCH 02/11] Add a line break after the result from the OpenAI
 Whisper model
---
 client/utils.js | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/client/utils.js b/client/utils.js
index 977ef54..ae8f0b7 100644
--- a/client/utils.js
+++ b/client/utils.js
@@ -74,7 +74,10 @@ function updateTranscription(transcript_data) {
         transcriptionDiv.appendChild(document.createElement('br'));
     } else {
         // Fallback to plain text
-        transcriptionDiv.textContent += transcript_data.text + '\n';
+        const span = document.createElement('span');
+        span.textContent = transcript_data.text;
+        transcriptionDiv.appendChild(span);
+        transcriptionDiv.appendChild(document.createElement('br'));
     }
 
     // Update the language information

From 0f3430ce5cdde4d025f47d9df7200091dfe6ccca Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Sun, 26 May 2024 22:47:42 +0800
Subject: [PATCH 03/11] Add GPU support for OpenAI Whisper model
---
 src/asr/whisper_asr.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/asr/whisper_asr.py b/src/asr/whisper_asr.py
index 2cb05bf..d0d2057 100644
--- a/src/asr/whisper_asr.py
+++ b/src/asr/whisper_asr.py
@@ -1,4 +1,5 @@
 import os
+import torch
 
 from transformers import pipeline
 
@@ -9,9 +10,12 @@ class WhisperASR(ASRInterface):
 
     def __init__(self, **kwargs):
+        device = "cuda" if torch.cuda.is_available() else "cpu"
         model_name = kwargs.get("model_name", "openai/whisper-large-v3")
         self.asr_pipeline = pipeline(
-            "automatic-speech-recognition", model=model_name
+            "automatic-speech-recognition",
+            model=model_name,
+            device=device,
         )
 
     async def transcribe(self, client):

From ca7ae89733b69dc0c4b7d7b6d5d749e6f131f60a Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Sun, 26 May 2024 23:17:57 +0800
Subject: [PATCH 04/11] Add logging support
---
 src/main.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/main.py b/src/main.py
index 0151d1f..fad9c2d 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,6 +1,7 @@
 import argparse
 import asyncio
 import json
+import logging
 
 from src.asr.asr_factory import ASRFactory
 from src.vad.vad_factory import VADFactory
@@ -59,12 +60,22 @@ def parse_args():
         default=None,
         help="The path to the SSL key file if using secure websockets",
     )
+    parser.add_argument(
+        "--log-level",
+        type=str,
+        default="error",
+        choices=["debug", "info", "warning", "error"],
+        help="Logging level: debug, info, warning, error. default: error",
+    )
     return parser.parse_args()
 
 
 def main():
     args = parse_args()
 
+    logging.basicConfig()
+    logging.getLogger().setLevel(args.log_level.upper())
+
     try:
         vad_args = json.loads(args.vad_args)
         asr_args = json.loads(args.asr_args)

From e7b9af8369cd6956a70a2cb06ebc9da1e58681e4 Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Tue, 28 May 2024 17:48:37 +0800
Subject: [PATCH 05/11] Fix docstring for the save_audio_to_file function
---
 src/asr/whisper_asr.py | 2 +-
 src/audio_utils.py     | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/asr/whisper_asr.py b/src/asr/whisper_asr.py
index d0d2057..b472e43 100644
--- a/src/asr/whisper_asr.py
+++ b/src/asr/whisper_asr.py
@@ -1,6 +1,6 @@
 import os
-import torch
 
+import torch
 from transformers import pipeline
 
 from src.audio_utils import save_audio_to_file
diff --git a/src/audio_utils.py b/src/audio_utils.py
index 9ee4c7d..f9aa203 100644
--- a/src/audio_utils.py
+++ b/src/audio_utils.py
@@ -8,10 +8,8 @@ async def save_audio_to_file(
     """
     Saves the audio data to a file.
 
-    :param client_id: Unique identifier for the client.
     :param audio_data: The audio data to save.
-    :param file_counters: Dictionary to keep track of file counts for each
-        client.
+    :param file_name: The name of the file.
     :param audio_dir: Directory where audio files will be saved.
     :param audio_format: Format of the audio file.
     :return: Path to the saved audio file.

From dc6feed01329776b89007a36edea37b548280329 Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Tue, 28 May 2024 17:50:17 +0800
Subject: [PATCH 06/11] Add audio files directory to .gitignore
---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index ad4a1f1..d0a976e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -173,4 +173,7 @@ poetry.toml
 # LSP config files
 pyrightconfig.json
 
+# audio files created by development server
+audio_files
+
 # End of https://www.toptal.com/developers/gitignore/api/python

From d7b61ce0f10829e6a6c4ef50f1ae49a8197130f9 Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Thu, 30 May 2024 18:57:25 +0800
Subject: [PATCH 07/11] Refactor JavaScript code
---
 client/index.html |  8 +++---
 client/utils.js   | 64 +++++++++++++++++++++++-------------------
 2 files changed, 35 insertions(+), 37 deletions(-)

diff --git a/client/index.html b/client/index.html
index f17860c..7807e86 100644
--- a/client/index.html
+++ b/client/index.html
@@ -75,7 +75,7 @@
         display: none;
     }
 
[markup-only change lost in extraction; the source preserves only the diff
markers "- +"]
 
 Transcribe a Web Audio Stream with Huggingface VAD + Whisper
 
@@ -132,11 +132,11 @@
 
 Transcribe a Web Audio Stream with Huggingface VAD + Whisper
 
[markup-only changes lost in extraction; the source preserves only the diff
markers "- + - -"]
 
diff --git a/client/utils.js b/client/utils.js
index ae8f0b7..986da93 100644
--- a/client/utils.js
+++ b/client/utils.js
@@ -10,17 +10,24 @@ let context;
 let processor;
 let globalStream;
 let language;
-
-const bufferSize = 4096;
 let isRecording = false;
 
-function initWebSocket() {
-    const websocketAddress = document.getElementById('websocketAddress');
-    const selectedLanguage = document.getElementById('languageSelect');
-    const websocketStatus = document.getElementById('webSocketStatus');
-    const startButton = document.getElementById('startButton');
-    const stopButton = document.getElementById('stopButton');
-
+const bufferSize = 4096;
+const websocketAddress = document.querySelector('#websocketAddress');
+const selectedLanguage = document.querySelector('#languageSelect');
+const websocketStatus = document.querySelector('#webSocketStatus');
+const connectButton = document.querySelector("#connectButton");
+const startButton = document.querySelector('#startButton');
+const stopButton = document.querySelector('#stopButton');
+const transcriptionDiv = document.querySelector('#transcription');
+const languageDiv = document.querySelector('#detected_language');
+const processingTimeDiv = document.querySelector('#processing_time');
+const panel = document.querySelector('#silence_at_end_of_chunk_options_panel');
+const selectedStrategy = document.querySelector('#bufferingStrategySelect');
+const chunk_length_seconds = document.querySelector('#chunk_length_seconds');
+const chunk_offset_seconds = document.querySelector('#chunk_offset_seconds');
+
+connectButton.addEventListener("click", () => {
     language = selectedLanguage.value !== 'multilingual' ? selectedLanguage.value : null;
 
     if (!websocketAddress.value) {
@@ -45,12 +52,9 @@ function initWebSocket() {
         const transcript_data = JSON.parse(event.data);
         updateTranscription(transcript_data);
     };
-}
+})
 
 function updateTranscription(transcript_data) {
-    const transcriptionDiv = document.getElementById('transcription');
-    const languageDiv = document.getElementById('detected_language');
-
     if (Array.isArray(transcript_data.words) && transcript_data.words.length > 0) {
         // Append words with color based on their probability
         transcript_data.words.forEach(wordData => {
@@ -88,21 +92,18 @@ function updateTranscription(transcript_data) {
     }
 
     // Update the processing time, if available
-    const processingTimeDiv = document.getElementById('processing_time');
     if (transcript_data.processing_time) {
         processingTimeDiv.textContent = 'Processing time: ' + transcript_data.processing_time.toFixed(2) + ' seconds';
     }
 }
 
-
-function startRecording() {
+startButton.addEventListener("click", () => {
     if (isRecording) return;
     isRecording = true;
 
-    const AudioContext = window.AudioContext || window.webkitAudioContext;
     context = new AudioContext();
 
-    navigator.mediaDevices.getUserMedia({audio: true}).then(stream => {
+    navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
         globalStream = stream;
         const input = context.createMediaStreamSource(stream);
         processor = context.createScriptProcessor(bufferSize, 1, 1);
@@ -115,11 +116,11 @@ function startRecording() {
     }).catch(error => console.error('Error accessing microphone', error));
 
     // Disable start button and enable stop button
-    document.getElementById('startButton').disabled = true;
-    document.getElementById('stopButton').disabled = false;
-}
+    startButton.disabled = true;
+    stopButton.disabled = false;
+})
 
-function stopRecording() {
+stopButton.addEventListener("click", () => {
     if (!isRecording) return;
     isRecording = false;
@@ -133,18 +134,17 @@ function stopRecording() {
     if (context) {
         context.close().then(() => context = null);
     }
-    document.getElementById('startButton').disabled = false;
-    document.getElementById('stopButton').disabled = true;
-}
+    startButton.disabled = false;
+    stopButton.disabled = true;
+})
 
 function sendAudioConfig() {
-    let selectedStrategy = document.getElementById('bufferingStrategySelect').value;
     let processingArgs = {};
 
-    if (selectedStrategy === 'silence_at_end_of_chunk') {
+    if (selectedStrategy.value === 'silence_at_end_of_chunk') {
         processingArgs = {
-            chunk_length_seconds: parseFloat(document.getElementById('chunk_length_seconds').value),
-            chunk_offset_seconds: parseFloat(document.getElementById('chunk_offset_seconds').value)
+            chunk_length_seconds: parseFloat(chunk_length_seconds.value),
+            chunk_offset_seconds: parseFloat(chunk_offset_seconds.value)
         };
     }
 
@@ -155,7 +155,7 @@ function sendAudioConfig() {
             bufferSize: bufferSize,
             channels: 1, // Assuming mono channel
             language: language,
-            processing_strategy: selectedStrategy,
+            processing_strategy: selectedStrategy.value,
             processing_args: processingArgs
         }
     };
@@ -212,9 +212,7 @@ function convertFloat32ToInt16(buffer) {
 // window.onload = initWebSocket;
 
 function toggleBufferingStrategyPanel() {
-    let selectedStrategy = document.getElementById('bufferingStrategySelect').value;
-    let panel = document.getElementById('silence_at_end_of_chunk_options_panel');
-    if (selectedStrategy === 'silence_at_end_of_chunk') {
+    if (selectedStrategy.value === 'silence_at_end_of_chunk') {
         panel.classList.remove('hidden');
     } else {
         panel.classList.add('hidden');
From 368b78c46d67c4130a4acf841b575b5da742e2de Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Thu, 30 May 2024 20:11:31 +0800
Subject: [PATCH 08/11] Replace ScriptProcessorNode with AudioWorkletNode

The ScriptProcessorNode interface is deprecated and should be replaced
with AudioWorkletNode. An AudioWorkletNode is created with a matching
AudioWorkletProcessor. The AudioWorkletProcessor receives audio data
from the source node and uses a message port to send the data back to
the corresponding AudioWorkletNode. Each piece of audio data is a
Float32Array containing 128 elements.
---
 client/realtime-audio-processor.js | 13 +++++
 client/utils.js                    | 77 +++++++++++++++++++-----------
 2 files changed, 62 insertions(+), 28 deletions(-)
 create mode 100644 client/realtime-audio-processor.js

diff --git a/client/realtime-audio-processor.js b/client/realtime-audio-processor.js
new file mode 100644
index 0000000..b925913
--- /dev/null
+++ b/client/realtime-audio-processor.js
@@ -0,0 +1,13 @@
+class RealtimeAudioProcessor extends AudioWorkletProcessor {
+    constructor(options) {
+        super();
+    }
+
+    process(inputs, outputs, params) {
+        // ASR and VAD models typically require mono audio.
+        this.port.postMessage(inputs[0][0]);
+        return true;
+    }
+}
+
+registerProcessor('realtime-audio-processor', RealtimeAudioProcessor);
diff --git a/client/utils.js b/client/utils.js
index 986da93..92d09d4 100644
--- a/client/utils.js
+++ b/client/utils.js
@@ -12,7 +12,6 @@ let globalStream;
 let language;
 let isRecording = false;
 
-const bufferSize = 4096;
 const websocketAddress = document.querySelector('#websocketAddress');
 const selectedLanguage = document.querySelector('#languageSelect');
 const websocketStatus = document.querySelector('#webSocketStatus');
@@ -52,7 +51,7 @@ connectButton.addEventListener("click", () => {
         const transcript_data = JSON.parse(event.data);
         updateTranscription(transcript_data);
     };
-})
+});
 
 function updateTranscription(transcript_data) {
     if (Array.isArray(transcript_data.words) && transcript_data.words.length > 0) {
@@ -102,23 +101,41 @@ startButton.addEventListener("click", () => {
     isRecording = true;
 
     context = new AudioContext();
-
-    navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
+    let onSuccess = async (stream) => {
         globalStream = stream;
         const input = context.createMediaStreamSource(stream);
-        processor = context.createScriptProcessor(bufferSize, 1, 1);
-        processor.onaudioprocess = e => processAudio(e);
-
-        // chain up the audio graph
-        input.connect(processor).connect(context.destination);
-
+        const recordingNode = await setupRecordingWorkletNode();
+        recordingNode.port.onmessage = (event) => {
+            processAudio(event.data);
+        };
+        input.connect(recordingNode);
         sendAudioConfig();
-    }).catch(error => console.error('Error accessing microphone', error));
+    };
+    let onError = (error) => {
+        console.error(error);
+    };
+    navigator.mediaDevices.getUserMedia({
+        audio: {
+            echoCancellation: true,
+            autoGainControl: false,
+            noiseSuppression: true,
+            latency: 0
+        }
+    }).then(onSuccess, onError);
 
     // Disable start button and enable stop button
     startButton.disabled = true;
     stopButton.disabled = false;
-})
+});
+
+async function setupRecordingWorkletNode() {
+    await context.audioWorklet.addModule('realtime-audio-processor.js');
+
+    return new AudioWorkletNode(
+        context,
+        'realtime-audio-processor'
+    );
+}
 
 stopButton.addEventListener("click", () => {
     if (!isRecording) return;
@@ -136,7 +153,7 @@ stopButton.addEventListener("click", () => {
     }
     startButton.disabled = false;
     stopButton.disabled = true;
-})
+});
 
 function sendAudioConfig() {
     let processingArgs = {};
@@ -152,7 +169,6 @@ function sendAudioConfig() {
         type: 'config',
         data: {
             sampleRate: context.sampleRate,
-            bufferSize: bufferSize,
             channels: 1, // Assuming mono channel
             language: language,
             processing_strategy: selectedStrategy.value,
@@ -163,7 +179,25 @@ function sendAudioConfig() {
     websocket.send(JSON.stringify(audioConfig));
 }
 
-function downsampleBuffer(buffer, inputSampleRate, outputSampleRate) {
+function processAudio(sampleData) {
+    const inputSampleRate = context.sampleRate;
+    const outputSampleRate = 16000;
+
+    // ASR (Automatic Speech Recognition) and VAD (Voice Activity Detection)
+    // models typically require mono audio with a sampling rate of 16 kHz,
+    // represented as a signed int16 array type.
+    //
+    // Downsampling in JavaScript on the client reduces computational
+    // costs on the server.
+    const decreaseResultBuffer = decreaseSampleRate(sampleData, inputSampleRate, outputSampleRate);
+    const audioData = convertFloat32ToInt16(decreaseResultBuffer);
+
+    if (websocket && websocket.readyState === WebSocket.OPEN) {
+        websocket.send(audioData);
+    }
+}
+
+function decreaseSampleRate(buffer, inputSampleRate, outputSampleRate) {
     if (inputSampleRate === outputSampleRate) {
         return buffer;
     }
@@ -186,19 +220,6 @@ function downsampleBuffer(buffer, inputSampleRate, outputSampleRate) {
     return result;
 }
 
-function processAudio(e) {
-    const inputSampleRate = context.sampleRate;
-    const outputSampleRate = 16000; // Target sample rate
-
-    const left = e.inputBuffer.getChannelData(0);
-    const downsampledBuffer = downsampleBuffer(left, inputSampleRate, outputSampleRate);
-    const audioData = convertFloat32ToInt16(downsampledBuffer);
-
-    if (websocket && websocket.readyState === WebSocket.OPEN) {
-        websocket.send(audioData);
-    }
-}
-
 function convertFloat32ToInt16(buffer) {
     let l = buffer.length;
     const buf = new Int16Array(l);
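Aside: process() runs once per 128-sample render quantum, so at a 48 kHz
capture rate the page receives a tiny Float32Array about 375 times per second
(48000 / 128). If that per-message overhead ever matters, the processor could
batch quanta before posting. A hypothetical variation, not part of this
series, with invented names and an arbitrary batch size:

    class BatchingAudioProcessor extends AudioWorkletProcessor {
        constructor() {
            super();
            this.quanta = [];     // collected render quanta
            this.batchSize = 16;  // 16 * 128 = 2048 samples per message
        }

        process(inputs, outputs, params) {
            const channel = inputs[0][0];
            if (!channel) return true; // no input connected yet

            // Copy the quantum: the engine reuses the input buffers.
            this.quanta.push(new Float32Array(channel));

            if (this.quanta.length >= this.batchSize) {
                const merged = new Float32Array(this.quanta.length * channel.length);
                this.quanta.forEach((q, i) => merged.set(q, i * q.length));
                this.port.postMessage(merged);
                this.quanta = [];
            }
            return true;
        }
    }

    registerProcessor('batching-audio-processor', BatchingAudioProcessor);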
From b7e3ea7865167c5b66463b8f41fd38b9040e8e50 Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Sun, 2 Jun 2024 10:57:04 +0800
Subject: [PATCH 09/11] Update UI interaction logic

Once the user has connected to the WebSocket server using the address
from the input element, the connectButton should be disabled. When the
user edits the WebSocket address, the connectButton should be
re-enabled; any recording in progress should be stopped, and any
established connection should be closed.

Pressing 'Enter' in the input element should trigger the 'connect
WebSocket' operation for smoother interaction.
---
 client/utils.js | 39 +++++++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/client/utils.js b/client/utils.js
index 92d09d4..a19915a 100644
--- a/client/utils.js
+++ b/client/utils.js
@@ -26,7 +26,28 @@ const selectedStrategy = document.querySelector('#bufferingStrategySelect');
 const chunk_length_seconds = document.querySelector('#chunk_length_seconds');
 const chunk_offset_seconds = document.querySelector('#chunk_offset_seconds');
 
-connectButton.addEventListener("click", () => {
+websocketAddress.addEventListener("input", resetWebsocketHandler);
+
+websocketAddress.addEventListener("keydown", (event) => {
+    if (event.key === 'Enter') {
+        event.preventDefault();
+        connectWebsocketHandler();
+    }
+});
+
+connectButton.addEventListener("click", connectWebsocketHandler);
+
+function resetWebsocketHandler() {
+    if (isRecording) {
+        stopRecordingHandler();
+    }
+    if (websocket && websocket.readyState === WebSocket.OPEN) {
+        websocket.close();
+    }
+    connectButton.disabled = false;
+}
+
+function connectWebsocketHandler() {
     language = selectedLanguage.value !== 'multilingual' ? selectedLanguage.value : null;
 
     if (!websocketAddress.value) {
@@ -39,19 +60,21 @@ connectButton.addEventListener("click", () => {
         console.log("WebSocket connection established");
         websocketStatus.textContent = 'Connected';
         startButton.disabled = false;
+        connectButton.disabled = true;
     };
     websocket.onclose = event => {
         console.log("WebSocket connection closed", event);
         websocketStatus.textContent = 'Not Connected';
         startButton.disabled = true;
         stopButton.disabled = true;
+        connectButton.disabled = false;
     };
     websocket.onmessage = event => {
         console.log("Message from server:", event.data);
         const transcript_data = JSON.parse(event.data);
         updateTranscription(transcript_data);
     };
-});
+}
 
 function updateTranscription(transcript_data) {
     if (Array.isArray(transcript_data.words) && transcript_data.words.length > 0) {
@@ -96,7 +119,9 @@ function updateTranscription(transcript_data) {
     }
 }
 
-startButton.addEventListener("click", () => {
+startButton.addEventListener("click", startRecordingHandler);
+
+function startRecordingHandler() {
     if (isRecording) return;
     isRecording = true;
 
@@ -126,7 +151,7 @@ startButton.addEventListener("click", () => {
     // Disable start button and enable stop button
     startButton.disabled = true;
     stopButton.disabled = false;
-});
+}
 
 async function setupRecordingWorkletNode() {
     await context.audioWorklet.addModule('realtime-audio-processor.js');
@@ -137,7 +162,9 @@ async function setupRecordingWorkletNode() {
     );
 }
 
-stopButton.addEventListener("click", () => {
+stopButton.addEventListener("click", stopRecordingHandler);
+
+function stopRecordingHandler() {
     if (!isRecording) return;
     isRecording = false;
 
@@ -153,7 +180,7 @@ stopButton.addEventListener("click", () => {
     }
     startButton.disabled = false;
     stopButton.disabled = true;
-});
+}
From 723e46f186a669f5ea3ae3d6e33b29cdc8ef8b8d Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Sun, 2 Jun 2024 12:53:22 +0800
Subject: [PATCH 10/11] Send the audio configuration before any audio data

We should push the user config to the server before handling any audio
data. Because WebSocket messages on one connection arrive in order,
sending the config message first is sufficient.
---
 client/utils.js | 11 ++++++-----
 src/server.py   |  2 ++
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/client/utils.js b/client/utils.js
index a19915a..a038e3f 100644
--- a/client/utils.js
+++ b/client/utils.js
@@ -9,7 +9,6 @@ let websocket;
 let context;
 let processor;
 let globalStream;
-let language;
 let isRecording = false;
 
 const websocketAddress = document.querySelector('#websocketAddress');
@@ -48,8 +47,6 @@ function resetWebsocketHandler() {
 }
 
 function connectWebsocketHandler() {
-    language = selectedLanguage.value !== 'multilingual' ? selectedLanguage.value : null;
-
     if (!websocketAddress.value) {
         console.log("WebSocket address is required.");
         return;
@@ -126,7 +123,12 @@ function startRecordingHandler() {
     isRecording = true;
 
     context = new AudioContext();
+
     let onSuccess = async (stream) => {
+        // Push the user config to the server
+        let language = selectedLanguage.value !== 'multilingual' ? selectedLanguage.value : null;
+        sendAudioConfig(language);
+
         globalStream = stream;
         const input = context.createMediaStreamSource(stream);
         const recordingNode = await setupRecordingWorkletNode();
@@ -134,7 +136,6 @@ function startRecordingHandler() {
             processAudio(event.data);
         };
         input.connect(recordingNode);
-        sendAudioConfig();
     };
     let onError = (error) => {
         console.error(error);
@@ -182,7 +183,7 @@ function stopRecordingHandler() {
     stopButton.disabled = true;
 }
 
-function sendAudioConfig() {
+function sendAudioConfig(language) {
     let processingArgs = {};
 
     if (selectedStrategy.value === 'silence_at_end_of_chunk') {
diff --git a/src/server.py b/src/server.py
index 00932df..809c7ad 100644
--- a/src/server.py
+++ b/src/server.py
@@ -1,4 +1,5 @@
 import json
+import logging
 import ssl
 import uuid
 
@@ -57,6 +58,7 @@ async def handle_audio(self, client, websocket):
                 config = json.loads(message)
                 if config.get("type") == "config":
                     client.update_config(config["data"])
+                    logging.debug(f"Updated config: {client.config}")
                     continue
                 else:
                     print(f"Unexpected message type from {client.client_id}")
From ae74f7d767fc717bbf25bfbc9b22dd3b6ffb842e Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Sun, 2 Jun 2024 18:53:21 +0800
Subject: [PATCH 11/11] Refactor the decreaseSampleRate function

Use `Math.ceil()` instead of `Math.round()` to ensure there is enough
space for all samples, and return the buffer unchanged when the input
is already at the target rate.
---
 client/utils.js | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/client/utils.js b/client/utils.js
index a038e3f..812adc3 100644
--- a/client/utils.js
+++ b/client/utils.js
@@ -197,7 +197,7 @@ function sendAudioConfig(language) {
         type: 'config',
         data: {
             sampleRate: context.sampleRate,
-            channels: 1, // Assuming mono channel
+            channels: 1,
             language: language,
             processing_strategy: selectedStrategy.value,
             processing_args: processingArgs
@@ -208,16 +208,14 @@ function sendAudioConfig(language) {
 }
 
 function processAudio(sampleData) {
-    const inputSampleRate = context.sampleRate;
-    const outputSampleRate = 16000;
-
     // ASR (Automatic Speech Recognition) and VAD (Voice Activity Detection)
     // models typically require mono audio with a sampling rate of 16 kHz,
     // represented as a signed int16 array type.
     //
     // Downsampling in JavaScript on the client reduces computational
     // costs on the server.
-    const decreaseResultBuffer = decreaseSampleRate(sampleData, inputSampleRate, outputSampleRate);
+    const outputSampleRate = 16000;
+    const decreaseResultBuffer = decreaseSampleRate(sampleData, context.sampleRate, outputSampleRate);
     const audioData = convertFloat32ToInt16(decreaseResultBuffer);
 
     if (websocket && websocket.readyState === WebSocket.OPEN) {
@@ -226,11 +224,15 @@ function processAudio(sampleData) {
 }
 
 function decreaseSampleRate(buffer, inputSampleRate, outputSampleRate) {
-    if (inputSampleRate === outputSampleRate) {
-        return buffer;
+    if (inputSampleRate < outputSampleRate) {
+        console.error("Sample rate too small.");
+        return;
+    } else if (inputSampleRate === outputSampleRate) {
+        return buffer;
     }
+
     let sampleRateRatio = inputSampleRate / outputSampleRate;
-    let newLength = Math.round(buffer.length / sampleRateRatio);
+    let newLength = Math.ceil(buffer.length / sampleRateRatio);
     let result = new Float32Array(newLength);
     let offsetResult = 0;
     let offsetBuffer = 0;
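Aside: a worked example of the sizing problem this last patch fixes, assuming
a 44.1 kHz capture rate (a common browser default; the real value comes from
context.sampleRate) and one 128-sample worklet quantum:

    const ratio = 44100 / 16000;  // 2.75625
    Math.round(128 / ratio);      // 46
    Math.ceil(128 / ratio);       // 47

As the commit message says, the resampling loop needs room for every sample
it writes, up to ceil(buffer.length / ratio) elements, 47 here. With
Math.round() the result buffer holds only 46, the final write lands out of
bounds, and Float32Array silently discards out-of-bounds writes, so a sample
would be dropped from the chunk.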