From 3217d3dab2c1129094f8f124719ac9e06da41ead Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Sun, 26 May 2024 18:58:49 +0800
Subject: [PATCH 01/11] Fix OpenAI Whisper model error
When using the OpenAI Whisper model, an error occurs because the words
field is not an Array and therefore doesn't have a forEach method.
---
client/utils.js | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/client/utils.js b/client/utils.js
index 0cbae17..977ef54 100644
--- a/client/utils.js
+++ b/client/utils.js
@@ -51,7 +51,7 @@ function updateTranscription(transcript_data) {
const transcriptionDiv = document.getElementById('transcription');
const languageDiv = document.getElementById('detected_language');
- if (transcript_data.words && transcript_data.words.length > 0) {
+ if (Array.isArray(transcript_data.words) && transcript_data.words.length > 0) {
// Append words with color based on their probability
transcript_data.words.forEach(wordData => {
const span = document.createElement('span');
@@ -80,6 +80,8 @@ function updateTranscription(transcript_data) {
// Update the language information
if (transcript_data.language && transcript_data.language_probability) {
languageDiv.textContent = transcript_data.language + ' (' + transcript_data.language_probability.toFixed(2) + ')';
+ } else {
+ languageDiv.textContent = 'Not Supported';
}
// Update the processing time, if available
From 8a28ea402639954f538cce13a66d3fed864543ce Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Sun, 26 May 2024 20:28:32 +0800
Subject: [PATCH 02/11] Add new line character for the result from OpenAI
Whisper model
---
client/utils.js | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/client/utils.js b/client/utils.js
index 977ef54..ae8f0b7 100644
--- a/client/utils.js
+++ b/client/utils.js
@@ -74,7 +74,10 @@ function updateTranscription(transcript_data) {
transcriptionDiv.appendChild(document.createElement('br'));
} else {
// Fallback to plain text
- transcriptionDiv.textContent += transcript_data.text + '\n';
+ const span = document.createElement('span');
+ span.textContent = transcript_data.text;
+ transcriptionDiv.appendChild(span);
+ transcriptionDiv.appendChild(document.createElement('br'));
}
// Update the language information
From 0f3430ce5cdde4d025f47d9df7200091dfe6ccca Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Sun, 26 May 2024 22:47:42 +0800
Subject: [PATCH 03/11] Add GPU support for OpenAI Whisper model
---
src/asr/whisper_asr.py | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/src/asr/whisper_asr.py b/src/asr/whisper_asr.py
index 2cb05bf..d0d2057 100644
--- a/src/asr/whisper_asr.py
+++ b/src/asr/whisper_asr.py
@@ -1,4 +1,5 @@
import os
+import torch
from transformers import pipeline
@@ -9,9 +10,12 @@
class WhisperASR(ASRInterface):
def __init__(self, **kwargs):
+ device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = kwargs.get("model_name", "openai/whisper-large-v3")
self.asr_pipeline = pipeline(
- "automatic-speech-recognition", model=model_name
+ "automatic-speech-recognition",
+ model=model_name,
+ device=device,
)
async def transcribe(self, client):
From ca7ae89733b69dc0c4b7d7b6d5d749e6f131f60a Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Sun, 26 May 2024 23:17:57 +0800
Subject: [PATCH 04/11] Add logging support
---
src/main.py | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/src/main.py b/src/main.py
index 0151d1f..fad9c2d 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,6 +1,7 @@
import argparse
import asyncio
import json
+import logging
from src.asr.asr_factory import ASRFactory
from src.vad.vad_factory import VADFactory
@@ -59,12 +60,22 @@ def parse_args():
default=None,
help="The path to the SSL key file if using secure websockets",
)
+ parser.add_argument(
+ "--log-level",
+ type=str,
+ default="error",
+ choices=["debug", "info", "warning", "error"],
+ help="Logging level: debug, info, warning, error. default: error",
+ )
return parser.parse_args()
def main():
args = parse_args()
+ logging.basicConfig()
+ logging.getLogger().setLevel(args.log_level.upper())
+
try:
vad_args = json.loads(args.vad_args)
asr_args = json.loads(args.asr_args)
From e7b9af8369cd6956a70a2cb06ebc9da1e58681e4 Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Tue, 28 May 2024 17:48:37 +0800
Subject: [PATCH 05/11] Fix docstring for save_audio_to_file function
---
src/asr/whisper_asr.py | 2 +-
src/audio_utils.py | 4 +---
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/src/asr/whisper_asr.py b/src/asr/whisper_asr.py
index d0d2057..b472e43 100644
--- a/src/asr/whisper_asr.py
+++ b/src/asr/whisper_asr.py
@@ -1,6 +1,6 @@
import os
-import torch
+import torch
from transformers import pipeline
from src.audio_utils import save_audio_to_file
diff --git a/src/audio_utils.py b/src/audio_utils.py
index 9ee4c7d..f9aa203 100644
--- a/src/audio_utils.py
+++ b/src/audio_utils.py
@@ -8,10 +8,8 @@ async def save_audio_to_file(
"""
Saves the audio data to a file.
- :param client_id: Unique identifier for the client.
:param audio_data: The audio data to save.
- :param file_counters: Dictionary to keep track of file counts for each
- client.
+ :param file_name: The name of the file.
:param audio_dir: Directory where audio files will be saved.
:param audio_format: Format of the audio file.
:return: Path to the saved audio file.
From dc6feed01329776b89007a36edea37b548280329 Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Tue, 28 May 2024 17:50:17 +0800
Subject: [PATCH 06/11] Add audio files directory to .gitignore
---
.gitignore | 3 +++
1 file changed, 3 insertions(+)
diff --git a/.gitignore b/.gitignore
index ad4a1f1..d0a976e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -173,4 +173,7 @@ poetry.toml
# LSP config files
pyrightconfig.json
+# audio files created by development server
+audio_files
+
# End of https://www.toptal.com/developers/gitignore/api/python
From d7b61ce0f10829e6a6c4ef50f1ae49a8197130f9 Mon Sep 17 00:00:00 2001
From: Jinmiao Luo <39730824+jinmiaoluo@users.noreply.github.com>
Date: Thu, 30 May 2024 18:57:25 +0800
Subject: [PATCH 07/11] Refactor JavaScript code
---
client/index.html | 8 +++---
client/utils.js | 64 +++++++++++++++++++++++------------------------
2 files changed, 35 insertions(+), 37 deletions(-)
diff --git a/client/index.html b/client/index.html
index f17860c..7807e86 100644
--- a/client/index.html
+++ b/client/index.html
@@ -75,7 +75,7 @@
display: none;
}
-
+
Transcribe a Web Audio Stream with Huggingface VAD + Whisper
@@ -132,11 +132,11 @@ Transcribe a Web Audio Stream with Huggingface VAD + Whisper
-
+
-