Replace dual_channel by multi_channel process (#248)
* replace dual_channel by multi_channel process

* remove unused functions
Thomas Chaigneau authored Sep 12, 2023
1 parent 1581607 commit b117d53
Showing 14 changed files with 195 additions and 236 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -176,7 +176,7 @@ filepath = "/path/to/audio/file.wav" # or any other convertible format by ffmpeg
data = {
"num_speakers": -1, # # Leave at -1 to guess the number of speakers
"diarization": True, # Longer processing time but speaker segment attribution
"dual_channel": False, # Only for stereo audio files with one speaker per channel
"multi_channel": False, # Only for stereo audio files with one speaker per channel
"source_lang": "en", # optional, default is "en"
"timestamps": "s", # optional, default is "s". Can be "s", "ms" or "hms".
"word_timestamps": False, # optional, default is False
notebooks/local_audio_inference.py (11 changes: 6 additions & 5 deletions)
@@ -4,16 +4,17 @@

# filepath = "data/short_one_speaker.mp3"
# filepath = "data/24118946.mp3"
filepath = "data/HL_Podcast_1.mp3"
# filepath = "data/HL_Podcast_1.mp3"
# filepath = "data/1693323170.139915.139915&delay=10.mp3"
filepath = "data/2414612_1692939445.333949_2023_08_25_045726_0.mp3"

data = {
"offset_start": 23,
"offset_end": 186.5,
"offset_start": None,
"offset_end": None,
"num_speakers": -1, # Leave at -1 to guess the number of speakers
"diarization": True, # Longer processing time but speaker segment attribution
"dual_channel": False, # Only for stereo audio files with one speaker per channel
"source_lang": "en", # optional, default is "en"
"multi_channel": True, # Only for stereo audio files with one speaker per channel
"source_lang": "th", # optional, default is "en"
"timestamps": "s", # optional, default is "s". Can be "s", "ms" or "hms".
"word_timestamps": False, # optional, default is False
"internal_vad": False,
src/wordcab_transcribe/automate.py (20 changes: 10 additions & 10 deletions)
@@ -94,7 +94,7 @@ def run_audio_url(
timestamps: str = "s",
word_timestamps: bool = False,
diarization: bool = False,
dual_channel: bool = False,
multi_channel: bool = False,
server_url: Optional[str] = None,
vocab: Optional[List[str]] = None,
timeout: int = 900,
@@ -108,7 +108,7 @@
timestamps: time unit of the timestamps (defaulted to seconds)
word_timestamps: associated words and their timestamps (defaulted to False)
diarization: speaker labels for utterances (defaulted to False)
dual_channel: defaulted to False
multi_channel: defaulted to False
server_url: the URL used to reach the API
vocab: defaulted to empty list
timeout: defaulted to 900 seconds (15 minutes)
@@ -125,7 +125,7 @@
"source_lang": source_lang,
"timestamps": timestamps,
"word_timestamps": word_timestamps,
"dual_channel": dual_channel,
"multi_channel": multi_channel,
}
if vocab:
data["vocab"] = vocab
@@ -163,7 +163,7 @@ def run_api_audio_file(
timestamps: str = "s",
word_timestamps: bool = False,
diarization: bool = False,
dual_channel: bool = False,
multi_channel: bool = False,
server_url: Optional[str] = None,
vocab: Optional[List[str]] = None,
timeout: int = 900,
@@ -177,7 +177,7 @@
timestamps: time unit of the timestamps (defaulted to seconds)
word_timestamps: associated words and their timestamps (defaulted to False)
diarization: speaker labels for utterances (defaulted to False)
dual_channel: defaulted to False
multi_channel: defaulted to False
server_url: the URL used to reach the API
vocab: defaulted to empty list
timeout: defaulted to 900 seconds (15 minutes)
@@ -190,7 +190,7 @@
"source_lang": source_lang,
"timestamps": timestamps,
"word_timestamps": word_timestamps,
"dual_channel": dual_channel,
"multi_channel": multi_channel,
}
if vocab:
data["vocab"] = vocab
@@ -230,7 +230,7 @@ def run_api(
timestamps: str = "s",
word_timestamps: bool = False,
diarization: bool = False,
dual_channel: bool = False,
multi_channel: bool = False,
server_url: Optional[str] = None,
vocab: Optional[List[str]] = None,
timeout: int = 900,
@@ -245,7 +245,7 @@
timestamps: time unit of the timestamps (defaulted to seconds)
word_timestamps: whether the timestamps are represented by words (defaulted to False)
diarization: speaker labels for utterances (defaulted to False)
dual_channel: defaulted to False
multi_channel: defaulted to False
server_url: the URL used to reach the API
vocab: defaulted to empty list
timeout: defaulted to 900 seconds (15 minutes)
@@ -271,7 +271,7 @@
timestamps,
word_timestamps,
diarization,
dual_channel,
multi_channel,
server_url,
vocab,
timeout,
@@ -283,7 +283,7 @@
timestamps,
word_timestamps,
diarization,
dual_channel,
multi_channel,
server_url,
vocab,
timeout,
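run_audio_url and run_api_audio_file assemble the same request body, and run_api simply forwards the flag to whichever of the two it dispatches to; the only change in this file is the key name. A condensed sketch of that shared payload follows (build_payload is an illustrative name, not a function in the module).

from typing import Dict, List, Optional, Union

def build_payload(
    source_lang: str = "en",
    timestamps: str = "s",
    word_timestamps: bool = False,
    diarization: bool = False,
    multi_channel: bool = False,
    vocab: Optional[List[str]] = None,
) -> Dict[str, Union[bool, str, List[str]]]:
    """Request body shared by the automate helpers (illustrative sketch)."""
    data = {
        "diarization": diarization,
        "source_lang": source_lang,
        "timestamps": timestamps,
        "word_timestamps": word_timestamps,
        "multi_channel": multi_channel,  # formerly "dual_channel"
    }
    if vocab:
        data["vocab"] = vocab
    return data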
src/wordcab_transcribe/models.py (14 changes: 7 additions & 7 deletions)
@@ -86,7 +86,7 @@ class BaseResponse(BaseModel):
class AudioResponse(BaseResponse):
"""Response model for the ASR audio file and url endpoint."""

dual_channel: bool
multi_channel: bool

class Config:
"""Pydantic config class."""
@@ -132,7 +132,7 @@ class Config:
"diarization": None,
"post_processing": 0.239,
},
"dual_channel": False,
"multi_channel": False,
}
}

@@ -216,7 +216,7 @@ class CortexPayload(BaseModel):
offset_end: Optional[float] = None
num_speakers: Optional[int] = -1
diarization: Optional[bool] = False
dual_channel: Optional[bool] = False
multi_channel: Optional[bool] = False
source_lang: Optional[str] = "en"
timestamps: Optional[Timestamps] = Timestamps.seconds
vocab: Optional[List[str]] = []
@@ -242,7 +242,7 @@ class Config:
"offset_end": None,
"num_speakers": -1,
"diarization": False,
"dual_channel": False,
"multi_channel": False,
"source_lang": "en",
"timestamps": "s",
"vocab": [
@@ -313,7 +313,7 @@ class Config:
"diarization": None,
"post_processing": 0.239,
},
"dual_channel": False,
"multi_channel": False,
"job_name": "job_name",
"request_id": "request_id",
}
@@ -435,7 +435,7 @@ class Config:
class AudioRequest(BaseRequest):
"""Request model for the ASR audio file and url endpoint."""

dual_channel: bool = False
multi_channel: bool = False

class Config:
"""Pydantic config class."""
@@ -460,7 +460,7 @@ class Config:
"log_prob_threshold": -1.0,
"no_speech_threshold": 0.6,
"condition_on_previous_text": True,
"dual_channel": False,
"multi_channel": False,
}
}

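Stripped of the surrounding fields and example schemas, the request/response pair reduces to the shape below; this is a condensed sketch that inlines the BaseRequest/BaseResponse hierarchy and omits the other model fields, not the full definitions.

from pydantic import BaseModel

class AudioRequest(BaseModel):
    """Condensed sketch of the request model: clients opt in per request."""
    multi_channel: bool = False  # renamed from dual_channel; defaults to single-channel processing
    source_lang: str = "en"
    diarization: bool = False

class AudioResponse(BaseModel):
    """Condensed sketch of the response model: the flag is echoed back to the caller."""
    multi_channel: bool

# Example round-trip of the flag:
request = AudioRequest(multi_channel=True)
response = AudioResponse(multi_channel=request.multi_channel)
assert response.multi_channel is True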
src/wordcab_transcribe/router/v1/audio_file_endpoint.py (43 changes: 18 additions & 25 deletions)
@@ -31,10 +31,9 @@
from wordcab_transcribe.models import AudioRequest, AudioResponse
from wordcab_transcribe.utils import (
check_num_channels,
convert_file_to_wav,
delete_file,
process_audio_file,
save_file_locally,
split_dual_channel_file,
)

router = APIRouter()
@@ -49,7 +48,7 @@ async def inference_with_audio( # noqa: C901
offset_end: float = Form(None), # noqa: B008
num_speakers: int = Form(-1), # noqa: B008
diarization: bool = Form(False), # noqa: B008
dual_channel: bool = Form(False), # noqa: B008
multi_channel: bool = Form(False), # noqa: B008
source_lang: str = Form("en"), # noqa: B008
timestamps: str = Form("s"), # noqa: B008
vocab: List[str] = Form([]), # noqa: B008
@@ -87,30 +86,24 @@
log_prob_threshold=log_prob_threshold,
no_speech_threshold=no_speech_threshold,
condition_on_previous_text=condition_on_previous_text,
dual_channel=dual_channel,
multi_channel=multi_channel,
)

if data.dual_channel:
num_channels = await check_num_channels(filename)
num_channels = await check_num_channels(filename)
print(f"num_channels: {num_channels}")
if num_channels > 1 and data.multi_channel is False:
num_channels = 1 # Force mono channel if more than 1 channel

if num_channels == 2:
filepath = await split_dual_channel_file(filename)
data.dual_channel = True
else:
logger.error(
"Only 1 audio channel detected, fallback to single channel mode."
)
data.dual_channel = False

if not data.dual_channel:
try:
filepath = await convert_file_to_wav(filename)
try:
filepath: Union[str, List[str]] = await process_audio_file(
filename, num_channels=num_channels
)

except Exception as e:
raise HTTPException( # noqa: B904
status_code=http_status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Process failed: {e}",
)
except Exception as e:
raise HTTPException( # noqa: B904
status_code=http_status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Process failed: {e}",
)

background_tasks.add_task(delete_file, filepath=filename)

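Both this endpoint and the audio url endpoint below now call check_num_channels, force mono when multi_channel is off, and delegate conversion to process_audio_file in wordcab_transcribe/utils.py, whose implementation is not part of this diff. The following is only a hypothetical sketch of the contract the endpoints rely on (one wav path for mono input, one path per channel otherwise); the ffmpeg commands and helper names are assumptions, not the project's code.

import asyncio
from typing import List, Union

async def process_audio_file(filepath: str, num_channels: int = 1) -> Union[str, List[str]]:
    """Hypothetical sketch: return one 16 kHz wav for mono input, one wav per channel otherwise."""
    base = filepath.rsplit(".", 1)[0]
    if num_channels == 1:
        out = f"{base}_mono.wav"
        await _run_ffmpeg(["-i", filepath, "-ac", "1", "-ar", "16000", out])
        return out
    outputs = []
    for channel in range(num_channels):
        out = f"{base}_channel_{channel}.wav"
        # The pan filter keeps a single channel of the input as a mono track.
        await _run_ffmpeg(["-i", filepath, "-af", f"pan=mono|c0=c{channel}", "-ar", "16000", out])
        outputs.append(out)
    return outputs

async def _run_ffmpeg(args: List[str]) -> None:
    process = await asyncio.create_subprocess_exec("ffmpeg", "-y", *args)
    if await process.wait() != 0:
        raise RuntimeError(f"ffmpeg failed with arguments: {args}")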
@@ -121,7 +114,7 @@ async def inference_with_audio( # noqa: C901
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
dual_channel=data.dual_channel,
multi_channel=data.multi_channel,
source_lang=data.source_lang,
timestamps_format=data.timestamps,
vocab=data.vocab,
@@ -153,7 +146,7 @@ async def inference_with_audio( # noqa: C901
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
dual_channel=data.dual_channel,
multi_channel=data.multi_channel,
source_lang=data.source_lang,
timestamps=data.timestamps,
vocab=data.vocab,
src/wordcab_transcribe/router/v1/audio_url_endpoint.py (40 changes: 16 additions & 24 deletions)
@@ -20,7 +20,7 @@
"""Audio url endpoint for the Wordcab Transcribe API."""

import asyncio
from typing import Optional
from typing import List, Optional, Union

import shortuuid
from fastapi import APIRouter, BackgroundTasks, HTTPException
@@ -31,10 +31,9 @@
from wordcab_transcribe.models import AudioRequest, AudioResponse
from wordcab_transcribe.utils import (
check_num_channels,
convert_file_to_wav,
delete_file,
download_audio_file,
split_dual_channel_file,
process_audio_file,
)

router = APIRouter()
@@ -54,27 +53,20 @@ async def inference_with_audio_url(
async with download_limit:
_filepath = await download_audio_file("url", url, filename)

if data.dual_channel:
num_channels = await check_num_channels(filename)
num_channels = await check_num_channels(_filepath)
if num_channels > 1 and data.multi_channel is False:
num_channels = 1 # Force mono channel if more than 1 channel

if num_channels == 2:
filepath = await split_dual_channel_file(filename)
data.dual_channel = True
else:
logger.error(
"Only 1 audio channel detected, fallback to single channel mode."
)
data.dual_channel = False

if not data.dual_channel:
try:
filepath = await convert_file_to_wav(_filepath)
try:
filepath: Union[str, List[str]] = await process_audio_file(
_filepath, num_channels=num_channels
)

except Exception as e:
raise HTTPException( # noqa: B904
status_code=http_status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Process failed: {e}",
)
except Exception as e:
raise HTTPException( # noqa: B904
status_code=http_status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Process failed: {e}",
)

background_tasks.add_task(delete_file, filepath=filename)

@@ -85,7 +77,7 @@ async def inference_with_audio_url(
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
dual_channel=data.dual_channel,
multi_channel=data.multi_channel,
source_lang=data.source_lang,
timestamps_format=data.timestamps,
vocab=data.vocab,
@@ -117,7 +109,7 @@ async def inference_with_audio_url(
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
dual_channel=data.dual_channel,
multi_channel=data.multi_channel,
source_lang=data.source_lang,
timestamps=data.timestamps,
vocab=data.vocab,
src/wordcab_transcribe/router/v1/cortex_endpoint.py (2 changes: 1 addition & 1 deletion)
@@ -73,7 +73,7 @@ async def run_cortex(
offset_end=payload.offset_end,
num_speakers=payload.num_speakers,
diarization=payload.diarization,
dual_channel=payload.dual_channel,
multi_channel=payload.multi_channel,
source_lang=payload.source_lang,
timestamps=payload.timestamps,
vocab=payload.vocab,
src/wordcab_transcribe/router/v1/youtube_endpoint.py (2 changes: 1 addition & 1 deletion)
@@ -55,7 +55,7 @@ async def inference_with_youtube(
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
dual_channel=False,
multi_channel=False,
source_lang=data.source_lang,
timestamps_format=data.timestamps,
vocab=data.vocab,