Replace dual_channel by multi_channel process (#248)
* replace dual_channel by multi_channel process

* remove unused functions
Thomas Chaigneau authored Sep 12, 2023
1 parent 1581607 commit b117d53
Showing 14 changed files with 195 additions and 236 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -176,7 +176,7 @@ filepath = "/path/to/audio/file.wav" # or any other convertible format by ffmpeg
data = {
"num_speakers": -1, # # Leave at -1 to guess the number of speakers
"diarization": True, # Longer processing time but speaker segment attribution
"dual_channel": False, # Only for stereo audio files with one speaker per channel
"multi_channel": False, # Only for stereo audio files with one speaker per channel
"source_lang": "en", # optional, default is "en"
"timestamps": "s", # optional, default is "s". Can be "s", "ms" or "hms".
"word_timestamps": False, # optional, default is False
notebooks/local_audio_inference.py (11 changes: 6 additions & 5 deletions)
@@ -4,16 +4,17 @@

# filepath = "data/short_one_speaker.mp3"
# filepath = "data/24118946.mp3"
filepath = "data/HL_Podcast_1.mp3"
# filepath = "data/HL_Podcast_1.mp3"
# filepath = "data/1693323170.139915.139915&delay=10.mp3"
filepath = "data/2414612_1692939445.333949_2023_08_25_045726_0.mp3"

data = {
"offset_start": 23,
"offset_end": 186.5,
"offset_start": None,
"offset_end": None,
"num_speakers": -1, # Leave at -1 to guess the number of speakers
"diarization": True, # Longer processing time but speaker segment attribution
"dual_channel": False, # Only for stereo audio files with one speaker per channel
"source_lang": "en", # optional, default is "en"
"multi_channel": True, # Only for stereo audio files with one speaker per channel
"source_lang": "th", # optional, default is "en"
"timestamps": "s", # optional, default is "s". Can be "s", "ms" or "hms".
"word_timestamps": False, # optional, default is False
"internal_vad": False,
src/wordcab_transcribe/automate.py (20 changes: 10 additions & 10 deletions)
@@ -94,7 +94,7 @@ def run_audio_url(
timestamps: str = "s",
word_timestamps: bool = False,
diarization: bool = False,
dual_channel: bool = False,
multi_channel: bool = False,
server_url: Optional[str] = None,
vocab: Optional[List[str]] = None,
timeout: int = 900,
@@ -108,7 +108,7 @@
timestamps: time unit of the timestamps (defaulted to seconds)
word_timestamps: associated words and their timestamps (defaulted to False)
diarization: speaker labels for utterances (defaulted to False)
dual_channel: defaulted to False
multi_channel: defaulted to False
server_url: the URL used to reach the API
vocab: defaulted to empty list
timeout: defaulted to 900 seconds (15 minutes)
@@ -125,7 +125,7 @@
"source_lang": source_lang,
"timestamps": timestamps,
"word_timestamps": word_timestamps,
"dual_channel": dual_channel,
"multi_channel": multi_channel,
}
if vocab:
data["vocab"] = vocab
@@ -163,7 +163,7 @@ def run_api_audio_file(
timestamps: str = "s",
word_timestamps: bool = False,
diarization: bool = False,
dual_channel: bool = False,
multi_channel: bool = False,
server_url: Optional[str] = None,
vocab: Optional[List[str]] = None,
timeout: int = 900,
@@ -177,7 +177,7 @@
timestamps: time unit of the timestamps (defaulted to seconds)
word_timestamps: associated words and their timestamps (defaulted to False)
diarization: speaker labels for utterances (defaulted to False)
dual_channel: defaulted to False
multi_channel: defaulted to False
server_url: the URL used to reach the API
vocab: defaulted to empty list
timeout: defaulted to 900 seconds (15 minutes)
@@ -190,7 +190,7 @@
"source_lang": source_lang,
"timestamps": timestamps,
"word_timestamps": word_timestamps,
"dual_channel": dual_channel,
"multi_channel": multi_channel,
}
if vocab:
data["vocab"] = vocab
@@ -230,7 +230,7 @@ def run_api(
timestamps: str = "s",
word_timestamps: bool = False,
diarization: bool = False,
dual_channel: bool = False,
multi_channel: bool = False,
server_url: Optional[str] = None,
vocab: Optional[List[str]] = None,
timeout: int = 900,
@@ -245,7 +245,7 @@
timestamps: time unit of the timestamps (defaulted to seconds)
word_timestamps: whether the timestamps are represented by words (defaulted to False)
diarization: speaker labels for utterances (defaulted to False)
dual_channel: defaulted to False
multi_channel: defaulted to False
server_url: the URL used to reach the API
vocab: defaulted to empty list
timeout: defaulted to 900 seconds (15 minutes)
@@ -271,7 +271,7 @@
timestamps,
word_timestamps,
diarization,
dual_channel,
multi_channel,
server_url,
vocab,
timeout,
@@ -283,7 +283,7 @@
timestamps,
word_timestamps,
diarization,
dual_channel,
multi_channel,
server_url,
vocab,
timeout,
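run_audio_url and run_api_audio_file assemble the same request body, and run_api simply forwards the flag to whichever of the two it dispatches to; the only change in this file is the key name. A condensed sketch of that shared payload follows (build_payload is an illustrative name, not a function in the module).

from typing import Dict, List, Optional, Union

def build_payload(
    source_lang: str = "en",
    timestamps: str = "s",
    word_timestamps: bool = False,
    diarization: bool = False,
    multi_channel: bool = False,
    vocab: Optional[List[str]] = None,
) -> Dict[str, Union[bool, str, List[str]]]:
    """Request body shared by the automate helpers (illustrative sketch)."""
    data = {
        "diarization": diarization,
        "source_lang": source_lang,
        "timestamps": timestamps,
        "word_timestamps": word_timestamps,
        "multi_channel": multi_channel,  # formerly "dual_channel"
    }
    if vocab:
        data["vocab"] = vocab
    return data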
src/wordcab_transcribe/models.py (14 changes: 7 additions & 7 deletions)
@@ -86,7 +86,7 @@ class BaseResponse(BaseModel):
class AudioResponse(BaseResponse):
"""Response model for the ASR audio file and url endpoint."""

dual_channel: bool
multi_channel: bool

class Config:
"""Pydantic config class."""
@@ -132,7 +132,7 @@ class Config:
"diarization": None,
"post_processing": 0.239,
},
"dual_channel": False,
"multi_channel": False,
}
}

@@ -216,7 +216,7 @@ class CortexPayload(BaseModel):
offset_end: Optional[float] = None
num_speakers: Optional[int] = -1
diarization: Optional[bool] = False
dual_channel: Optional[bool] = False
multi_channel: Optional[bool] = False
source_lang: Optional[str] = "en"
timestamps: Optional[Timestamps] = Timestamps.seconds
vocab: Optional[List[str]] = []
@@ -242,7 +242,7 @@ class Config:
"offset_end": None,
"num_speakers": -1,
"diarization": False,
"dual_channel": False,
"multi_channel": False,
"source_lang": "en",
"timestamps": "s",
"vocab": [
@@ -313,7 +313,7 @@ class Config:
"diarization": None,
"post_processing": 0.239,
},
"dual_channel": False,
"multi_channel": False,
"job_name": "job_name",
"request_id": "request_id",
}
@@ -435,7 +435,7 @@ class Config:
class AudioRequest(BaseRequest):
"""Request model for the ASR audio file and url endpoint."""

dual_channel: bool = False
multi_channel: bool = False

class Config:
"""Pydantic config class."""
@@ -460,7 +460,7 @@ class Config:
"log_prob_threshold": -1.0,
"no_speech_threshold": 0.6,
"condition_on_previous_text": True,
"dual_channel": False,
"multi_channel": False,
}
}

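Stripped of the surrounding fields and example schemas, the request/response pair reduces to the shape below; this is a condensed sketch that inlines the BaseRequest/BaseResponse hierarchy and omits the other model fields, not the full definitions.

from pydantic import BaseModel

class AudioRequest(BaseModel):
    """Condensed sketch of the request model: clients opt in per request."""
    multi_channel: bool = False  # renamed from dual_channel; defaults to single-channel processing
    source_lang: str = "en"
    diarization: bool = False

class AudioResponse(BaseModel):
    """Condensed sketch of the response model: the flag is echoed back to the caller."""
    multi_channel: bool

# Example round-trip of the flag:
request = AudioRequest(multi_channel=True)
response = AudioResponse(multi_channel=request.multi_channel)
assert response.multi_channel is True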
src/wordcab_transcribe/router/v1/audio_file_endpoint.py (43 changes: 18 additions & 25 deletions)
@@ -31,10 +31,9 @@
from wordcab_transcribe.models import AudioRequest, AudioResponse
from wordcab_transcribe.utils import (
check_num_channels,
convert_file_to_wav,
delete_file,
process_audio_file,
save_file_locally,
split_dual_channel_file,
)

router = APIRouter()
@@ -49,7 +48,7 @@ async def inference_with_audio( # noqa: C901
offset_end: float = Form(None), # noqa: B008
num_speakers: int = Form(-1), # noqa: B008
diarization: bool = Form(False), # noqa: B008
dual_channel: bool = Form(False), # noqa: B008
multi_channel: bool = Form(False), # noqa: B008
source_lang: str = Form("en"), # noqa: B008
timestamps: str = Form("s"), # noqa: B008
vocab: List[str] = Form([]), # noqa: B008
@@ -87,30 +86,24 @@
log_prob_threshold=log_prob_threshold,
no_speech_threshold=no_speech_threshold,
condition_on_previous_text=condition_on_previous_text,
dual_channel=dual_channel,
multi_channel=multi_channel,
)

if data.dual_channel:
num_channels = await check_num_channels(filename)
num_channels = await check_num_channels(filename)
print(f"num_channels: {num_channels}")
if num_channels > 1 and data.multi_channel is False:
num_channels = 1 # Force mono channel if more than 1 channel

if num_channels == 2:
filepath = await split_dual_channel_file(filename)
data.dual_channel = True
else:
logger.error(
"Only 1 audio channel detected, fallback to single channel mode."
)
data.dual_channel = False

if not data.dual_channel:
try:
filepath = await convert_file_to_wav(filename)
try:
filepath: Union[str, List[str]] = await process_audio_file(
filename, num_channels=num_channels
)

except Exception as e:
raise HTTPException( # noqa: B904
status_code=http_status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Process failed: {e}",
)
except Exception as e:
raise HTTPException( # noqa: B904
status_code=http_status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Process failed: {e}",
)

background_tasks.add_task(delete_file, filepath=filename)

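Both this endpoint and the audio url endpoint below now call check_num_channels, force mono when multi_channel is off, and delegate conversion to process_audio_file in wordcab_transcribe/utils.py, whose implementation is not part of this diff. The following is only a hypothetical sketch of the contract the endpoints rely on (one wav path for mono input, one path per channel otherwise); the ffmpeg commands and helper names are assumptions, not the project's code.

import asyncio
from typing import List, Union

async def process_audio_file(filepath: str, num_channels: int = 1) -> Union[str, List[str]]:
    """Hypothetical sketch: return one 16 kHz wav for mono input, one wav per channel otherwise."""
    base = filepath.rsplit(".", 1)[0]
    if num_channels == 1:
        out = f"{base}_mono.wav"
        await _run_ffmpeg(["-i", filepath, "-ac", "1", "-ar", "16000", out])
        return out
    outputs = []
    for channel in range(num_channels):
        out = f"{base}_channel_{channel}.wav"
        # The pan filter keeps a single channel of the input as a mono track.
        await _run_ffmpeg(["-i", filepath, "-af", f"pan=mono|c0=c{channel}", "-ar", "16000", out])
        outputs.append(out)
    return outputs

async def _run_ffmpeg(args: List[str]) -> None:
    process = await asyncio.create_subprocess_exec("ffmpeg", "-y", *args)
    if await process.wait() != 0:
        raise RuntimeError(f"ffmpeg failed with arguments: {args}")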
@@ -121,7 +114,7 @@ async def inference_with_audio( # noqa: C901
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
dual_channel=data.dual_channel,
multi_channel=data.multi_channel,
source_lang=data.source_lang,
timestamps_format=data.timestamps,
vocab=data.vocab,
@@ -153,7 +146,7 @@ async def inference_with_audio( # noqa: C901
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
dual_channel=data.dual_channel,
multi_channel=data.multi_channel,
source_lang=data.source_lang,
timestamps=data.timestamps,
vocab=data.vocab,
src/wordcab_transcribe/router/v1/audio_url_endpoint.py (40 changes: 16 additions & 24 deletions)
@@ -20,7 +20,7 @@
"""Audio url endpoint for the Wordcab Transcribe API."""

import asyncio
from typing import Optional
from typing import List, Optional, Union

import shortuuid
from fastapi import APIRouter, BackgroundTasks, HTTPException
@@ -31,10 +31,9 @@
from wordcab_transcribe.models import AudioRequest, AudioResponse
from wordcab_transcribe.utils import (
check_num_channels,
convert_file_to_wav,
delete_file,
download_audio_file,
split_dual_channel_file,
process_audio_file,
)

router = APIRouter()
@@ -54,27 +53,20 @@ async def inference_with_audio_url(
async with download_limit:
_filepath = await download_audio_file("url", url, filename)

if data.dual_channel:
num_channels = await check_num_channels(filename)
num_channels = await check_num_channels(_filepath)
if num_channels > 1 and data.multi_channel is False:
num_channels = 1 # Force mono channel if more than 1 channel

if num_channels == 2:
filepath = await split_dual_channel_file(filename)
data.dual_channel = True
else:
logger.error(
"Only 1 audio channel detected, fallback to single channel mode."
)
data.dual_channel = False

if not data.dual_channel:
try:
filepath = await convert_file_to_wav(_filepath)
try:
filepath: Union[str, List[str]] = await process_audio_file(
_filepath, num_channels=num_channels
)

except Exception as e:
raise HTTPException( # noqa: B904
status_code=http_status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Process failed: {e}",
)
except Exception as e:
raise HTTPException( # noqa: B904
status_code=http_status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Process failed: {e}",
)

background_tasks.add_task(delete_file, filepath=filename)

@@ -85,7 +77,7 @@ async def inference_with_audio_url(
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
dual_channel=data.dual_channel,
multi_channel=data.multi_channel,
source_lang=data.source_lang,
timestamps_format=data.timestamps,
vocab=data.vocab,
@@ -117,7 +109,7 @@ async def inference_with_audio_url(
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
dual_channel=data.dual_channel,
multi_channel=data.multi_channel,
source_lang=data.source_lang,
timestamps=data.timestamps,
vocab=data.vocab,
src/wordcab_transcribe/router/v1/cortex_endpoint.py (2 changes: 1 addition & 1 deletion)
@@ -73,7 +73,7 @@ async def run_cortex(
offset_end=payload.offset_end,
num_speakers=payload.num_speakers,
diarization=payload.diarization,
dual_channel=payload.dual_channel,
multi_channel=payload.multi_channel,
source_lang=payload.source_lang,
timestamps=payload.timestamps,
vocab=payload.vocab,
src/wordcab_transcribe/router/v1/youtube_endpoint.py (2 changes: 1 addition & 1 deletion)
@@ -55,7 +55,7 @@ async def inference_with_youtube(
offset_end=data.offset_end,
num_speakers=data.num_speakers,
diarization=data.diarization,
dual_channel=False,
multi_channel=False,
source_lang=data.source_lang,
timestamps_format=data.timestamps,
vocab=data.vocab,