added source_lang setting, option to change timestamp format

Wordcab · Mar 31, 2023 · e663673 · e663673
1 parent 09bd4e6
commit e663673
Show file tree

Hide file tree

Showing 5 changed files with 144 additions and 57 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1 @@
+models
diff --git a/Dockerfile b/Dockerfile
@@ -1,6 +1,5 @@
 FROM nvidia/cuda:11.7.0-devel-ubuntu22.04
 
-COPY requirements.txt /requirements.txt
 RUN apt-get update && apt-get install -y \
     git \
     curl \
@@ -14,6 +13,8 @@ RUN add-apt-repository ppa:deadsnakes/ppa \
 RUN python3.10 -m pip install -r requirements.txt
 RUN python3.10 -m pip install --upgrade torch==1.13.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117
 
+COPY requirements.txt /requirements.txt
+
 COPY . /app
 WORKDIR /app
 

diff --git a/requirements.txt b/requirements.txt
@@ -8,5 +8,6 @@ pydantic>=1.10.7
 python-dotenv>=1.0.0
 python-multipart>=0.0.6
 scikit-learn>=1.2.2
+shortuuid>=1.0.0
 uvicorn>=0.21.1
 yt-dlp>=2023.3.4
diff --git a/wordcab_transcribe/main.py b/wordcab_transcribe/main.py
@@ -11,23 +11,28 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 """Main API module of the Wordcab Transcribe."""
 
-import aiofiles
-import asyncio
 import random
+import asyncio
+
+import aiofiles
+import shortuuid
 from loguru import logger
+from typing import Optional
 
-from fastapi import BackgroundTasks, FastAPI, File, UploadFile
 from fastapi import status as http_status
 from fastapi.responses import HTMLResponse
+from fastapi import BackgroundTasks, FastAPI, File, UploadFile
 
 from wordcab_transcribe.config import settings
 from wordcab_transcribe.models import ASRResponse
 from wordcab_transcribe.service import ASRService
 from wordcab_transcribe.utils import convert_file_to_wav, delete_file, download_file_from_youtube
 
 
+
 app = FastAPI(
     title=settings.project_name,
     version=settings.version,
@@ -79,15 +84,22 @@ async def health_check():
 async def inference_with_audio(
     background_tasks: BackgroundTasks,
     file: UploadFile = File(...),
-    num_speakers: int | None = None,
+    num_speakers: Optional[int] = 0,
+    source_lang: Optional[str] = "en",
+    timestamps: Optional[str] = "seconds",
 ):
     """
     Inference endpoint.
 
     Args:
         background_tasks (BackgroundTasks): Background tasks dependency.
         file (UploadFile): Audio file.
-        num_speakers (int): Number of speakers in the audio file. Default: 0.
+        num_speakers (int): Number of speakers to detect; defaults to 0, which
+            attempts to detect the number of speakers.
+        source_lang (str): The language of the source file; defaults to "en".
+        timestamps (str): The format of the transcript timestamps. Options
+            are "seconds", "milliseconds", or "hms", which stands for hours,
+            minutes, seconds. Defaults to "seconds".
 
     Returns:
         ASRResponse: Response data.
@@ -101,10 +113,9 @@ async def inference_with_audio(
             response = requests.post("url/api/v1/audio", files=files)
             print(response.json())
     """
-    num_speakers = num_speakers or 0
     extension = file.filename.split(".")[-1]
+    filename = f"audio_{shortuuid.ShortUUID().random(length=32)}.{extension}"
 
-    filename = f"audio_{''.join(random.choices('abcdefghijklmnopqrstuvwxyz0123456789', k=32))}.{extension}"
     async with aiofiles.open(filename, "wb") as f:
         audio_bytes = await file.read()
         await f.write(audio_bytes)
@@ -115,12 +126,12 @@ async def inference_with_audio(
     else:
         filepath = filename
 
-    utterances = await asr.process_input(filepath, num_speakers)
+    utterances = await asr.process_input(filepath, num_speakers, source_lang, timestamps)
     utterances = [
         {
-            "start": float(utterance["start"]),
             "text": str(utterance["text"]),
-            "end": float(utterance["end"]),
+            "start": utterance["start"],
+            "end": utterance["end"],
             "speaker": int(utterance["speaker"]),
         }
         for utterance in utterances
@@ -140,7 +151,9 @@ async def inference_with_audio(
 async def inference_with_youtube(
     background_tasks: BackgroundTasks,
     url: str,
-    num_speakers: int | None = None,
+    num_speakers: Optional[int] = 0,
+    source_lang: Optional[str] = "en",
+    timestamps: Optional[str] = "seconds",
 ):
     """
     Inference endpoint.
@@ -161,15 +174,15 @@ async def inference_with_youtube(
     """
     num_speakers = num_speakers or 0
 
-    filename = f"audio_{''.join(random.choices('abcdefghijklmnopqrstuvwxyz0123456789', k=32))}"
+    filename = f"yt_{shortuuid.ShortUUID().random(length=32)}"
     filepath = await download_file_from_youtube(url, filename)
 
-    utterances = await asr.process_input(filepath, num_speakers)
+    utterances = await asr.process_input(filepath, num_speakers, source_lang, timestamps)
     utterances = [
         {
-            "start": float(utterance["start"]),
             "text": str(utterance["text"]),
-            "end": float(utterance["end"]),
+            "start": utterance["start"],
+            "end": utterance["end"],
             "speaker": int(utterance["speaker"]),
         }
         for utterance in utterances