Skip to content

Commit

Permalink
added source_lang setting, option to change timestamp format
Browse files Browse the repository at this point in the history
  • Loading branch information
aleksandr-smechov committed Mar 31, 2023
1 parent 09bd4e6 commit e663673
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 57 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
models
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
FROM nvidia/cuda:11.7.0-devel-ubuntu22.04

COPY requirements.txt /requirements.txt
RUN apt-get update && apt-get install -y \
git \
curl \
Expand All @@ -14,6 +13,8 @@ RUN add-apt-repository ppa:deadsnakes/ppa \
RUN python3.10 -m pip install -r requirements.txt
RUN python3.10 -m pip install --upgrade torch==1.13.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117

COPY requirements.txt /requirements.txt

COPY . /app
WORKDIR /app

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ pydantic>=1.10.7
python-dotenv>=1.0.0
python-multipart>=0.0.6
scikit-learn>=1.2.2
shortuuid>=1.0.0
uvicorn>=0.21.1
yt-dlp>=2023.3.4
43 changes: 28 additions & 15 deletions wordcab_transcribe/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,28 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Main API module of the Wordcab Transcribe."""

import aiofiles
import asyncio
import random
import asyncio

import aiofiles
import shortuuid
from loguru import logger
from typing import Optional

from fastapi import BackgroundTasks, FastAPI, File, UploadFile
from fastapi import status as http_status
from fastapi.responses import HTMLResponse
from fastapi import BackgroundTasks, FastAPI, File, UploadFile

from wordcab_transcribe.config import settings
from wordcab_transcribe.models import ASRResponse
from wordcab_transcribe.service import ASRService
from wordcab_transcribe.utils import convert_file_to_wav, delete_file, download_file_from_youtube



app = FastAPI(
title=settings.project_name,
version=settings.version,
Expand Down Expand Up @@ -79,15 +84,22 @@ async def health_check():
async def inference_with_audio(
background_tasks: BackgroundTasks,
file: UploadFile = File(...),
num_speakers: int | None = None,
num_speakers: Optional[int] = 0,
source_lang: Optional[str] = "en",
timestamps: Optional[str] = "seconds",
):
"""
Inference endpoint.
Args:
background_tasks (BackgroundTasks): Background tasks dependency.
file (UploadFile): Audio file.
num_speakers (int): Number of speakers in the audio file. Default: 0.
num_speakers (int): Number of speakers to detect; defaults to 0, which
attempts to detect the number of speakers.
source_lang (str): The language of the source file; defaults to "en".
timestamps (str): The format of the transcript timestamps. Options
are "seconds", "milliseconds", or "hms" (hours, minutes, seconds).
Defaults to "seconds".
Returns:
ASRResponse: Response data.
Expand All @@ -101,10 +113,9 @@ async def inference_with_audio(
response = requests.post("url/api/v1/audio", files=files)
print(response.json())
"""
num_speakers = num_speakers or 0
extension = file.filename.split(".")[-1]
filename = f"audio_{shortuuid.ShortUUID().random(length=32)}.{extension}"

filename = f"audio_{''.join(random.choices('abcdefghijklmnopqrstuvwxyz0123456789', k=32))}.{extension}"
async with aiofiles.open(filename, "wb") as f:
audio_bytes = await file.read()
await f.write(audio_bytes)
Expand All @@ -115,12 +126,12 @@ async def inference_with_audio(
else:
filepath = filename

utterances = await asr.process_input(filepath, num_speakers)
utterances = await asr.process_input(filepath, num_speakers, source_lang, timestamps)
utterances = [
{
"start": float(utterance["start"]),
"text": str(utterance["text"]),
"end": float(utterance["end"]),
"start": utterance["start"],
"end": utterance["end"],
"speaker": int(utterance["speaker"]),
}
for utterance in utterances
Expand All @@ -140,7 +151,9 @@ async def inference_with_audio(
async def inference_with_youtube(
background_tasks: BackgroundTasks,
url: str,
num_speakers: int | None = None,
num_speakers: Optional[int] = 0,
source_lang: Optional[str] = "en",
timestamps: Optional[str] = "seconds",
):
"""
Inference endpoint.
Expand All @@ -161,15 +174,15 @@ async def inference_with_youtube(
"""
num_speakers = num_speakers or 0

filename = f"audio_{''.join(random.choices('abcdefghijklmnopqrstuvwxyz0123456789', k=32))}"
filename = f"yt_{shortuuid.ShortUUID().random(length=32)}"
filepath = await download_file_from_youtube(url, filename)

utterances = await asr.process_input(filepath, num_speakers)
utterances = await asr.process_input(filepath, num_speakers, source_lang, timestamps)
utterances = [
{
"start": float(utterance["start"]),
"text": str(utterance["text"]),
"end": float(utterance["end"]),
"start": utterance["start"],
"end": utterance["end"],
"speaker": int(utterance["speaker"]),
}
for utterance in utterances
Expand Down
Loading

0 comments on commit e663673

Please sign in to comment.