Add whisper (#14)

* Update config * Add whisper * Update requirements.txt * Add functions to generate subtitles from whisper result * Update service * Update transcribers * Update README.md * Repair requirements.txt * Update dockerfile * Only download models on transcriber=vosk * Update whisper.generate Did that fix the silence issue? * Fix whisper silence issue * Remove print * Add advanced error handling
TUM-Dev · Dec 12, 2022 · 8d3ede6 · 8d3ede6
1 parent 30b7fa8
commit 8d3ede6
Show file tree

Hide file tree

Showing 10 changed files with 272 additions and 132 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -9,6 +9,9 @@ ADD config.yml .
 # Vosk Dependencies
 RUN apt-get update -y && apt-get install -y ffmpeg
 
+# Whisper Dependencies
+RUN apt-get install -y git
+
 
 # Install Python modules
 RUN pip install --no-cache-dir -r requirements.txt

diff --git a/README.md b/README.md
@@ -2,9 +2,7 @@
 
 Microservice that generates subtitles for [TUM-Live](https://live.rbg.tum.de).
 
-## Usage
-
-### Workflow
+## Workflow
 
 ```
                      ┌──────────┐
@@ -27,7 +25,7 @@ Microservice that generates subtitles for [TUM-Live](https://live.rbg.tum.de).
 └────────────┘                            └───────────┘
 ```
 
-### API
+## API
 
 ```bash
 $ grpcurl -plaintext localhost:50055 list voice.SubtitleGenerator
@@ -44,64 +42,89 @@ $ grpcurl -plaintext \
 
 ## Installation
 
-### Run with virtual environment
+### Python virtual environment
 
 ```bash 
 $ git clone https://github.com/TUM-Dev/TUM-Live-Voice-Service.git
 Cloning into 'TUM-Live-Voice-Service'...
-...
 $ cd TUM-Live-Voice-Service
 $ python -m venv venv
 $ source venv/bin/activate
 (venv) $ pip install --no-cache-dir -r requirements.txt 
 (venv) $ DEBUG=. CONFIG_FILE=./config.yml python3.9 subtitles/subtitles.py
 ...
-...
 ```
 
 Or simply use an open source IDE like [PyCharm CE](https://www.jetbrains.com/pycharm/).
 
 ### Docker
 
-#### build
-
-```bash
-$ docker build --no-cache -t voice-service-image .
-[+] Building 0.4s (1/10)...
-```
-
-#### run application
-
 ```bash
 $ docker run -p 50055:50055 \
   --name voice-service \
   -v /srv/static:/data \
   -e CONFIG_FILE=./config.yml \
   -e DEBUG=.\
   -d \
-  voice-service-image
+  ghcr.io/tum-dev/tum-live-voice-service:latest
 ```
 
-### Configuration 
+## Configuration 
 
 You can configure the application with: 
 - YAML file 
 - .env file and environment variables
 
-**Configuration precedence** 
-
-`>` = _overwrites_: `environment > .env > .yml`
+**Configuration precedence** (`>` = _overwrites_): `environment > .env > .yml`
 
-#### Examplary .env file 
+<details><summary>Exemplary .env file</summary>
+<p>
 
 ```bash
 API_PORT=51000
 REC_HOST=127.0.0.1
 REC_PORT=51001
 VOSK_MODEL_DIR=/data
 VOSK_DWNLD_URLS=https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip,https://alphacephei.com/vosk/models/vosk-model-small-de-0.15.zip
-VOSK_MODELS=/data/fr:fr,/data/en:en
+VOSK_MODELS=model-fr:fr,model-en:en
+WHISPER_MODEL=medium
 ```
+</p>
+</details>
+
+<details><summary>Exemplary YAML file</summary>
+<p>
+
+```YAML
+api:
+  port: 50055
+receiver:
+  host: localhost
+  port: 50053
+transcriber: 'whisper'
+vosk:
+  model_dir: '/data'
+  download_urls:
+    - https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
+    - https://alphacephei.com/vosk/models/vosk-model-small-de-0.15.zip
+  models:
+    - name: 'vosk-model-small-en-us-0.15'
+      lang: 'en'
+    - name: 'data/vosk-model-small-de-0.15'
+      lang: 'de'
+whisper:
+  model: 'tiny'
+```
+</p>
+</details>
+
+## Transcribers
+
+Currently, the following transcribers are implemented and can be specified in the configuration:
+  * [whisper](https://github.com/openai/whisper)
+  * [vosk](https://github.com/alphacep/vosk-api)
+  
+Which transcriber one chooses depends immensely on the use case and computing power available. We found that whisper produces much higher quality results, especially regarding punctuation, but is much more compute-heavy.
 
 ## License
 

diff --git a/config.yml b/config.yml
@@ -3,13 +3,16 @@ api:
 receiver:
   host: localhost
   port: 50053
+transcriber: 'whisper'
 vosk:
   model_dir: '/data'
   download_urls:
     - https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
     - https://alphacephei.com/vosk/models/vosk-model-small-de-0.15.zip
   models:
-    - path: '/data/vosk-model-small-en-us-0.15/'
+    - name: 'vosk-model-small-en-us-0.15'
       lang: 'en'
-    - path: '/data/vosk-model-small-de-0.15/'
-      lang: 'de'
+    - name: 'data/vosk-model-small-de-0.15'
+      lang: 'de'
+whisper:
+  model: 'tiny'
diff --git a/requirements.txt b/requirements.txt
@@ -1,22 +1,32 @@
-certifi==2022.6.15
+certifi==2022.9.24
 cffi==1.15.1
 charset-normalizer==2.1.1
-flake8==5.0.4
-grpcio==1.47.0
-grpcio-reflection==1.47.0
-grpcio-tools==1.47.0
-idna==3.3
-mccabe==0.7.0
-protobuf==3.20.1
-pycodestyle==2.9.1
+ffmpeg-python==0.2.0
+filelock==3.8.2
+future==0.18.2
+grpcio==1.51.1
+grpcio-reflection==1.51.1
+grpcio-tools==1.51.1
+huggingface-hub==0.11.1
+idna==3.4
+install==1.3.5
+more-itertools==9.0.0
+numpy==1.23.5
+packaging==21.3
+protobuf==4.21.10
 pycparser==2.21
-pyflakes==2.5.0
+pyparsing==3.0.9
 python-dotenv==0.21.0
 PyYAML==6.0
+regex==2022.10.31
 requests==2.28.1
-six==1.16.0
 srt==3.5.2
-tqdm==4.64.0
-urllib3==1.26.12
+tokenizers==0.13.2
+torch==1.13.0
+tqdm==4.64.1
+transformers==4.25.1
+typing_extensions==4.4.0
+urllib3==1.26.13
 vosk==0.3.44
-websockets==10.3
+websockets==10.4
+whisper @ git+https://github.com/openai/whisper.git@4179ed2475cc84cba66868b516232ef1b74dacdf
diff --git a/subtitles/properties.py b/subtitles/properties.py
@@ -1,6 +1,6 @@
-import os.path
 from copy import deepcopy
 from dotenv import load_dotenv
+import os.path
 import yaml
 
 
@@ -43,11 +43,7 @@ class EnvProperties:
     """Config which can be loaded with environment variables file"""
 
     def __init__(self, default=None) -> None:
-        """Initialize Config with a given YAML file.
-
-        Args:
-            path (str): The path to a YAML file.
-        """
+        """Initialize Config with a given .env file."""
         if default is None:
             default = {}
         self._default = default
@@ -57,13 +53,15 @@ def get(self) -> dict:
         """Reads the properties file, overwrites defaults, and returns a dictionary.
 
         Returns:
-            Dictionary (dict) containing the properties.
+            Dictionary containing the properties.
         """
         properties = deepcopy(self._default)
         properties['api']['port'] = os.getenv('API_PORT', properties['api']['port'])
         properties['receiver']['host'] = os.getenv('REC_HOST', properties['receiver']['host'])
         properties['receiver']['port'] = os.getenv('REC_PORT', properties['receiver']['port'])
 
+        properties['transcriber'] = os.getenv('TRANSCRIBER', properties['transcriber'])
+
         properties['vosk']['model_dir'] = os.getenv('VOSK_MODEL_DIR', properties['vosk']['model_dir'])
 
         # format: https://x.com,https://y.com
@@ -76,11 +74,14 @@ def get(self) -> dict:
                       for model in os.getenv('VOSK_MODELS').split(',')]
 
             properties['vosk']['models'] = models
+
+        properties['whisper']['model'] = os.getenv('WHISPER_MODEL', properties['whisper']['model'])
+
         return properties
 
     def __to_model_obj(self, model: str):
         model_lang_pair = model.split(':')
-        return {'path': model_lang_pair[0], 'lang': model_lang_pair[1]}
+        return {'name': model_lang_pair[0], 'lang': model_lang_pair[1]}
 
 
 def _validate(file_path: str, file_type: str) -> None: