Modularize tricky dependencies (#95)

* removed pyaudio from threaded transport
* modularized torch and torchaudio
* modularized local transport
* Working Dockerfile as well
* docker updates for fly.io
pipecat-ai · Apr 3, 2024 · 2f59e38
1 parent c210148
commit 2f59e38
Show file tree

Hide file tree

Showing 8 changed files with 69 additions and 37 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,30 @@
+# flyctl launch added from .gitignore
+**/.vscode
+**/env
+**/__pycache__
+**/*~
+**/venv
+#*#
+
+# Distribution / packaging
+**/.Python
+**/build
+**/develop-eggs
+**/dist
+**/downloads
+**/eggs
+**/.eggs
+**/lib
+**/lib64
+**/parts
+**/sdist
+**/var
+**/wheels
+**/share/python-wheels
+**/*.egg-info
+**/.installed.cfg
+**/*.egg
+**/MANIFEST
+**/.DS_Store
+**/.env
+fly.toml
diff --git a/.gitignore b/.gitignore
@@ -26,3 +26,4 @@ share/python-wheels/
 MANIFEST
 .DS_Store
 .env
+fly.toml
diff --git a/examples/server/Dockerfile b/Dockerfile
@@ -7,13 +7,14 @@ COPY *.py /app
 COPY pyproject.toml /app
 
 COPY src/ /app/src/
+COPY examples/ /app/examples/
 
 WORKDIR /app
 RUN ls --recursive /app/
 RUN pip3 install --upgrade -r requirements.txt
 RUN python -m build .
 RUN pip3 install .
-
+RUN pip3 install gunicorn
 # If running on Ubuntu, Azure TTS requires some extra config
 # https://learn.microsoft.com/en-us/azure/ai-services/speech-service/quickstarts/setup-platform?pivots=programming-language-python&tabs=linux%2Cubuntu%2Cdotnetcli%2Cdotnet%2Cjre%2Cmaven%2Cnodejs%2Cmac%2Cpypi
 
@@ -36,4 +37,4 @@ WORKDIR /app
 
 EXPOSE 8000
 # run
-CMD ["gunicorn", "--workers=2", "--log-level", "debug", "--capture-output", "daily-bot-manager:app", "--bind=0.0.0.0:8000"]
+CMD ["gunicorn", "--workers=2", "--log-level", "debug", "--chdir", "examples/server", "--capture-output", "daily-bot-manager:app", "--bind=0.0.0.0:8000"]
diff --git a/examples/server/daily-bot-manager.py b/examples/server/daily-bot-manager.py
@@ -14,10 +14,10 @@
 CORS(app)
 
 APPS = {
-    "chatbot": "examples/starter-apps/chatbot.py",
-    "patient-intake": "examples/starter-apps/patient-intake.py",
-    "storybot": "examples/starter-apps/storybot.py",
-    "translator": "examples/starter-apps/translator.py"
+    "chatbot": "../starter-apps/chatbot.py",
+    "patient-intake": "../starter-apps/patient-intake.py",
+    "storybot": "../starter-apps/storybot.py",
+    "translator": "../starter-apps/translator.py"
 }
 
 daily_api_key = os.getenv("DAILY_API_KEY")
@@ -157,7 +157,7 @@ def start(botname):
         else:
             return jsonify({"room_url": room_url, "token": token})
     except BaseException as e:
-        return "There was a problem starting the bot: {e}", 500
+        return f"There was a problem starting the bot: {e}", 500
 
 
 @app.route("/healthz")

diff --git a/examples/starter-apps/translator.py b/examples/starter-apps/translator.py
@@ -84,7 +84,6 @@ async def main(room_url: str, token):
             mic_enabled=True,
             mic_sample_rate=16000,
             camera_enabled=False,
-            vad_enabled=True,
         )
         tts = AzureTTSService(
             api_key=os.getenv("AZURE_SPEECH_API_KEY"),
@@ -98,7 +97,7 @@ async def main(room_url: str, token):
         tp = TranslationProcessor("Spanish")
         lfra = LLMFullResponseAggregator()
         ts = TranslationSubtitles("spanish")
-        pipeline = Pipeline([sa, tp, llm, lfra, ts])
+        pipeline = Pipeline([sa, tp, llm, lfra, ts, tts])
 
         transport.transcription_settings["extra"]["endpointing"] = True
         transport.transcription_settings["extra"]["punctuate"] = True

diff --git a/pyproject.toml b/pyproject.toml
@@ -26,15 +26,14 @@ dependencies = [
     "daily-python",
     "fal",
     "faster_whisper",
+    "flask",
+    "flask_cors",
     "google-cloud-texttospeech",
     "numpy",
     "openai",
     "Pillow",
     "pyht",
     "python-dotenv",
-    "torch",
-    "torchaudio",
-    "pyaudio",
     "typing-extensions",
     "websockets"
 ]

diff --git a/src/dailyai/transports/local_transport.py b/src/dailyai/transports/local_transport.py
@@ -1,14 +1,20 @@
 import asyncio
 import numpy as np
 import tkinter as tk
-import pyaudio
 
 from dailyai.transports.threaded_transport import ThreadedTransport
 
 
 class LocalTransport(ThreadedTransport):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        try:
+            global pyaudio
+            import pyaudio
+        except ModuleNotFoundError as e:
+            print(f"Exception: {e}")
+            print("In order to use the local transport, you'll need to `pip install pyaudio`. On MacOS, you'll also need to `brew install portaudio`.")
+            raise Exception(f"Missing module: {e}")
         self._sample_width = kwargs.get("sample_width") or 2
         self._n_channels = kwargs.get("n_channels") or 1
         self._tk_root = kwargs.get("tk_root") or None

diff --git a/src/dailyai/transports/threaded_transport.py b/src/dailyai/transports/threaded_transport.py
@@ -3,8 +3,7 @@
 import itertools
 import logging
 import numpy as np
-import pyaudio
-import torch
+
 import queue
 import threading
 import time
@@ -29,22 +28,6 @@
 from dailyai.services.ai_services import TTSService
 from dailyai.transports.abstract_transport import AbstractTransport
 
-torch.set_num_threads(1)
-
-model, utils = torch.hub.load(
-    repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False
-)
-
-(get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils
-
-# Taken from utils_vad.py
-
-
-def validate(model, inputs: torch.Tensor):
-    with torch.no_grad():
-        outs = model(inputs)
-    return outs
-
 
 # Provided by Alexander Veysov
 
@@ -58,12 +41,7 @@ def int2float(sound):
     return sound
 
 
-FORMAT = pyaudio.paInt16
-CHANNELS = 1
 SAMPLE_RATE = 16000
-CHUNK = int(SAMPLE_RATE / 10)
-
-audio = pyaudio.PyAudio()
 
 
 class VADState(Enum):
@@ -90,6 +68,24 @@ def __init__(
                 "Sorry, you can't use speaker_enabled and vad_enabled at the same time. Please set one to False."
             )
 
+        if self._vad_enabled:
+            try:
+                global torch, torchaudio
+                import torch
+                # We don't use torchaudio here, but we need to try importing it because
+                # Silero uses it
+                import torchaudio
+                torch.set_num_threads(1)
+
+                (self.model, self.utils) = torch.hub.load(
+                    repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False
+                )
+
+            except ModuleNotFoundError as e:
+                print(f"Exception: {e}")
+                print("In order to use VAD, you'll need to install the `torch` and `torchaudio` modules.")
+                raise Exception(f"Missing module(s): {e}")
+
         self._vad_samples = 1536
         vad_frame_s = self._vad_samples / SAMPLE_RATE
         self._vad_start_frames = round(self._vad_start_s / vad_frame_s)
@@ -276,7 +272,7 @@ def _vad(self):
             audio_chunk = self.read_audio_frames(self._vad_samples)
             audio_int16 = np.frombuffer(audio_chunk, np.int16)
             audio_float32 = int2float(audio_int16)
-            new_confidence = model(
+            new_confidence = self.model(
                 torch.from_numpy(audio_float32), 16000).item()
             speaking = new_confidence > 0.5