Websocket transport
Moishe committed Mar 24, 2024
1 parent 2c5628a commit 695d060
Showing 13 changed files with 630 additions and 19 deletions.
18 changes: 15 additions & 3 deletions .github/workflows/lint.yaml
@@ -22,11 +22,23 @@ jobs:
     steps:
       - name: Checkout repo
         uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - name: Setup virtual environment
+        run: |
+          python -m venv .venv
+      - name: Install basic Python dependencies
+        run: |
+          source .venv/bin/activate
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
       - name: autopep8
         id: autopep8
-        uses: peter-evans/autopep8@v2
-        with:
-          args: --exit-code -r -d -a -a src/
+        run: |
+          source .venv/bin/activate
+          autopep8 --exit-code -r -d --exclude "*_pb2.py" -a -a src/
       - name: Fail if autopep8 requires changes
         if: steps.autopep8.outputs.exit-code == 2
         run: exit 1
25 changes: 25 additions & 0 deletions examples/foundational/websocket-server/frames.proto
@@ -0,0 +1,25 @@
syntax = "proto3";

package dailyai_proto;

message TextFrame {
  string text = 1;
}

message AudioFrame {
  bytes audio = 1;
}

message TranscriptionFrame {
  string text = 1;
  string participant_id = 2;
  string timestamp = 3;
}

message Frame {
  oneof frame {
    TextFrame text = 1;
    AudioFrame audio = 2;
    TranscriptionFrame transcription = 3;
  }
}
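
The lint step above now excludes `*_pb2.py`, which suggests this schema is compiled to Python with protoc. A minimal round-trip sketch, assuming `protoc --python_out=. frames.proto` has produced a `frames_pb2` module (the generated module is not part of this commit):

import frames_pb2  # assumed protoc output from frames.proto

# Wrap raw 16-bit PCM bytes in the audio branch of the Frame oneof.
out = frames_pb2.Frame()
out.audio.audio = b"\x00\x00" * 1600  # 100 ms of silence at 16 kHz mono
payload = out.SerializeToString()

# On the receiving side, parse and dispatch on whichever branch is set.
incoming = frames_pb2.Frame()
incoming.ParseFromString(payload)
if incoming.WhichOneof("frame") == "audio":
    pcm = incoming.audio.audio  # bytes, two bytes per sample
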
134 changes: 134 additions & 0 deletions examples/foundational/websocket-server/index.html
@@ -0,0 +1,134 @@
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <script src="//cdn.jsdelivr.net/npm/[email protected]/dist/protobuf.min.js"></script>
    <title>WebSocket Audio Stream</title>
</head>

<body>
    <h1>WebSocket Audio Stream</h1>
    <button id="startAudioBtn">Start Audio</button>
    <button id="stopAudioBtn">Stop Audio</button>
    <script>
        const SAMPLE_RATE = 16000;
        const BUFFER_SIZE = 8192;
        const MIN_AUDIO_SIZE = 6400;

        let audioContext;
        let microphoneStream;
        let scriptProcessor;
        let source;
        let frame;
        let audioChunks = [];
        let isPlaying = false;
        let ws;

        const proto = protobuf.load("frames.proto", (err, root) => {
            if (err) throw err;
            frame = root.lookupType("dailyai_proto.Frame");
        });

        function initWebSocket() {
            ws = new WebSocket('ws://localhost:8765');

            ws.addEventListener('open', () => console.log('WebSocket connection established.'));
            ws.addEventListener('message', handleWebSocketMessage);
            ws.addEventListener('close', (event) => console.log("WebSocket connection closed.", event.code, event.reason));
            ws.addEventListener('error', (event) => console.error('WebSocket error:', event));
        }

        async function handleWebSocketMessage(event) {
            const arrayBuffer = await event.data.arrayBuffer();
            enqueueAudioFromProto(arrayBuffer);
        }

        function enqueueAudioFromProto(arrayBuffer) {
            const parsedFrame = frame.decode(new Uint8Array(arrayBuffer));
            if (!parsedFrame?.audio) return false;

            const frameCount = parsedFrame.audio.audio.length / 2;
            const audioOutBuffer = audioContext.createBuffer(1, frameCount, SAMPLE_RATE);
            const nowBuffering = audioOutBuffer.getChannelData(0);
            const view = new Int16Array(parsedFrame.audio.audio.buffer);

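            // view[i] is already a signed 16-bit sample; the modulo wrap below
            // is a defensive unsigned-to-signed conversion, and dividing by
            // 32768 scales each sample into the [-1, 1) range Web Audio expects.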
            for (let i = 0; i < frameCount; i++) {
                const word = view[i];
                nowBuffering[i] = ((word + 32768) % 65536 - 32768) / 32768.0;
            }

            audioChunks.push(audioOutBuffer);
            if (!isPlaying) playNextChunk();
        }

        function playNextChunk() {
            if (audioChunks.length === 0) {
                isPlaying = false;
                return;
            }

            isPlaying = true;
            const audioOutBuffer = audioChunks.shift();
            const source = audioContext.createBufferSource();
            source.buffer = audioOutBuffer;
            source.connect(audioContext.destination);
            source.onended = playNextChunk;
            source.start();
        }

        function startAudio() {
            if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
                alert('getUserMedia is not supported in your browser.');
                return;
            }

            navigator.mediaDevices.getUserMedia({ audio: true })
                .then((stream) => {
                    microphoneStream = stream;
                    audioContext = new (window.AudioContext || window.webkitAudioContext)();
                    scriptProcessor = audioContext.createScriptProcessor(BUFFER_SIZE, 1, 1);
                    source = audioContext.createMediaStreamSource(stream);
                    source.connect(scriptProcessor);
                    scriptProcessor.connect(audioContext.destination);

                    const audioBuffer = [];
                    const skipRatio = Math.floor(audioContext.sampleRate / (SAMPLE_RATE * 2));

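                    // The handler below takes every skipRatio-th input sample,
                    // converts it to a 16-bit value, and swaps its byte order
                    // before buffering it for the websocket.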
                    scriptProcessor.onaudioprocess = (event) => {
                        const rawLeftChannelData = event.inputBuffer.getChannelData(0);
                        for (let i = 0; i < rawLeftChannelData.length; i += skipRatio) {
                            const normalized = ((rawLeftChannelData[i] * 32768.0) + 32768) % 65536 - 32768;
                            const swappedBytes = ((normalized & 0xff) << 8) | ((normalized >> 8) & 0xff);
                            audioBuffer.push(swappedBytes);
                        }

                        if (audioBuffer.length >= MIN_AUDIO_SIZE) {
                            const audioFrame = frame.create({ audio: { audio: audioBuffer.slice(0, MIN_AUDIO_SIZE) } });
                            const encodedFrame = new Uint8Array(frame.encode(audioFrame).finish());
                            ws.send(encodedFrame);
                            audioBuffer.splice(0, MIN_AUDIO_SIZE);
                        }
                    };

                    initWebSocket();
                })
                .catch((error) => console.error('Error accessing microphone:', error));
        }

        function stopAudio() {
            if (ws) {
                ws.close();
                scriptProcessor.disconnect();
                source.disconnect();
                ws = undefined;
            }
        }

        document.getElementById('startAudioBtn').addEventListener('click', startAudio);
        document.getElementById('stopAudioBtn').addEventListener('click', stopAudio);
    </script>
</body>

</html>
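
For local testing, a minimal loopback server sketch that speaks the same framing. This assumes the `websockets` package added to pyproject.toml below and a protoc-generated `frames_pb2` module; it is not the `WebsocketTransport` this commit introduces:

import asyncio

import websockets  # dependency added to pyproject.toml in this commit

import frames_pb2  # assumed protoc output from frames.proto


async def echo_audio(websocket):
    # Parse each binary message as a Frame and loop audio straight back,
    # so the page above plays whatever the microphone captured.
    async for message in websocket:
        frame = frames_pb2.Frame()
        frame.ParseFromString(message)
        if frame.WhichOneof("frame") == "audio":
            await websocket.send(message)


async def main():
    async with websockets.serve(echo_audio, "localhost", 8765):
        await asyncio.Future()  # run until interrupted


if __name__ == "__main__":
    asyncio.run(main())
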
50 changes: 50 additions & 0 deletions examples/foundational/websocket-server/sample.py
@@ -0,0 +1,50 @@
import asyncio
import aiohttp
import logging
import os
from dailyai.pipeline.frame_processor import FrameProcessor
from dailyai.pipeline.frames import TextFrame, TranscriptionQueueFrame
from dailyai.pipeline.pipeline import Pipeline
from dailyai.services.elevenlabs_ai_service import ElevenLabsTTSService
from dailyai.services.websocket_transport_service import WebsocketTransport
from dailyai.services.whisper_ai_services import WhisperSTTService

logging.basicConfig(format="%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("dailyai")
logger.setLevel(logging.DEBUG)


class WhisperTranscriber(FrameProcessor):
    async def process_frame(self, frame):
        if isinstance(frame, TranscriptionQueueFrame):
            print(f"Transcribed: {frame.text}")
        else:
            yield frame


async def main():
    async with aiohttp.ClientSession() as session:
        transport = WebsocketTransport(
            mic_enabled=True,
            speaker_enabled=True,
        )
        tts = ElevenLabsTTSService(
            aiohttp_session=session,
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id=os.getenv("ELEVENLABS_VOICE_ID"),
        )

        pipeline = Pipeline([
            WhisperSTTService(),
            WhisperTranscriber(),
            tts,
        ])

        @transport.on_connection
        async def queue_frame():
            await pipeline.queue_frames([TextFrame("Hello there!")])

        await transport.run(pipeline)

if __name__ == "__main__":
    asyncio.run(main())
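
To try the example, presumably you run `python sample.py` and open the index.html page above in a browser; the page connects to ws://localhost:8765, so the transport would need to listen there (the `WebsocketTransport` defaults are not shown in the visible portion of this commit).
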
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -35,7 +35,8 @@ dependencies = [
     "torch",
     "torchaudio",
     "pyaudio",
-    "typing-extensions"
+    "typing-extensions",
+    "websockets"
 ]

[project.urls]
25 changes: 25 additions & 0 deletions src/dailyai/pipeline/frames.proto
@@ -0,0 +1,25 @@
syntax = "proto3";

package dailyai_proto;

message TextFrame {
  string text = 1;
}

message AudioFrame {
  bytes audio = 1;
}

message TranscriptionFrame {
  string text = 1;
  string participant_id = 2;
  string timestamp = 3;
}

message Frame {
  oneof frame {
    TextFrame text = 1;
    AudioFrame audio = 2;
    TranscriptionFrame transcription = 3;
  }
}