updated to include print statements

Picovoice · Dec 30, 2024 · 84bf62c · 84bf62c
1 parent 4d9dc0e
commit 84bf62c
Show file tree

Hide file tree

Showing 6 changed files with 144 additions and 19 deletions.
diff --git a/recipes/llm-voice-assistant/python/README.md → .../llm-voice-assistant/python/cli/README.md b/recipes/llm-voice-assistant/python/README.md → .../llm-voice-assistant/python/cli/README.md
diff --git a/recipes/llm-voice-assistant/python/main.py → ...es/llm-voice-assistant/python/cli/main.py b/recipes/llm-voice-assistant/python/main.py → ...es/llm-voice-assistant/python/cli/main.py
@@ -52,6 +52,11 @@ def rtf(self) -> float:
         self._audio_sec = 0.
         return rtf
 
+    def reset(self) -> None:
+        self._compute_sec = 0.
+        self._audio_sec = 0.
+        self._tick_sec = 0.
+
 
 class TPSProfiler(object):
     def __init__(self) -> None:
@@ -70,6 +75,10 @@ def tps(self) -> float:
         self._start_sec = 0.
         return tps
 
+    def reset(self) -> None:
+        self._num_tokens = 0
+        self._start_sec = 0.
+
 
 class CompletionText(object):
     def __init__(self, stop_phrases: list) -> None:
@@ -111,9 +120,10 @@ class Speaker:
     def __init__(
             self,
             speaker: PvSpeaker,
-            orca_warmup_sec: int):
+            config):
         self.speaker = speaker
-        self.orca_warmup = self.speaker.sample_rate * orca_warmup_sec
+        self.config = config
+        self.orca_warmup = self.speaker.sample_rate * self.config['orca_warmup_sec']
         self.started = False
         self.speaking = False
         self.flushing = False
@@ -160,7 +170,7 @@ def stop():
             self.future = self.executor.submit(stop)
         if self.future and self.future.done():
             self.future = None
-            ppn_prompt = config['ppn_prompt']
+            ppn_prompt = self.config['ppn_prompt']
             print(f'$ Say {ppn_prompt} ...', flush=True)
 
 
@@ -169,18 +179,20 @@ def __init__(
             self,
             speaker: Speaker,
             orca_connection: Connection,
-            orca_process: Process):
+            orca_process: Process,
+            config):
         self.speaker = speaker
         self.orca_connection = orca_connection
         self.orca_process = orca_process
+        self.config = config
 
     def close(self):
         self.orca_connection.send({'command': Commands.CLOSE})
         self.orca_process.join()
 
-    def start(self):
+    def start(self, utterance_end_sec):
         self.speaker.start()
-        self.orca_connection.send({'command': Commands.START})
+        self.orca_connection.send({'command': Commands.START, 'utterance_end_sec': utterance_end_sec})
 
     def process(self, text: str):
         self.orca_connection.send({'command': Commands.PROCESS, 'text': text})
@@ -200,6 +212,11 @@ def tick(self):
             if message['command'] == Commands.SPEAK:
                 self.speaker.process(message['pcm'])
             elif message['command'] == Commands.FLUSH:
+                if self.config['profile']:
+                    rtf = message['profile']
+                    delay = message['delay']
+                    print(f'[Orca RTF: {round(rtf, 2)}]')
+                    print(f"[Delay: {round(delay, 2)} sec]")
                 self.speaker.flush()
 
     @staticmethod
@@ -218,6 +235,11 @@ def handler(_, __) -> None:
         orca = pvorca.create(access_key=config['access_key'])
         orca_stream = orca.stream_open()
         connection.send(orca.sample_rate)
+        connection.send({'version': orca.version})
+
+        orca_profiler = RTFProfiler(orca.sample_rate)
+        utterance_end_sec = 0
+        delay_sec = -1
 
         try:
             close = False
@@ -231,6 +253,7 @@ def handler(_, __) -> None:
                         close = True
                     elif message['command'] == Commands.START:
                         synthesizing = True
+                        utterance_end_sec = message['utterance_end_sec']
                     elif message['command'] == Commands.PROCESS:
                         if synthesizing:
                             text_queue.put(message['text'])
@@ -243,17 +266,28 @@ def handler(_, __) -> None:
                             text_queue.get()
                         orca_stream.flush()
                         connection.send({'command': Commands.INTERRUPT})
+                        orca_profiler.reset()
+                        utterance_end_sec = 0
+                        delay_sec = -1
                 if not text_queue.empty():
                     text = text_queue.get()
+                    orca_profiler.tick()
                     pcm = orca_stream.synthesize(text)
+                    orca_profiler.tock(pcm)
                     if pcm is not None:
                         connection.send({'command': Commands.SPEAK, 'pcm': pcm})
+                        if delay_sec == -1:
+                            delay_sec = time.perf_counter() - utterance_end_sec
                 if synthesizing and flushing and text_queue.empty():
                     synthesizing = False
                     flushing = False
+                    orca_profiler.tick()
                     pcm = orca_stream.flush()
+                    orca_profiler.tock(pcm)
                     connection.send({'command': Commands.SPEAK, 'pcm': pcm})
-                    connection.send({'command': Commands.FLUSH})
+                    connection.send({'command': Commands.FLUSH, 'profile': orca_profiler.rtf(), 'delay': delay_sec})
+                    utterance_end_sec = 0
+                    delay_sec = -1
                 elif flushing:
                     flushing = False
         finally:
@@ -266,20 +300,22 @@ def __init__(
             self,
             synthesizer: Synthesizer,
             pllm_connection: Connection,
-            pllm_process: Process):
+            pllm_process: Process,
+            config):
         self.synthesizer = synthesizer
         self.pllm_connection = pllm_connection
         self.pllm_process = pllm_process
+        self.config = config
 
     def close(self):
         self.pllm_connection.send({'command': Commands.CLOSE})
         self.pllm_process.join()
 
-    def process(self, text: str):
-        ppn_prompt = config['ppn_prompt']
+    def process(self, text: str, utterance_end_sec):
+        ppn_prompt = self.config['ppn_prompt']
         print(f'LLM (say ${ppn_prompt} to interrupt) > ', end='', flush=True)
 
-        self.synthesizer.start()
+        self.synthesizer.start(utterance_end_sec)
         self.pllm_connection.send({'command': Commands.PROCESS, 'text': text})
 
     def interrupt(self):
@@ -297,6 +333,9 @@ def tick(self):
                 self.synthesizer.process(message['text'])
             elif message['command'] == Commands.FLUSH:
                 print('', flush=True)
+                if self.config['profile']:
+                    tps = message['profile']
+                    print(f'[picoLLM TPS: {round(tps, 2)}]')
                 self.synthesizer.flush()
 
     @staticmethod
@@ -316,12 +355,17 @@ def handler(_, __) -> None:
             access_key=config['access_key'],
             model_path=config['picollm_model_path'],
             device=config['picollm_device'])
+
+        connection.send({'version': pllm.version, 'model': pllm.model})
+
         if config['picollm_system_prompt'] is not None:
             dialog = pllm.get_dialog(system=config['picollm_system_prompt'])
         else:
             dialog = pllm.get_dialog()
         generating = False
 
+        pllm_profiler = TPSProfiler()
+
         stop_phrases = {
             '</s>',  # Llama-2, Mistral, and Mixtral
             '<end_of_turn>',  # Gemma
@@ -332,6 +376,7 @@ def handler(_, __) -> None:
         completion = CompletionText(stop_phrases)
 
         def llm_callback(text):
+            pllm_profiler.tock()
             if generating:
                 completion.append(text)
                 new_tokens = completion.get_new_tokens()
@@ -368,6 +413,7 @@ def llm_task(text):
                     elif message['command'] == Commands.PROCESS:
                         generating = True
                         text = message['text']
+                        pllm_profiler.reset()
                         llm_future = executor.submit(llm_task, text)
                     elif message['command'] == Commands.INTERRUPT:
                         interrupting = True
@@ -381,7 +427,7 @@ def llm_task(text):
                         interrupting = False
                         connection.send({'command': Commands.INTERRUPT})
                     else:
-                        connection.send({'command': Commands.FLUSH})
+                        connection.send({'command': Commands.FLUSH, 'profile': pllm_profiler.tps()})
                     llm_future = None
                 if not llm_future and interrupting:
                     interrupting = False
@@ -398,10 +444,14 @@ def __init__(
             self,
             generator: Generator,
             porcupine: pvporcupine.Porcupine,
-            cheetah: pvcheetah.Cheetah):
+            cheetah: pvcheetah.Cheetah,
+            config):
         self.generator = generator
         self.porcupine = porcupine
         self.cheetah = cheetah
+        self.config = config
+        self.porcupine_profiler = RTFProfiler(porcupine.sample_rate)
+        self.cheetah_profiler = RTFProfiler(cheetah.sample_rate)
 
         self.sleeping = True
         self.listening = False
@@ -413,23 +463,37 @@ def close(self):
 
     def process(self, pcm: Optional[Sequence[int]]):
         if self.sleeping:
-            if self.porcupine.process(pcm) == 0:
+            self.porcupine_profiler.tick()
+            wake_word_detected = self.porcupine.process(pcm) == 0
+            self.porcupine_profiler.tock(pcm)
+            if wake_word_detected:
                 self.sleeping = False
                 self.tick_count = 4
                 self.generator.interrupt()
+                if self.config['profile']:
+                    print(f'[Porcupine RTF: {round(self.porcupine_profiler.rtf(), 2)}]')
+                self.porcupine_profiler.reset()
+                self.cheetah_profiler.reset()
         elif self.listening:
+            self.cheetah_profiler.tick()
             partial_transcript, endpoint_reached = self.cheetah.process(pcm)
+            self.cheetah_profiler.tock(pcm)
             if len(partial_transcript) > 0:
                 self.user_request += partial_transcript
                 print(partial_transcript, end='', flush=True)
             if endpoint_reached:
+                utterance_end_sec = time.perf_counter()
                 self.sleeping = True
                 self.listening = False
+                self.cheetah_profiler.tick()
                 remaining_transcript = self.cheetah.flush()
+                self.cheetah_profiler.tock()
                 if len(remaining_transcript) > 0:
                     self.user_request += remaining_transcript
                 print(remaining_transcript, flush=True)
-                self.generator.process(self.user_request)
+                if self.config['profile']:
+                    print(f'[Cheetah RTF: {round(self.cheetah_profiler.rtf(), 2)}]')
+                self.generator.process(self.user_request, utterance_end_sec)
                 self.user_request = ''
         elif self.tick_count > 0:
             self.tick_count -= 1
@@ -482,18 +546,28 @@ def handler(_, __) -> None:
             sensitivities=[config['porcupine_sensitivity']])
         config['ppn_prompt'] = 'the wake word'
 
+    print(f"→ Porcupine v{porcupine.version}")
+
     cheetah = pvcheetah.create(
         access_key=config['access_key'],
         endpoint_duration_sec=config['cheetah_endpoint_duration_sec'],
         enable_automatic_punctuation=True)
+
+    print(f"→ Cheetah v{cheetah.version}")
 
     pv_recorder = PvRecorder(frame_length=porcupine.frame_length)
     pv_speaker = PvSpeaker(sample_rate=int(orca_connection.recv()), bits_per_sample=16, buffer_size_secs=1)
 
-    speaker = Speaker(pv_speaker, config['orca_warmup_sec'])
-    synthesizer = Synthesizer(speaker, orca_connection, orca_process)
-    generator = Generator(synthesizer, pllm_connection, pllm_process)
-    listener = Listener(generator, porcupine, cheetah)
+    pllm_info = pllm_connection.recv()
+    print(f"→ picoLLM v{pllm_info['version']} <{pllm_info['model']}>")
+
+    orca_info = orca_connection.recv()
+    print(f"→ Orca v{orca_info['version']}")
+
+    speaker = Speaker(pv_speaker, config)
+    synthesizer = Synthesizer(speaker, orca_connection, orca_process, config)
+    generator = Generator(synthesizer, pllm_connection, pllm_process, config)
+    listener = Listener(generator, porcupine, cheetah, config)
     recorder = Recorder(listener, pv_recorder)
 
     ppn_prompt = config['ppn_prompt']

diff --git a/recipes/llm-voice-assistant/python/cli/requirements.txt b/recipes/llm-voice-assistant/python/cli/requirements.txt
@@ -0,0 +1,6 @@
+picollm==1.2.3
+pvcheetah==2.0.1
+pvorca==1.0.0
+pvporcupine==3.0.2
+pvrecorder==1.2.2
+pvspeaker==1.0.3
diff --git a/recipes/llm-voice-assistant/python/windows_gui/README.md b/recipes/llm-voice-assistant/python/windows_gui/README.md
@@ -0,0 +1,45 @@
+## Compatibility
+
+- Python 3.8+
+- Runs on Windows (x86_64).
+
+## AccessKey
+
+AccessKey is your authentication and authorization token for deploying Picovoice SDKs, including picoLLM. Anyone who is
+using Picovoice needs to have a valid AccessKey. You must keep your AccessKey secret. You would need internet
+connectivity to validate your AccessKey with Picovoice license servers even though the LLM inference is running 100%
+offline and completely free for open-weight models. Everyone who signs up for
+[Picovoice Console](https://console.picovoice.ai/) receives a unique AccessKey.
+
+## picoLLM Model
+
+picoLLM Inference Engine supports many open-weight models. The models are on
+[Picovoice Console](https://console.picovoice.ai/).
+
+## Usage
+
+Install the required packages:
+
+```console
+pip install -r requirements.txt
+```
+
+Run the demo:
+
+```console
+python3 main.py --access_key ${ACCESS_KEY} --picollm_model_path ${PICOLLM_MODEL_PATH} 
+```
+
+Replace `${ACCESS_KEY}` with yours obtained from Picovoice Console and `${PICOLLM_MODEL_PATH}` with the path to the 
+model downloaded from Picovoice Console.
+
+To see all available options, type the following:
+
+```console
+python main.py --help
+```
+
+## Custom Wake Word
+
+The demo's default wake phrase is `Jarvis`. You can generate your custom (branded) wake word using Picovoice  Console by following [Porcupine Wake Word documentation (https://picovoice.ai/docs/porcupine/). Once you have the model trained, simply pass it to the demo
+application using `--keyword_model_path` argument.
diff --git a/...llm-voice-assistant/python/windows_gui.py → ...oice-assistant/python/windows_gui/main.py b/...llm-voice-assistant/python/windows_gui.py → ...oice-assistant/python/windows_gui/main.py
diff --git a/...m-voice-assistant/python/requirements.txt → ...stant/python/windows_gui/requirements.txt b/...m-voice-assistant/python/requirements.txt → ...stant/python/windows_gui/requirements.txt