From 131d2661108b3e46f84aa467f45eb73961421359 Mon Sep 17 00:00:00 2001
From: matt200-ok
Date: Thu, 2 Jan 2025 14:11:34 -0800
Subject: [PATCH] Python interrupt fix (#27)

---
 .../python/{ => cli}/README.md                |  15 +
 .../llm-voice-assistant/python/cli/main.py    | 724 ++++++++++++++++++
 .../python/cli/requirements.txt               |   6 +
 recipes/llm-voice-assistant/python/main.py    | 458 -----------
 .../python/windows_gui/README.md              |  60 ++
 .../{windows_gui.py => windows_gui/main.py}   |   0
 .../python/{ => windows_gui}/requirements.txt |   2 +-
 res/.lint/spell-check/dict.txt                |   1 +
 8 files changed, 807 insertions(+), 459 deletions(-)
 rename recipes/llm-voice-assistant/python/{ => cli}/README.md (84%)
 create mode 100644 recipes/llm-voice-assistant/python/cli/main.py
 create mode 100644 recipes/llm-voice-assistant/python/cli/requirements.txt
 delete mode 100644 recipes/llm-voice-assistant/python/main.py
 create mode 100644 recipes/llm-voice-assistant/python/windows_gui/README.md
 rename recipes/llm-voice-assistant/python/{windows_gui.py => windows_gui/main.py} (100%)
 rename recipes/llm-voice-assistant/python/{ => windows_gui}/requirements.txt (90%)

diff --git a/recipes/llm-voice-assistant/python/README.md b/recipes/llm-voice-assistant/python/cli/README.md
similarity index 84%
rename from recipes/llm-voice-assistant/python/README.md
rename to recipes/llm-voice-assistant/python/cli/README.md
index 3bd2cb1..7c64a06 100644
--- a/recipes/llm-voice-assistant/python/README.md
+++ b/recipes/llm-voice-assistant/python/cli/README.md
@@ -1,3 +1,7 @@
+# Cross-Platform LLM Voice Assistant CLI Demo
+
+A cross-platform voice assistant using Picovoice's Wake Word, STT, TTS and LLM technology with a text-based interface.
+
 ## See It In Action!
 
 [![LLM VA in Action](https://img.youtube.com/vi/06K_YtUr8mc/0.jpg)](https://www.youtube.com/watch?v=06K_YtUr8mc)
@@ -43,6 +47,17 @@ To see all available options, type the following:
 python main.py --help
 ```
 
+## Config File
+
+In addition to command line arguments, a config file can be used to pass arguments to the demo. By default, the demo looks for `config.json` in the same directory as `main.py`, but an alternative path can be passed using the `--config` option. Below is an example config file.
+
+```json
+{
+    "access_key": "${ACCESS_KEY}",
+    "picollm_model_path": "${PICOLLM_MODEL_PATH}"
+}
+```
+
 ## Custom Wake Word
 
 The demo's default wake phrase is `Picovoice`. You can generate your custom (branded) wake word using Picovoice Console by following [Porcupine Wake Word documentation](https://picovoice.ai/docs/porcupine/). Once you have the model trained, simply pass it to the demo
diff --git a/recipes/llm-voice-assistant/python/cli/main.py b/recipes/llm-voice-assistant/python/cli/main.py
new file mode 100644
index 0000000..4be536e
--- /dev/null
+++ b/recipes/llm-voice-assistant/python/cli/main.py
@@ -0,0 +1,724 @@
+import json
+import os
+import signal
+import sys
+import time
+from argparse import ArgumentParser
+from concurrent.futures import ThreadPoolExecutor
+from itertools import chain
+from multiprocessing import Event, Pipe, Process, Queue, active_children
+from multiprocessing.connection import Connection
+from typing import Optional, Sequence
+
+
+import picollm
+import pvcheetah
+import pvorca
+import pvporcupine
+from pvrecorder import PvRecorder
+from pvspeaker import PvSpeaker
+
+
+class Commands:
+    START = 'start'
+    CLOSE = 'close'
+    PROCESS = 'process'
+    SYNTHESIZE = 'synthesize'
+    SPEAK = 'speak'
+    FLUSH = 'flush'
+    INTERRUPT = 'interrupt'
+
+
+class RTFProfiler:
+    def __init__(self, sample_rate: int) -> None:
+        self._sample_rate = sample_rate
+        self._compute_sec = 0.
+        self._audio_sec = 0.
+        self._tick_sec = 0.
+
+    def tick(self) -> None:
+        self._tick_sec = time.perf_counter()
+
+    def tock(self, audio: Optional[Sequence[int]] = None) -> None:
+        self._compute_sec += time.perf_counter() - self._tick_sec
+        self._audio_sec += (len(audio) / self._sample_rate) if audio is not None else 0.
+
+    def rtf(self) -> float:
+        if self._audio_sec > 0:
+            rtf = self._compute_sec / self._audio_sec
+        else:
+            rtf = 0
+        self._compute_sec = 0.
+        self._audio_sec = 0.
+        return rtf
+
+    def reset(self) -> None:
+        self._compute_sec = 0.
+        self._audio_sec = 0.
+        self._tick_sec = 0.
+
+
+class TPSProfiler(object):
+    def __init__(self) -> None:
+        self._num_tokens = 0
+        self._start_sec = 0.
+
+    def tock(self) -> None:
+        if self._start_sec == 0.:
+            self._start_sec = time.perf_counter()
+        else:
+            self._num_tokens += 1
+
+    def tps(self) -> float:
+        tps = self._num_tokens / (time.perf_counter() - self._start_sec)
+        self._num_tokens = 0
+        self._start_sec = 0.
+        return tps
+
+    def reset(self) -> None:
+        self._num_tokens = 0
+        self._start_sec = 0.
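+
+# The CompletionText helper below assembles the streamed LLM output while
+# filtering stop phrases: a completed stop phrase is trimmed from the text,
+# and a trailing fragment that could still grow into a stop phrase (e.g. a
+# lone '<' when '</s>' is one of the stop phrases) is withheld until more
+# text arrives, so partial stop tokens are never forwarded to the synthesizer.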
+
+
+class CompletionText(object):
+    def __init__(self, stop_phrases: list) -> None:
+        self.stop_phrases = stop_phrases
+        self.start: int = 0
+        self.text: str = ''
+        self.new_tokens: str = ''
+
+    def reset(self):
+        self.start: int = 0
+        self.text: str = ''
+        self.new_tokens: str = ''
+
+    def append(self, text: str) -> None:
+        self.text += text
+        end = len(self.text)
+
+        for stop_phrase in self.stop_phrases:
+            if stop_phrase in self.text:
+                contains = self.text.index(stop_phrase)
+                if end > contains:
+                    end = contains
+            for i in range(len(stop_phrase) - 1, 0, -1):
+                if self.text.endswith(stop_phrase[:i]):
+                    ends = len(self.text) - i
+                    if end > ends:
+                        end = ends
+                    break
+
+        start = self.start
+        self.start = end
+        self.new_tokens = self.text[start:end]
+
+    def get_new_tokens(self) -> str:
+        return self.new_tokens
+
+
+class Speaker:
+    def __init__(
+            self,
+            speaker: PvSpeaker,
+            config):
+        self.speaker = speaker
+        self.config = config
+        self.orca_warmup = self.speaker.sample_rate * self.config['orca_warmup_sec']
+        self.started = False
+        self.speaking = False
+        self.flushing = False
+        self.pcmBuffer = []
+        self.executor = ThreadPoolExecutor()
+        self.future = None
+
+    def close(self):
+        self.executor.shutdown()
+
+    def start(self):
+        self.started = True
+
+    def process(self, pcm: Optional[Sequence[int]]):
+        if self.started and pcm is not None:
+            self.pcmBuffer.extend(pcm)
+
+    def flush(self):
+        self.flushing = True
+
+    def interrupt(self):
+        self.started = False
+        if self.speaking:
+            self.speaking = False
+            self.flushing = False
+            self.pcmBuffer.clear()
+            self.speaker.stop()
+
+    def tick(self):
+        def stop():
+            self.speaker.flush()
+            self.speaker.stop()
+        if not self.speaking and len(self.pcmBuffer) > self.orca_warmup:
+            self.speaking = True
+            self.speaker.start()
+        if self.speaking and len(self.pcmBuffer) > 0:
+            written = self.speaker.write(self.pcmBuffer)
+            if written > 0:
+                del self.pcmBuffer[:written]
+        elif self.speaking and self.flushing and len(self.pcmBuffer) == 0:
+            self.started = False
+            self.speaking = False
+            self.flushing = False
+            self.future = self.executor.submit(stop)
+        if self.future and self.future.done():
+            self.future = None
+            ppn_prompt = self.config['ppn_prompt']
+            print(f'$ Say {ppn_prompt} ...', flush=True)
+
+
+class Synthesizer:
+    def __init__(
+            self,
+            speaker: Speaker,
+            orca_connection: Connection,
+            orca_process: Process,
+            config):
+        self.speaker = speaker
+        self.orca_connection = orca_connection
+        self.orca_process = orca_process
+        self.config = config
+
+    def close(self):
+        self.orca_connection.send({'command': Commands.CLOSE})
+        self.orca_process.join()
+
+    def start(self, utterance_end_sec):
+        self.speaker.start()
+        self.orca_connection.send({'command': Commands.START, 'utterance_end_sec': utterance_end_sec})
+
+    def process(self, text: str):
+        self.orca_connection.send({'command': Commands.PROCESS, 'text': text})
+
+    def flush(self):
+        self.orca_connection.send({'command': Commands.FLUSH})
+
+    def interrupt(self):
+        self.orca_connection.send({'command': Commands.INTERRUPT})
+        while self.orca_connection.poll() and self.orca_connection.recv()['command'] != Commands.INTERRUPT:
+            time.sleep(0.01)
+        self.speaker.interrupt()
+
+    def tick(self):
+        while self.orca_connection.poll():
+            message = self.orca_connection.recv()
+            if message['command'] == Commands.SPEAK:
+                self.speaker.process(message['pcm'])
+            elif message['command'] == Commands.FLUSH:
+                if self.config['profile']:
+                    rtf = message['profile']
+                    delay = message['delay']
+                    print(f'[Orca RTF: {round(rtf, 2)}]')
+                    print(f"[Delay: {round(delay, 2)} sec]")
+                self.speaker.flush()
+
+    @staticmethod
+    def create_worker(config):
+        main_connection, process_connection = Pipe()
+        process = Process(target=Synthesizer.worker, args=(process_connection, config))
+        process.start()
+        return main_connection, process
+
+    @staticmethod
+    def worker(connection: Connection, config):
+        def handler(_, __) -> None:
+            pass
+        signal.signal(signal.SIGINT, handler)
+
+        orca = pvorca.create(access_key=config['access_key'])
+        orca_stream = orca.stream_open()
+        connection.send(orca.sample_rate)
+        connection.send({'version': orca.version})
+
+        orca_profiler = RTFProfiler(orca.sample_rate)
+        utterance_end_sec = 0
+        delay_sec = -1
+
+        try:
+            close = False
+            synthesizing = False
+            flushing = False
+            text_queue = Queue()
+            while not close:
+                while connection.poll():
+                    message = connection.recv()
+                    if message['command'] == Commands.CLOSE:
+                        close = True
+                    elif message['command'] == Commands.START:
+                        synthesizing = True
+                        utterance_end_sec = message['utterance_end_sec']
+                    elif message['command'] == Commands.PROCESS:
+                        if synthesizing:
+                            text_queue.put(message['text'])
+                    elif message['command'] == Commands.FLUSH:
+                        flushing = True
+                    elif message['command'] == Commands.INTERRUPT:
+                        synthesizing = False
+                        flushing = False
+                        while not text_queue.empty():
+                            text_queue.get()
+                        orca_stream.flush()
+                        connection.send({'command': Commands.INTERRUPT})
+                        orca_profiler.reset()
+                        utterance_end_sec = 0
+                        delay_sec = -1
+                if not text_queue.empty():
+                    text = text_queue.get()
+                    orca_profiler.tick()
+                    pcm = orca_stream.synthesize(text)
+                    orca_profiler.tock(pcm)
+                    if pcm is not None:
+                        connection.send({'command': Commands.SPEAK, 'pcm': pcm})
+                        if delay_sec == -1:
+                            delay_sec = time.perf_counter() - utterance_end_sec
+                if synthesizing and flushing and text_queue.empty():
+                    synthesizing = False
+                    flushing = False
+                    orca_profiler.tick()
+                    pcm = orca_stream.flush()
+                    orca_profiler.tock(pcm)
+                    connection.send({'command': Commands.SPEAK, 'pcm': pcm})
+                    connection.send({'command': Commands.FLUSH, 'profile': orca_profiler.rtf(), 'delay': delay_sec})
+                    utterance_end_sec = 0
+                    delay_sec = -1
+                elif flushing:
+                    flushing = False
+        finally:
+            orca_stream.close()
+            orca.delete()
+
+
+class Generator:
+    def __init__(
+            self,
+            synthesizer: Synthesizer,
+            pllm_connection: Connection,
+            pllm_process: Process,
+            config):
+        self.synthesizer = synthesizer
+        self.pllm_connection = pllm_connection
+        self.pllm_process = pllm_process
+        self.config = config
+
+    def close(self):
+        self.pllm_connection.send({'command': Commands.CLOSE})
+        self.pllm_process.join()
+
+    def process(self, text: str, utterance_end_sec):
+        ppn_prompt = self.config['ppn_prompt']
+        print(f'LLM (say {ppn_prompt} to interrupt) > ', end='', flush=True)
+
+        self.synthesizer.start(utterance_end_sec)
+        self.pllm_connection.send({'command': Commands.PROCESS, 'text': text})
+
+    def interrupt(self):
+        self.pllm_connection.send({'command': Commands.INTERRUPT})
+        while self.pllm_connection.poll() and self.pllm_connection.recv()['command'] != Commands.INTERRUPT:
+            time.sleep(0.01)
+        print('', flush=True)
+        self.synthesizer.interrupt()
+
+    def tick(self):
+        while self.pllm_connection.poll():
+            message = self.pllm_connection.recv()
+            if message['command'] == Commands.SYNTHESIZE:
+                print(message['text'], end='', flush=True)
+                self.synthesizer.process(message['text'])
+            elif message['command'] == Commands.FLUSH:
+                print('', flush=True)
+                if self.config['profile']:
+                    tps = message['profile']
+                    print(f'[picoLLM TPS: {round(tps, 2)}]')
+                self.synthesizer.flush()
+
+    @staticmethod
+    def create_worker(config):
+        main_connection, process_connection = Pipe()
+        process = Process(target=Generator.worker, args=(process_connection, config))
+        process.start()
+        return main_connection, process
+
+    @staticmethod
+    def worker(connection: Connection, config):
+        def handler(_, __) -> None:
+            pass
+        signal.signal(signal.SIGINT, handler)
+
+        pllm = picollm.create(
+            access_key=config['access_key'],
+            model_path=config['picollm_model_path'],
+            device=config['picollm_device'])
+
+        connection.send({'version': pllm.version, 'model': pllm.model})
+
+        if config['picollm_system_prompt'] is not None:
+            dialog = pllm.get_dialog(system=config['picollm_system_prompt'])
+        else:
+            dialog = pllm.get_dialog()
+        generating = False
+
+        pllm_profiler = TPSProfiler()
+
+        stop_phrases = {
+            '</s>',  # Llama-2, Mistral, and Mixtral
+            '<end_of_turn>',  # Gemma
+            '<|endoftext|>',  # Phi-2
+            '<|eot_id|>',  # Llama-3
+            '<|end|>', '<|user|>', '<|assistant|>',  # Phi-3
+        }
+        completion = CompletionText(stop_phrases)
+
+        def llm_callback(text):
+            pllm_profiler.tock()
+            if generating:
+                completion.append(text)
+                new_tokens = completion.get_new_tokens()
+                if len(new_tokens) > 0:
+                    connection.send({'command': Commands.SYNTHESIZE, 'text': new_tokens})
+
+        def llm_task(text):
+            short_answers_instruction = \
+                "You are a voice assistant and your answers are very short but informative"
+            dialog.add_human_request(
+                f"{short_answers_instruction}. {text}" if config['short_answers'] else text)
+
+            completion.reset()
+            return pllm.generate(
+                prompt=dialog.prompt(),
+                completion_token_limit=config['picollm_completion_token_limit'],
+                stop_phrases=stop_phrases,
+                presence_penalty=config['picollm_presence_penalty'],
+                frequency_penalty=config['picollm_frequency_penalty'],
+                temperature=config['picollm_temperature'],
+                top_p=config['picollm_top_p'],
+                stream_callback=llm_callback)
+
+        try:
+            close = False
+            executor = ThreadPoolExecutor()
+            llm_future = None
+            interrupting = False
+            while not close:
+                while connection.poll():
+                    message = connection.recv()
+                    if message['command'] == Commands.CLOSE:
+                        close = True
+                    elif message['command'] == Commands.PROCESS:
+                        generating = True
+                        text = message['text']
+                        pllm_profiler.reset()
+                        llm_future = executor.submit(llm_task, text)
+                    elif message['command'] == Commands.INTERRUPT:
+                        interrupting = True
+                        generating = False
+                        pllm.interrupt()
+                if llm_future and llm_future.done():
+                    generating = False
+                    llm_result = llm_future.result()
+                    dialog.add_llm_response(llm_result.completion)
+                    if llm_result.endpoint == picollm.PicoLLMEndpoints.INTERRUPTED:
+                        interrupting = False
+                        connection.send({'command': Commands.INTERRUPT})
+                    else:
+                        connection.send({'command': Commands.FLUSH, 'profile': pllm_profiler.tps()})
+                    llm_future = None
+                if not llm_future and interrupting:
+                    interrupting = False
+                    connection.send({'command': Commands.INTERRUPT})
+        finally:
+            # Wait for any in-flight generation to finish before releasing picoLLM.
+            while llm_future and not llm_future.done():
+                time.sleep(0.01)
+            del executor
+            pllm.release()
+
+
+class Listener:
+    def __init__(
+            self,
+            generator: Generator,
+            porcupine: pvporcupine.Porcupine,
+            cheetah: pvcheetah.Cheetah,
+            config):
+        self.generator = generator
+        self.porcupine = porcupine
+        self.cheetah = cheetah
+        self.config = config
+        self.porcupine_profiler = RTFProfiler(porcupine.sample_rate)
+        self.cheetah_profiler = RTFProfiler(cheetah.sample_rate)
+
+        self.sleeping = True
+        self.listening = False
+        self.user_request = ''
+        self.tick_count = 0
+
+    def close(self):
+        pass
+
+    def process(self, pcm: Optional[Sequence[int]]):
+        if self.sleeping:
+            self.porcupine_profiler.tick()
+            wake_word_detected = self.porcupine.process(pcm) == 0
+            self.porcupine_profiler.tock(pcm)
+            if wake_word_detected:
+                self.sleeping = False
+                self.tick_count = 4
+                self.generator.interrupt()
+                if self.config['profile']:
+                    print(f'[Porcupine RTF: {round(self.porcupine_profiler.rtf(), 2)}]')
+                self.porcupine_profiler.reset()
+                self.cheetah_profiler.reset()
+        elif self.listening:
+            self.cheetah_profiler.tick()
+            partial_transcript, endpoint_reached = self.cheetah.process(pcm)
+            self.cheetah_profiler.tock(pcm)
+            if len(partial_transcript) > 0:
+                self.user_request += partial_transcript
+                print(partial_transcript, end='', flush=True)
+            if endpoint_reached:
+                utterance_end_sec = time.perf_counter()
+                self.sleeping = True
+                self.listening = False
+                self.cheetah_profiler.tick()
+                remaining_transcript = self.cheetah.flush()
+                self.cheetah_profiler.tock()
+                if len(remaining_transcript) > 0:
+                    self.user_request += remaining_transcript
+                print(remaining_transcript, flush=True)
+                if self.config['profile']:
+                    print(f'[Cheetah RTF: {round(self.cheetah_profiler.rtf(), 2)}]')
+                self.generator.process(self.user_request, utterance_end_sec)
+                self.user_request = ''
+        elif self.tick_count > 0:
+            self.tick_count -= 1
+        else:
+            self.listening = True
+            print('\n$ Wake word detected, utter your request or question ...', flush=True)
+            print('User > ', end='', flush=True)
+
+
+class Recorder:
+    def __init__(
+            self,
+            listener: Listener,
+            recorder: PvRecorder):
+        self.listener = listener
+        self.recorder = recorder
+        self.recording = False
+
+    def close(self):
+        if self.recording:
+            self.recorder.stop()
+
+    def tick(self):
+        if not self.recording:
+            self.recording = True
+            self.recorder.start()
+        pcm = self.recorder.read()
+        self.listener.process(pcm)
+
+
+def main(config):
+    stop = [False]
+
+    def handler(_, __) -> None:
+        stop[0] = True
+    signal.signal(signal.SIGINT, handler)
+
+    pllm_connection, pllm_process = Generator.create_worker(config)
+    orca_connection, orca_process = Synthesizer.create_worker(config)
+
+    if 'keyword_model_path' not in config:
+        porcupine = pvporcupine.create(
+            access_key=config['access_key'],
+            keywords=['picovoice'],
+            sensitivities=[config['porcupine_sensitivity']])
+        config['ppn_prompt'] = '`Picovoice`'
+    else:
+        porcupine = pvporcupine.create(
+            access_key=config['access_key'],
+            keyword_paths=[config['keyword_model_path']],
+            sensitivities=[config['porcupine_sensitivity']])
+        config['ppn_prompt'] = 'the wake word'
+
+    print(f"→ Porcupine v{porcupine.version}")
+
+    cheetah = pvcheetah.create(
+        access_key=config['access_key'],
+        endpoint_duration_sec=config['cheetah_endpoint_duration_sec'],
+        enable_automatic_punctuation=True)
+
+    print(f"→ Cheetah v{cheetah.version}")
+
+    pv_recorder = PvRecorder(frame_length=porcupine.frame_length)
+    pv_speaker = PvSpeaker(sample_rate=int(orca_connection.recv()), bits_per_sample=16, buffer_size_secs=1)
+
+    pllm_info = pllm_connection.recv()
+    print(f"→ picoLLM v{pllm_info['version']} <{pllm_info['model']}>")
+
+    orca_info = orca_connection.recv()
+    print(f"→ Orca v{orca_info['version']}")
+
+    speaker = Speaker(pv_speaker, config)
+    synthesizer = Synthesizer(speaker, orca_connection, orca_process, config)
+    generator = Generator(synthesizer, pllm_connection, pllm_process, config)
+    listener = Listener(generator, porcupine, cheetah, config)
+    recorder = Recorder(listener, pv_recorder)
+
+    ppn_prompt = config['ppn_prompt']
+    print(f'$ Say {ppn_prompt} ...', flush=True)
+
+    try:
+        while not stop[0]:
+            recorder.tick()
+            generator.tick()
+            synthesizer.tick()
+            speaker.tick()
+    finally:
+        generator.interrupt()
+        generator.tick()
+        synthesizer.tick()
+        speaker.tick()
+        recorder.close()
+        listener.close()
+        generator.close()
+        synthesizer.close()
+        speaker.close()
+
+        for child in active_children():
+            child.terminate()
+
+        porcupine.delete()
+        cheetah.delete()
+        pv_recorder.delete()
+        pv_speaker.delete()
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument(
+        '--config',
+        help='Path to a JSON config file to load the arguments from.')
+    parser.add_argument(
+        '--access_key',
+        help='`AccessKey` obtained from `Picovoice Console` (https://console.picovoice.ai/).')
+    parser.add_argument(
+        '--picollm_model_path',
+        help='Absolute path to the file containing LLM parameters (`.pllm`).')
+    parser.add_argument(
+        '--keyword_model_path',
+        help='Absolute path to the keyword model file (`.ppn`). If not set, `Picovoice` will be the wake phrase.')
+    parser.add_argument(
+        '--cheetah_endpoint_duration_sec',
+        type=float,
+        help="Duration of silence (pause) after the user's utterance to consider it the end of the utterance.")
+    parser.add_argument(
+        '--picollm_device',
+        help="String representation of the device (e.g., CPU or GPU) to use for inference. If set to `best`, picoLLM "
+             "picks the most suitable device. If set to `gpu`, the engine uses the first available GPU device. To "
+             "select a specific GPU device, set this argument to `gpu:${GPU_INDEX}`, where `${GPU_INDEX}` is the index "
+             "of the target GPU. If set to `cpu`, the engine will run on the CPU with the default number of threads. "
+             "To specify the number of threads, set this argument to `cpu:${NUM_THREADS}`, where `${NUM_THREADS}` is "
+             "the desired number of threads.")
+    parser.add_argument(
+        '--picollm_completion_token_limit',
+        type=int,
+        help="Maximum number of tokens in the completion. Set to `None` to impose no limit.")
+    parser.add_argument(
+        '--picollm_presence_penalty',
+        type=float,
+        help="It penalizes logits already appearing in the partial completion if set to a positive value. If set to "
+             "`0.0`, it has no effect.")
+    parser.add_argument(
+        '--picollm_frequency_penalty',
+        type=float,
+        help="If set to a positive floating-point value, it penalizes logits proportional to the frequency of their "
+             "appearance in the partial completion. If set to `0.0`, it has no effect.")
+    parser.add_argument(
+        '--picollm_temperature',
+        type=float,
+        help="Sampling temperature. Temperature is a non-negative floating-point value that controls the randomness of "
+             "the sampler. A higher temperature smooths the sampler's output, increasing the randomness. In "
+             "contrast, a lower temperature creates a narrower distribution and reduces variability. Setting it to "
+             "`0` selects the maximum logit during sampling.")
+    parser.add_argument(
+        '--picollm_top_p',
+        type=float,
+        help="A positive floating-point number within (0, 1]. It restricts the sampler's choices to high-probability "
+             "logits that form the `top_p` portion of the probability mass. Hence, it avoids randomly selecting "
+             "unlikely logits. A value of `1.` enables the sampler to pick any token with non-zero probability, "
+             "turning off the feature.")
+    parser.add_argument(
+        '--picollm_system_prompt',
+        type=str,
+        help="A text prompt to give to the LLM prior to its input to instruct it on how to behave."
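+        # When this flag is omitted, `picollm_system_prompt` defaults to
+        # `None` (see DEFAULT_ARGS below) and the demo calls
+        # `pllm.get_dialog()` without a system prompt.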
+    )
+    parser.add_argument(
+        '--orca_warmup_sec',
+        type=float,
+        help="Duration of the synthesized audio to buffer before streaming it out. A higher value helps slower "
+             "devices (e.g., Raspberry Pi) to keep up with real-time at the cost of increasing the initial delay.")
+    parser.add_argument(
+        '--porcupine_sensitivity',
+        type=float,
+        help="Sensitivity for detecting keywords.")
+    parser.add_argument('--short_answers', action='store_true')
+    parser.add_argument('--profile', action='store_true', help='Show runtime profiling information.')
+    args = parser.parse_args()
+
+    if args.config is not None:
+        config_path = os.path.realpath(args.config)
+    else:
+        config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'config.json')
+
+    if os.path.exists(config_path):
+        with open(config_path, 'r') as fd:
+            config = json.load(fd)
+    elif args.config is not None:
+        parser.error(f'File {config_path} does not exist')
+    else:
+        config = {}
+
+    REQUIRED_ARGS = [
+        'access_key',
+        'picollm_model_path'
+    ]
+    DEFAULT_ARGS = {
+        'access_key': '',
+        'picollm_model_path': '',
+        'cheetah_endpoint_duration_sec': 1,
+        'picollm_device': 'best',
+        'picollm_completion_token_limit': 256,
+        'picollm_presence_penalty': 0,
+        'picollm_frequency_penalty': 0,
+        'picollm_temperature': 0,
+        'picollm_top_p': 1,
+        'picollm_system_prompt': None,
+        'orca_warmup_sec': 0,
+        'porcupine_sensitivity': 0.5,
+        'short_answers': False,
+        'profile': False
+    }
+
+    # `keyword_model_path` is optional and has no default, but it still has to
+    # be copied over from the command line arguments.
+    for key in chain(REQUIRED_ARGS, DEFAULT_ARGS, ['keyword_model_path']):
+        arg = getattr(args, key)
+        if arg is not None:
+            config[key] = arg
+
+    missing = [f'--{arg}' for arg in REQUIRED_ARGS if arg not in config]
+    if len(missing) > 0:
+        parser.error('the following arguments are required: ' + ', '.join(missing))
+
+    for key in DEFAULT_ARGS:
+        if key not in config:
+            config[key] = DEFAULT_ARGS[key]
+
+    main(config)
diff --git a/recipes/llm-voice-assistant/python/cli/requirements.txt b/recipes/llm-voice-assistant/python/cli/requirements.txt
new file mode 100644
index 0000000..5f73eac
--- /dev/null
+++ b/recipes/llm-voice-assistant/python/cli/requirements.txt
@@ -0,0 +1,6 @@
+picollm==1.2.3
+pvcheetah==2.1.0
+pvorca==1.0.0
+pvporcupine==3.0.2
+pvrecorder==1.2.2
+pvspeaker==1.0.3
\ No newline at end of file
diff --git a/recipes/llm-voice-assistant/python/main.py b/recipes/llm-voice-assistant/python/main.py
deleted file mode 100644
index 054669a..0000000
--- a/recipes/llm-voice-assistant/python/main.py
+++ /dev/null
@@ -1,458 +0,0 @@
-import signal
-import concurrent.futures
-import time
-from argparse import ArgumentParser
-from collections import deque
-from itertools import chain
-from multiprocessing import (
-    Pipe,
-    Process,
-)
-from typing import (
-    Optional,
-    Sequence,
-)
-
-import picollm
-import pvcheetah
-import pvorca
-import pvporcupine
-from pvrecorder import PvRecorder
-from pvspeaker import PvSpeaker
-
-
-class RTFProfiler:
-    def __init__(self, sample_rate: int) -> None:
-        self._sample_rate = sample_rate
-        self._compute_sec = 0.
-        self._audio_sec = 0.
-        self._tick_sec = 0.
-
-    def tick(self) -> None:
-        self._tick_sec = time.perf_counter()
-
-    def tock(self, audio: Optional[Sequence[int]] = None) -> None:
-        self._compute_sec += time.perf_counter() - self._tick_sec
-        self._audio_sec += (len(audio) / self._sample_rate) if audio is not None else 0.
-
-    def rtf(self) -> float:
-        rtf = self._compute_sec / self._audio_sec
-        self._compute_sec = 0.
-        self._audio_sec = 0.
-        return rtf
-
-
-class TPSProfiler(object):
-    def __init__(self) -> None:
-        self._num_tokens = 0
-        self._start_sec = 0.
-
-    def tock(self) -> None:
-        if self._start_sec == 0.:
-            self._start_sec = time.perf_counter()
-        else:
-            self._num_tokens += 1
-
-    def tps(self) -> float:
-        tps = self._num_tokens / (time.perf_counter() - self._start_sec)
-        self._num_tokens = 0
-        self._start_sec = 0.
-        return tps
-
-
-class CompletionText(object):
-    def __init__(self, stop_phrases: list) -> None:
-        self.stop_phrases = stop_phrases
-        self.start: int = 0
-        self.text: str = ''
-        self.new_tokens: str = ''
-
-    def append(self, text: str) -> None:
-        self.text += text
-        end = len(self.text)
-
-        for stop_phrase in self.stop_phrases:
-            if stop_phrase in self.text:
-                contains = self.text.index(stop_phrase)
-                if end > contains:
-                    end = contains
-            for i in range(len(stop_phrase) - 1, 0, -1):
-                if self.text.endswith(stop_phrase[:i]):
-                    ends = len(self.text) - i
-                    if end > ends:
-                        end = ends
-                    break
-
-        start = self.start
-        self.start = end
-        self.new_tokens = self.text[start:end]
-
-    def get_new_tokens(self) -> str:
-        return self.new_tokens
-
-
-def orca_worker(access_key: str, connection, warmup_sec: float, stream_frame_sec: int = 0.03) -> None:
-    orca = pvorca.create(access_key=access_key)
-    orca_stream = orca.stream_open()
-
-    texts = list()
-    pcm_deque = deque()
-    warmup = [False]
-    synthesize = False
-    flush = False
-    close = False
-    interrupt = False
-    utterance_end_sec = 0.
-    delay_sec = [-1.]
-
-    speaker = PvSpeaker(sample_rate=orca.sample_rate, bits_per_sample=16, buffer_size_secs=20)
-
-    connection.send({'version': orca.version})
-
-    orca_profiler = RTFProfiler(orca.sample_rate)
-
-    def buffer_pcm(pcm_chunk: Optional[Sequence[int]]) -> None:
-        if pcm_chunk is not None:
-            if delay_sec[0] == -1:
-                delay_sec[0] = time.perf_counter() - utterance_end_sec
-
-            pcm_deque.append(pcm_chunk)
-
-    def play_buffered_pcm() -> None:
-        if warmup[0]:
-            if len(list(chain.from_iterable(pcm_deque))) < int(warmup_sec * orca.sample_rate):
-                return
-            else:
-                warmup[0] = False
-
-        if len(pcm_deque) > 0:
-            pcm_chunk = list(chain.from_iterable(pcm_deque))
-            pcm_deque.clear()
-
-            written = speaker.write(pcm_chunk)
-            if written < len(pcm_chunk):
-                pcm_deque.appendleft(pcm_chunk[written:])
-
-    while True:
-        if synthesize and len(texts) > 0:
-            orca_profiler.tick()
-            pcm = orca_stream.synthesize(texts.pop(0))
-            orca_profiler.tock(pcm)
-            buffer_pcm(pcm)
-            play_buffered_pcm()
-        elif flush:
-            while len(texts) > 0:
-                orca_profiler.tick()
-                pcm = orca_stream.synthesize(texts.pop(0))
-                orca_profiler.tock(pcm)
-                buffer_pcm(pcm)
-                play_buffered_pcm()
-            orca_profiler.tick()
-            pcm = orca_stream.flush()
-            orca_profiler.tock(pcm)
-            buffer_pcm(pcm)
-            play_buffered_pcm()
-            connection.send({'rtf': orca_profiler.rtf(), 'delay': delay_sec[0]})
-            flush = False
-            speaker.flush(list(chain.from_iterable(pcm_deque)))
-            pcm_deque.clear()
-            speaker.stop()
-            delay_sec[0] = -1
-            connection.send({'done': True})
-        elif close:
-            break
-        elif interrupt:
-            orca_profiler.tick()
-            pcm = orca_stream.flush()
-            orca_profiler.tock(pcm)
-            connection.send({'rtf': orca_profiler.rtf(), 'delay': delay_sec[0]})
-            interrupt = False
-            pcm_deque.clear()
-            speaker.stop()
-            delay_sec[0] = -1
-            connection.send({'done': True})
-        else:
-            time.sleep(stream_frame_sec)
-
-        while connection.poll():
-            message = connection.recv()
-            if message['command'] == 'synthesize':
-                texts.append(message['text'])
-                if not speaker.is_started:
-                    speaker.start()
-                    warmup[0] = True
-                utterance_end_sec = message['utterance_end_sec']
-                synthesize = True
-            elif message['command'] == 'flush':
-                synthesize = False
-                flush = True
-            elif message['command'] == 'close':
-                close = True
-            elif message['command'] == 'interrupt':
-                interrupt = True
-
-    speaker.delete()
-    orca_stream.close()
-    orca.delete()
-
-
-def main() -> None:
-    parser = ArgumentParser()
-    parser.add_argument(
-        '--access_key',
-        required=True,
-        help='`AccessKey` obtained from `Picovoice Console` (https://console.picovoice.ai/).')
-    parser.add_argument(
-        '--picollm_model_path',
-        required=True,
-        help='Absolute path to the file containing LLM parameters (`.pllm`).')
-    parser.add_argument(
-        '--keyword-model_path',
-        help='Absolute path to the keyword model file (`.ppn`). If not set, `Picovoice` will be the wake phrase')
-    parser.add_argument(
-        '--cheetah_endpoint_duration_sec',
-        type=float,
-        default=1.,
-        help="Duration of silence (pause) after the user's utterance to consider it the end of the utterance.")
-    parser.add_argument(
-        '--picollm_device',
-        help="String representation of the device (e.g., CPU or GPU) to use for inference. If set to `best`, picoLLM "
-             "picks the most suitable device. If set to `gpu`, the engine uses the first available GPU device. To "
-             "select a specific GPU device, set this argument to `gpu:${GPU_INDEX}`, where `${GPU_INDEX}` is the index "
-             "of the target GPU. If set to `cpu`, the engine will run on the CPU with the default number of threads. "
-             "To specify the number of threads, set this argument to `cpu:${NUM_THREADS}`, where `${NUM_THREADS}` is "
-             "the desired number of threads.")
-    parser.add_argument(
-        '--picollm_completion_token_limit',
-        type=int,
-        default=256,
-        help="Maximum number of tokens in the completion. Set to `None` to impose no limit.")
-    parser.add_argument(
-        '--picollm_presence_penalty',
-        type=float,
-        default=0.,
-        help="It penalizes logits already appearing in the partial completion if set to a positive value. If set to "
-             "`0.0`, it has no effect.")
-    parser.add_argument(
-        '--picollm_frequency_penalty',
-        type=float,
-        default=0.,
-        help="If set to a positive floating-point value, it penalizes logits proportional to the frequency of their "
-             "appearance in the partial completion. If set to `0.0`, it has no effect.")
-    parser.add_argument(
-        '--picollm_temperature',
-        type=float,
-        default=0.,
-        help="Sampling temperature. Temperature is a non-negative floating-point value that controls the randomness of "
-             "the sampler. A higher temperature smoothens the samplers' output, increasing the randomness. In "
-             "contrast, a lower temperature creates a narrower distribution and reduces variability. Setting it to "
-             "`0` selects the maximum logit during sampling.")
-    parser.add_argument(
-        '--picollm_top_p',
-        type=float,
-        default=1.,
-        help="A positive floating-point number within (0, 1]. It restricts the sampler's choices to high-probability "
-             "logits that form the `top_p` portion of the probability mass. Hence, it avoids randomly selecting "
-             "unlikely logits. A value of `1.` enables the sampler to pick any token with non-zero probability, "
-             "turning off the feature.")
-    parser.add_argument(
-        '--orca_warmup_sec',
-        type=float,
-        default=0.,
-        help="Duration of the synthesized audio to buffer before streaming it out. A higher value helps slower "
-             "(e.g., Raspberry Pi) to keep up with real-time at the cost of increasing the initial delay.")
-    parser.add_argument('--profile', action='store_true', help='Show runtime profiling information.')
-    parser.add_argument('--short_answers', action='store_true')
-    args = parser.parse_args()
-
-    access_key = args.access_key
-    picollm_model_path = args.picollm_model_path
-    keyword_model_path = args.keyword_model_path
-    cheetah_endpoint_duration_sec = args.cheetah_endpoint_duration_sec
-    picollm_device = args.picollm_device
-    picollm_completion_token_limit = args.picollm_completion_token_limit
-    picollm_presence_penalty = args.picollm_presence_penalty
-    picollm_frequency_penalty = args.picollm_frequency_penalty
-    picollm_temperature = args.picollm_temperature
-    picollm_top_p = args.picollm_top_p
-    orca_warmup_sec = args.orca_warmup_sec
-    profile = args.profile
-    short_answers = args.short_answers
-
-    if keyword_model_path is None:
-        porcupine = pvporcupine.create(access_key=access_key, keywords=['picovoice'])
-    else:
-        porcupine = pvporcupine.create(access_key=access_key, keyword_paths=[keyword_model_path])
-    print(f"→ Porcupine v{porcupine.version}")
-
-    cheetah = pvcheetah.create(
-        access_key=access_key,
-        endpoint_duration_sec=cheetah_endpoint_duration_sec,
-        enable_automatic_punctuation=True)
-    print(f"→ Cheetah v{cheetah.version}")
-
-    pllm = picollm.create(access_key=access_key, model_path=picollm_model_path, device=picollm_device)
-    dialog = pllm.get_dialog()
-    print(f"→ picoLLM v{pllm.version} <{pllm.model}>")
-
-    main_connection, orca_process_connection = Pipe()
-    orca_process = Process(target=orca_worker, args=(access_key, orca_process_connection, orca_warmup_sec))
-    orca_process.start()
-    while not main_connection.poll():
-        time.sleep(0.01)
-    print(f"→ Orca v{main_connection.recv()['version']}")
-
-    mic = PvRecorder(frame_length=porcupine.frame_length)
-    mic.start()
-
-    print(f"\n$ Say {'`Picovoice`' if keyword_model_path is None else 'the wake word'} ...")
-
-    stop = [False]
-
-    def handler(_, __) -> None:
-        stop[0] = True
-
-    signal.signal(signal.SIGINT, handler)
-
-    def llm_task(dialog, user_request, utterance_end_sec, main_connection):
-        short_answers_instruction = \
-            "You are a voice assistant and your answers are very short but informative"
-        dialog.add_human_request(
-            f"{short_answers_instruction}. {user_request}" if short_answers else user_request)
-
-        picollm_profiler = TPSProfiler()
-
-        stop_phrases = {
-            '</s>',  # Llama-2, Mistral, and Mixtral
-            '<end_of_turn>',  # Gemma
-            '<|endoftext|>',  # Phi-2
-            '<|eot_id|>',  # Llama-3
-            '<|end|>', '<|user|>', '<|assistant|>',  # Phi-3
-        }
-
-        completion = CompletionText(stop_phrases)
-
-        def llm_callback(text: str) -> None:
-            picollm_profiler.tock()
-            completion.append(text)
-            new_tokens = completion.get_new_tokens()
-            if len(new_tokens) > 0:
-                main_connection.send({
-                    'command': 'synthesize',
-                    'text': new_tokens.replace('\n', ' . '),
-                    'utterance_end_sec': utterance_end_sec})
-                print(f'{new_tokens}', end='', flush=True)
-
-        print(
-            f"\nLLM (say {'`Picovoice`' if keyword_model_path is None else 'the wake word'} to interrupt) > ",
-            end='',
-            flush=True)
-        res = pllm.generate(
-            prompt=dialog.prompt(),
-            completion_token_limit=picollm_completion_token_limit,
-            stop_phrases=stop_phrases,
-            presence_penalty=picollm_presence_penalty,
-            frequency_penalty=picollm_frequency_penalty,
-            temperature=picollm_temperature,
-            top_p=picollm_top_p,
-            stream_callback=llm_callback)
-
-        if res.endpoint == picollm.PicoLLMEndpoints.INTERRUPTED:
-            main_connection.send({'command': 'interrupt'})
-        else:
-            main_connection.send({'command': 'flush'})
-
-        print('\n')
-        dialog.add_llm_response(res.completion)
-
-        if profile:
-            print(f"[picoLLM TPS: {picollm_profiler.tps():.2f}]")
-
-        while not main_connection.poll():
-            time.sleep(0.01)
-        message = main_connection.recv()
-        if profile:
-            print(f"[Orca RTF: {message['rtf']:.2f}]")
-            print(f"[Delay: {message['delay']:.2f} sec]")
-        while not main_connection.poll():
-            time.sleep(0.01)
-        assert main_connection.recv()['done']
-
-        return res
-
-    wake_word_detected = False
-    user_request = ''
-    endpoint_reached = False
-
-    porcupine_profiler = RTFProfiler(porcupine.sample_rate)
-    cheetah_profiler = RTFProfiler(cheetah.sample_rate)
-
-    try:
-        while True:
-            if stop[0]:
-                break
-            elif not wake_word_detected:
-                pcm = mic.read()
-                porcupine_profiler.tick()
-                wake_word_detected = porcupine.process(pcm) == 0
-                porcupine_profiler.tock(pcm)
-                if wake_word_detected:
-                    if profile:
-                        print(f"[Porcupine RTF: {porcupine_profiler.rtf():.3f}]")
-                    print("$ Wake word detected, utter your request or question ...\n")
-                    print("User > ", end='', flush=True)
-            elif not endpoint_reached:
-                pcm = mic.read()
-                cheetah_profiler.tick()
-                partial_transcript, endpoint_reached = cheetah.process(pcm)
-                cheetah_profiler.tock(pcm)
-                print(partial_transcript, end='', flush=True)
-                user_request += partial_transcript
-                if endpoint_reached:
-                    utterance_end_sec = time.perf_counter()
-                    cheetah_profiler.tick()
-                    remaining_transcript = cheetah.flush()
-                    cheetah_profiler.tock()
-                    user_request += remaining_transcript
-                    print(remaining_transcript, end='\n')
-                    if profile:
-                        print(f"[Cheetah RTF: {cheetah_profiler.rtf():.3f}]")
-                    with concurrent.futures.ThreadPoolExecutor() as executor:
-                        llm_future = executor.submit(
-                            llm_task,
-                            dialog,
-                            user_request,
-                            utterance_end_sec,
-                            main_connection)
-
-                        while not llm_future.done():
-                            pcm = mic.read()
-                            porcupine_profiler.tick()
-                            wake_word_detected = porcupine.process(pcm) == 0
-                            porcupine_profiler.tock(pcm)
-                            if wake_word_detected:
-                                pllm.interrupt()
-                                break
-
-                        llm_result = llm_future.result()
-                        if llm_result.endpoint == picollm.PicoLLMEndpoints.INTERRUPTED:
-                            wake_word_detected = True
-                            print("$ Wake word detected, utter your request or question ...\n")
-                            print("User > ", end='', flush=True)
-                        else:
-                            wake_word_detected = False
-                            print(f"$ Say {'`Picovoice`' if keyword_model_path is None else 'the wake word'} ...")
-                        user_request = ''
-                        endpoint_reached = False
-
-    finally:
-        main_connection.send({'command': 'close'})
-        mic.delete()
-        pllm.release()
-        cheetah.delete()
-        porcupine.delete()
-        orca_process.join()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/recipes/llm-voice-assistant/python/windows_gui/README.md b/recipes/llm-voice-assistant/python/windows_gui/README.md
new file mode 100644
index 0000000..d01bc46
--- /dev/null
+++ b/recipes/llm-voice-assistant/python/windows_gui/README.md
@@ -0,0 +1,60 @@
+# Windows LLM Voice Assistant GUI Demo
+
+A voice assistant for Windows using Picovoice's Wake Word, STT, TTS and LLM technology with a console-based graphical interface.
+
+## Compatibility
+
+- Python 3.8+
+- Runs on Windows (x86_64).
+
+## AccessKey
+
+AccessKey is your authentication and authorization token for deploying Picovoice SDKs, including picoLLM. Anyone who is
+using Picovoice needs to have a valid AccessKey. You must keep your AccessKey secret. You would need internet
+connectivity to validate your AccessKey with Picovoice license servers even though the LLM inference is running 100%
+offline and completely free for open-weight models. Everyone who signs up for
+[Picovoice Console](https://console.picovoice.ai/) receives a unique AccessKey.
+
+## picoLLM Model
+
+picoLLM Inference Engine supports many open-weight models. The models are available on
+[Picovoice Console](https://console.picovoice.ai/).
+
+## Usage
+
+Install the required packages:
+
+```console
+pip install -r requirements.txt
+```
+
+Run the demo:
+
+```console
+python3 main.py --access_key ${ACCESS_KEY} --picollm_model_path ${PICOLLM_MODEL_PATH}
+```
+
+Replace `${ACCESS_KEY}` with your AccessKey obtained from Picovoice Console and `${PICOLLM_MODEL_PATH}` with the path to the
+model downloaded from Picovoice Console.
+
+To see all available options, type the following:
+
+```console
+python main.py --help
+```
+
+## Config File
+
+In addition to command line arguments, a config file can be used to pass arguments to the demo. By default, the demo looks for `config.json` in the same directory as `main.py`, but an alternative path can be passed using the `--config` option. Below is an example config file.
+
+```json
+{
+    "access_key": "${ACCESS_KEY}",
+    "picollm_model_path": "${PICOLLM_MODEL_PATH}"
+}
+```
+
+## Custom Wake Word
+
+The demo's default wake phrase is `Jarvis`. You can generate your custom (branded) wake word using Picovoice Console by following [Porcupine Wake Word documentation](https://picovoice.ai/docs/porcupine/). Once you have the model trained, simply pass it to the demo
+application using `--keyword_model_path` argument.
\ No newline at end of file
diff --git a/recipes/llm-voice-assistant/python/windows_gui.py b/recipes/llm-voice-assistant/python/windows_gui/main.py
similarity index 100%
rename from recipes/llm-voice-assistant/python/windows_gui.py
rename to recipes/llm-voice-assistant/python/windows_gui/main.py
diff --git a/recipes/llm-voice-assistant/python/requirements.txt b/recipes/llm-voice-assistant/python/windows_gui/requirements.txt
similarity index 90%
rename from recipes/llm-voice-assistant/python/requirements.txt
rename to recipes/llm-voice-assistant/python/windows_gui/requirements.txt
index 3c73f69..2de6e31 100644
--- a/recipes/llm-voice-assistant/python/requirements.txt
+++ b/recipes/llm-voice-assistant/python/windows_gui/requirements.txt
@@ -1,5 +1,5 @@
 picollm==1.2.3
-pvcheetah==2.0.1
+pvcheetah==2.1.0
 pvorca==1.0.0
 pvporcupine==3.0.2
 pvrecorder==1.2.2
diff --git a/res/.lint/spell-check/dict.txt b/res/.lint/spell-check/dict.txt
index 8bab457..0017b8b 100644
--- a/res/.lint/spell-check/dict.txt
+++ b/res/.lint/spell-check/dict.txt
@@ -25,6 +25,7 @@ picovoice
 pids
 playstate
 pllm
+popleft
 psutil
 pvcheetah
 pvorca