-
Notifications
You must be signed in to change notification settings - Fork 0
/
voice_stt_mode.py
509 lines (444 loc) · 17.8 KB
/
voice_stt_mode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
"""
Airtime and Messaging Service using Africa's Talking API
This script provides a Gradio-based web interface for sending airtime and messages
using the Africa's Talking API. It also tracks the carbon emissions of the operations
using the CodeCarbon library.
The voice command interface allows users to send airtime, send messages, and search for
news articles using voice commands. Audio is first transcribed to text, because the
model only accepts text input and has limited capabilities.
Usage:
1. Set the environment variables `AT_USERNAME` and `AT_API_KEY` with your Africa's
Talking credentials, `GROQ_API_KEY` for speech-to-text, and `LANGTRACE_API_KEY` for tracing.
2. Run the script: `python voice_stt_mode.py`
3. Access the Gradio web interface to send airtime or messages or search for news articles.
Example:
Send airtime to a phone number:
- `Send airtime to +254712345678 with an amount of 10 in currency KES`
Send a message to a phone number:
- `Send a message to +254712345678 with the message 'Hello there',
using the username 'username'`
Search for news about a topic:
- `Latest news on climate change`
"""
# ------------------------------------------------------------------------------------
# Import Statements
# ------------------------------------------------------------------------------------
# Standard Library Imports
import os
import io
import json
import logging
from logging.handlers import RotatingFileHandler
import asyncio
from importlib.metadata import version, PackageNotFoundError
# Third-Party Library Imports
import gradio as gr
from langtrace_python_sdk import langtrace, with_langtrace_root_span
import groq
import numpy as np
import soundfile as sf
import ollama
# Local Module Imports
from utils.function_call import send_airtime, send_message, search_news
# ------------------------------------------------------------------------------------
# Logging Configuration
# ------------------------------------------------------------------------------------
# External clients: Langtrace tracing and the Groq client used for transcription.
langtrace.init(api_key=os.getenv("LANGTRACE_API_KEY"))
groq_client = groq.Client(api_key=os.getenv("GROQ_API_KEY"))

# Module logger: accepts DEBUG and above, and does not propagate to the root
# logger, so records are emitted only through the two handlers attached below.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.propagate = False

# One shared format for both destinations.
formatter = logging.Formatter("%(asctime)s:%(name)s:%(levelname)s:%(message)s")

# Rotating log file (5 MiB per file, 5 backups) and console output.
file_handler = RotatingFileHandler(
    "voice_stt_mode.log", maxBytes=5 * 1024 * 1024, backupCount=5
)
stream_handler = logging.StreamHandler()

# The file keeps INFO and above; the console shows everything from DEBUG up.
for _handler, _level in (
    (file_handler, logging.INFO),
    (stream_handler, logging.DEBUG),
):
    _handler.setLevel(_level)
    _handler.setFormatter(formatter)
    logger.addHandler(_handler)
# ------------------------------------------------------------------------------------
# Log the Start of the Script
# ------------------------------------------------------------------------------------
# Announce start-up.
logger.info(
    "Starting the voice&text function calling script "
    "to send airtime and messages using the Africa's Talking API"
)
logger.info("Review GROQ Speech-to-Text if they log the audio data or not")
logger.info("Let's review the packages and their versions")

# ------------------------------------------------------------------------------------
# Log Versions of the Libraries
# ------------------------------------------------------------------------------------
# Distribution names whose installed versions are written to the log.
pkgs = [
    "africastalking",
    "ollama",
    "duckduckgo_search",
    "langtrace-python-sdk",
    "gradio",
    "groq",
    "soundfile",
    "numpy",
]

for pkg in pkgs:
    try:
        pkg_version = version(pkg)
    except PackageNotFoundError:
        # A missing package is reported but does not stop start-up.
        logger.error("Package %s is not installed.", pkg)
    except Exception as e:
        logger.error("Failed to retrieve version for %s: %s", pkg, str(e))
    else:
        logger.info("%s version: %s", pkg, pkg_version)
# ------------------------------------------------------------------------------------
# Define Tools Schema
# ------------------------------------------------------------------------------------
def _param(kind, description, **extra):
    """Describe one tool parameter: its JSON type, a description, and any extras."""
    return {"type": kind, "description": description, **extra}


def _function_tool(name, description, properties, required):
    """Wrap a function description in the ``{"type": "function", ...}`` envelope."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool schema handed to the model: one entry per callable the model may request.
tools = [
    _function_tool(
        "send_airtime",
        "Send airtime to a phone number using the Africa's Talking API",
        {
            "phone_number": _param(
                "string", "The phone number in international format"
            ),
            "currency_code": _param("string", "The 3-letter ISO currency code"),
            "amount": _param("string", "The amount of airtime to send"),
        },
        ["phone_number", "currency_code", "amount"],
    ),
    _function_tool(
        "send_message",
        "Send a message to a phone number using the Africa's Talking API",
        {
            "phone_number": _param(
                "string", "The phone number in international format"
            ),
            "message": _param("string", "The message to send"),
            "username": _param(
                "string", "The username for the Africa's Talking account"
            ),
        },
        ["phone_number", "message", "username"],
    ),
    _function_tool(
        "search_news",
        "Search for news articles using DuckDuckGo News API",
        {
            "query": _param("string", "The search query for news articles"),
            "max_results": _param(
                "integer",
                "The maximum number of news articles to retrieve",
                default=5,
            ),
        },
        ["query"],
    ),
]
# ------------------------------------------------------------------------------------
# Function Definitions
# ------------------------------------------------------------------------------------
@with_langtrace_root_span()
async def process_user_message(message: str, history: list) -> str:
    """
    Handle the conversation with the model asynchronously.

    The message is sent to the Ollama model together with the ``tools``
    schema. If the model replies with tool calls, the first one is executed
    and its result is returned; otherwise the model's text is returned.

    Parameters
    ----------
    message : str
        The user's input message.
    history : list
        The conversation history up to that point. Accepted for interface
        compatibility; it is not forwarded to the model.

    Returns
    -------
    str
        The model's response, the function execution result, or a
        human-readable error message. Errors are reported through the return
        value rather than raised.
    """
    logger.info("Processing user message: %s", message)
    client = ollama.AsyncClient()
    messages = [
        {
            "role": "user",
            "content": message,
        }
    ]

    try:
        response = await client.chat(
            model="qwen2.5:0.5b",
            messages=messages,
            tools=tools,
        )
    except Exception:  # pylint: disable=broad-exception-caught
        logger.exception("Failed to get response from Ollama client.")
        return "An unexpected error occurred while communicating with the assistant."

    model_message = response.get("message", {})
    model_content = model_message.get("content", "")
    model_role = model_message.get("role", "assistant")
    logger.info("Model response: %s", model_content)
    messages.append(
        {
            "role": model_role,
            "content": model_content,
        }
    )
    logger.debug("Model response details: %s", response.get("message"))

    tool_calls = model_message.get("tool_calls")
    if not tool_calls:
        return model_content

    # Tool-specific exception types, if the tool callables expose an
    # ``ErrorType`` attribute. Referencing the attribute directly in the
    # ``except`` clause raises AttributeError when it is absent, which would
    # mask the real tool error; an empty tuple simply matches nothing.
    # NOTE(review): whether utils.function_call defines ``ErrorType`` is not
    # visible from this file - confirm.
    handled_errors = tuple(
        error_type
        for error_type in (
            getattr(func, "ErrorType", None)
            for func in (send_airtime, send_message, search_news)
        )
        if isinstance(error_type, type) and issubclass(error_type, BaseException)
    )

    # Every path through the loop body returns, so only the first tool call
    # requested by the model is executed.
    for tool in tool_calls:
        tool_name = tool["function"]["name"]
        arguments = tool["function"]["arguments"]
        logger.info("Tool call detected: %s", tool_name)
        try:
            if tool_name == "send_airtime":
                logger.info("Calling send_airtime with arguments: %s", arguments)
                function_response = send_airtime(
                    arguments["phone_number"],
                    arguments["currency_code"],
                    arguments["amount"],
                )
            elif tool_name == "send_message":
                logger.info("Calling send_message with arguments: %s", arguments)
                function_response = send_message(
                    arguments["phone_number"],
                    arguments["message"],
                    arguments["username"],
                )
            elif tool_name == "search_news":
                logger.info("Calling search_news with arguments: %s", arguments)
                function_response = search_news(arguments["query"])
            else:
                # Nothing was executed, so do not report success.
                logger.warning("Unknown function called: %s", tool_name)
                return f"Error executing `{tool_name}`: Unknown function"

            logger.debug("Function response: %s", function_response)
            messages.append(
                {
                    "role": "tool",
                    "content": function_response,
                }
            )
            return f"Function `{tool_name}` executed successfully. Response:\n{function_response}"
        except handled_errors as e:
            logger.error("Handled error in tool `%s`: %s", tool_name, e)
            return f"Error executing `{tool_name}`: {str(e)}"
        except Exception as e:  # pylint: disable=broad-exception-caught
            logger.exception("Unexpected error in tool `%s`: %s", tool_name, e)
            return f"An unexpected error occurred while executing `{tool_name}`."

    # Defensive fallback; not reached when ``tool_calls`` is a non-empty list.
    return model_content
async def process_audio_and_llm(audio):
    """
    Transcribe an audio recording with Groq and pass the text to the LLM.

    Parameters
    ----------
    audio : tuple
        The audio recording tuple with the sample rate and audio data.
        The data is used as a NumPy array (``ndim``, ``mean``, ``astype``).

    Returns
    -------
    str
        ``"Transcription: ...\\nLLM Response: ..."`` on success, or a string
        starting with ``"Error: "`` when the recording is missing or empty or
        when any processing step fails. Exceptions are logged and reported
        through the return value rather than raised.
    """
    if audio is None:
        return "Error: No audio recorded. Please try again."

    # Stage 1: turn the raw samples into an in-memory WAV file.
    try:
        sr, y = audio
        if len(y) == 0:
            return "Error: Empty audio recording. Please speak and try again."

        # Convert to mono if stereo
        if y.ndim > 1:
            y = y.mean(axis=1)

        # Normalize audio to a peak of 1.0. An all-zero (silent) recording has
        # a peak of 0; dividing by it would fill the signal with NaN, so the
        # samples are left untouched in that case.
        y = y.astype(np.float32)
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak

        # Write audio to buffer
        buffer = io.BytesIO()
        sf.write(buffer, y, sr, format="wav")
        buffer.seek(0)
    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.exception("Error in audio processing: %s", e)
        return f"Error: {str(e)}"

    # Stage 2: speech-to-text, then hand the text to the LLM.
    try:
        transcription = groq_client.audio.transcriptions.create(
            model="distil-whisper-large-v3-en",
            file=("audio.wav", buffer),
            response_format="text",
        )
        response = await process_user_message(transcription, [])
        return f"Transcription: {transcription}\nLLM Response: {response}"
    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.exception("Error during transcription or LLM processing: %s", e)
        return f"Error: {str(e)}"
def gradio_interface(message: str, history: list) -> str:
    """
    Synchronous entry point for the chat UI.

    Runs the asynchronous ``process_user_message`` to completion and returns
    its result, converting any exception into a generic error message.

    Parameters
    ----------
    message : str
        The user's input message.
    history : list of list of str
        The conversation history up to that point.

    Returns
    -------
    str
        The model's response or the function execution result, or a generic
        error message if processing raised.
    """
    try:
        reply = asyncio.run(process_user_message(message, history))
    except Exception as err:  # pylint: disable=broad-exception-caught
        logger.exception("Error in gradio_interface: %s", err)
        return "An unexpected error occurred while processing your message."
    return reply
# ------------------------------------------------------------------------------------
# Create Gradio Interface with Both Text and Audio Inputs
# ------------------------------------------------------------------------------------
with gr.Blocks(title="🎙️ Voice Command Communication Interface 🌍") as demo:
    gr.Markdown("# Voice Command & Text Communication Interface")

    # Add tabs for voice and text input
    with gr.Tab("Voice Input"):
        # How to use
        gr.Markdown(
            """
This interface allows you to send airtime, messages, and search
for news articles using voice commands.
You can also type your commands in the text input tab.
Here are some examples of commands you can use:
- Send airtime to +254712345678 with an amount of 10 in currency KES 📞
- Send a message to +254712345678 with the message 'Hello there' with
the username 'add your username'💬
- Search news for 'latest technology trends' 📰
* Please speak clearly and concisely for accurate transcription. In English only for now.
* You can also edit the transcription before processing. We all make mistakes! 🤗
"""
        )

        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="numpy",
            label="Speak your command",
            streaming=False,
        )
        # The transcription lands here first so the user can correct it
        # before it is sent to the model.
        transcription_preview = gr.Textbox(
            label="Preview Transcription (Edit if needed)",
            interactive=True,
            placeholder="Transcription will appear here first...",
        )
        audio_output = gr.Textbox(
            label="Final Result", placeholder="LLM response will appear here..."
        )

        with gr.Row():
            transcribe_button = gr.Button("Transcribe")
            process_button = gr.Button("Process Edited Text", variant="primary")

        def show_transcription(audio):
            """
            Transcribe the audio recording and show the preview.

            Parameters
            ----------
            audio : tuple
                The audio recording tuple with the sample rate and audio data.

            Returns
            -------
            str
                The transcription of the audio recording, or a string
                starting with ``"Error: "`` if the recording is missing or
                empty or if transcription fails.
            """
            try:
                if audio is None:
                    return "Error: No audio recorded. Please try again."

                sr, y = audio
                if len(y) == 0:
                    return "Error: Empty audio recording. Please speak and try again."

                # Convert to mono if stereo
                if y.ndim > 1:
                    y = y.mean(axis=1)

                # Normalize audio to a peak of 1.0. A silent recording has a
                # peak of 0; dividing by it would turn every sample into NaN,
                # so normalization is skipped in that case.
                y = y.astype(np.float32)
                peak = np.max(np.abs(y))
                if peak > 0:
                    y /= peak

                # Write audio to buffer
                buffer = io.BytesIO()
                sf.write(buffer, y, sr, format="wav")
                buffer.seek(0)

                # Get transcription from Groq
                transcription = groq_client.audio.transcriptions.create(
                    model="distil-whisper-large-v3-en",
                    file=("audio.wav", buffer),
                    response_format="text",
                )
                logger.info("Audio transcribed successfully: %s", transcription)
                return transcription
            except Exception as e:  # pylint: disable=broad-exception-caught
                logger.exception("Error during transcription: %s", e)
                return f"Error: {str(e)}"

        # Wire up the components
        transcribe_button.click(
            fn=show_transcription, inputs=audio_input, outputs=transcription_preview
        )

        # Process the edited text: the (possibly corrected) preview is sent to
        # the model with an empty history.
        process_button.click(
            fn=lambda x: asyncio.run(process_user_message(x, [])),
            inputs=transcription_preview,
            outputs=audio_output,
        )

    # Text input tab
    with gr.Tab("Text Input"):
        chat_interface = gr.ChatInterface(
            fn=gradio_interface,
            description=(
                "Type your commands or use voice input above:\n"
                "- Send airtime to +254712345678 with an amount of 10 in currency KES 📞\n"
                "- Send a message to +254712345678 with the message 'Hello there' 💬\n"
                "- Search news for 'latest technology trends' 📰"
            ),
            type="messages",
        )
def _launch_demo() -> None:
    """Start the Gradio app on all interfaces, port 7860, logging the outcome."""
    launch_options = {
        "inbrowser": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    try:
        logger.info("Launching Gradio interface...")
        # NOTE(review): when run as a script, launch() presumably blocks until
        # the server stops, so the success message below may only appear at
        # shutdown - confirm against the Gradio documentation.
        demo.launch(**launch_options)
        logger.info("Gradio interface launched successfully.")
    except Exception as launch_error:  # pylint: disable=broad-exception-caught
        logger.exception("Failed to launch Gradio interface: %s", launch_error)
    logger.info("Script execution completed")


if __name__ == "__main__":
    _launch_demo()