# desktopV3.py
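"""Desktop assistant prototype: capture a webcam frame, describe it via a
remote image-captioning endpoint, speak the description aloud with
ElevenLabs, record a spoken question from the user, transcribe it with
Deepgram, and ask an OpenAI chat model for navigation guidance for a
blind user.
"""
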
import os
import io
import asyncio
import threading
import wave

import aiohttp
import cv2
import openai
import pyaudio
import requests
from pydub import AudioSegment
from pydub.playback import play
from dotenv import load_dotenv
from elevenlabs import set_api_key
from deepgram import Deepgram

# Load API keys from the environment instead of hard-coding them in source.
# Expects OPENAI_API_KEY, ELEVEN_API_KEY, and DEEPGRAM_API_KEY in .env
# (the latter two variable names are assumed here).
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ELEVEN_API_KEY = os.getenv("ELEVEN_API_KEY")
DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")

openai.api_key = OPENAI_API_KEY
set_api_key(ELEVEN_API_KEY)


def play_audio(audio_bytes):
    """Decode MP3 bytes and play them on the default output device."""
    try:
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format="mp3")
        play(audio)
    except Exception as e:
        print(f"Failed to play audio: {e}")


# Transcribe audio with Deepgram's prerecorded API. The mimetype parameter
# lets callers label WAV recordings correctly; the default matches the MP3
# bytes produced by get_tts below.
async def transcribe_audio(audio_bytes, mimetype='audio/mp3'):
    deepgram = Deepgram(DEEPGRAM_API_KEY)
    source = {
        'buffer': audio_bytes,
        'mimetype': mimetype,
    }
    async with aiohttp.ClientSession() as session:
        # Await the coroutine directly; no create_task wrapper is needed
        response = await deepgram.transcription.prerecorded(
            source,
            {
                'smart_format': True,
                'model': 'nova',
            },
            session=session,  # reuse this HTTP session for the request
        )
    return response["results"]["channels"][0]["alternatives"][0]["transcript"]
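
# Example usage (sketch; mp3_bytes and wav_bytes are placeholders):
#     transcript = asyncio.run(transcribe_audio(mp3_bytes))
#     transcript = asyncio.run(transcribe_audio(wav_bytes, mimetype="audio/wav"))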


# Ask the chat model for guidance, grounded in the current scene context
def call_openai_api(prompt: str, scene_info=("", "")):
    system_setting = f"""Assume you are helping a blind person as much as possible. Assume they want to
move forward, so suggest which direction to avoid or whether to continue. You can compare the objects
in front of them. An example could be 'There is a street in front of you with no crossing signal yet;
proceed with caution.' Another example is 'There is a small animal in front of you; be mindful below
your waist.' Emphasize things in the scene the person can engage with or needs to avoid. For example,
should that person walk forward? Provide a clear, concise, and contextually relevant response, and
emphasize the most actionable aspects of the image. Please talk like a helpful assistant with great
voice intonation. Keep your reply to three sentences at most. Here is some context on the current
visual scene:
objects and their counts: {scene_info[0]}
scene description: {scene_info[1]}"""
    messages = [
        {"role": "system", "content": system_setting},
        {"role": "user", "content": prompt},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo", messages=messages, temperature=0.7
    )
    # Chat completions return the reply under message.content, not .text
    message = response["choices"][0]["message"]["content"].strip()
    return message
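
# Example usage (sketch; caption_text is a placeholder for the endpoint's caption):
#     advice = call_openai_api("Can I keep walking forward?", ["", caption_text])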

# Earlier completion-based variant, kept for reference:
# def get_openai_response(prompt):
#     try:
#         response = openai.Completion.create(
#             model="gpt-3.5-turbo-instruct",
#             prompt=prompt,
#             max_tokens=50,
#             temperature=0.7,
#         )
#         message = response['choices'][0]['text'].strip()
#         return message
#     except Exception as e:
#         return str(e)


# Request text-to-speech from the ElevenLabs API and return raw MP3 bytes
def get_tts(text):
    tts_url = "https://api.elevenlabs.io/v1/text-to-speech/D38z5RcWu1voky8WS1ja"
    tts_headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": ELEVEN_API_KEY,
    }
    tts_data = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5,
        },
    }
    try:
        tts_response = requests.post(tts_url, json=tts_data, headers=tts_headers)
        tts_response.raise_for_status()
        return tts_response.content
    except requests.RequestException as e:
        print(f"Failed to communicate with ElevenLabs API: {e}")
        return None
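
# Example usage (sketch):
#     mp3 = get_tts("There is a crosswalk ahead.")
#     if mp3:
#         play_audio(mp3)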


# Shared state between the recording thread and the main loop
is_recording = False
audio_frames = []


# Record microphone audio on a background thread until is_recording is cleared
def record_audio():
    global is_recording
    global audio_frames
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=44100,
                    input=True, frames_per_buffer=1024)
    is_recording = True
    audio_frames = []
    while is_recording:
        data = stream.read(1024)
        audio_frames.append(data)
    stream.stop_stream()
    stream.close()
    p.terminate()
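
# Example usage (sketch): start record_audio on a thread, then clear the flag
# from the main thread to stop it:
#     t = threading.Thread(target=record_audio)
#     t.start()
#     # ... wait for a stop signal ...
#     is_recording = False
#     t.join()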


# Main loop: show the webcam feed, capture a frame on demand, and run the
# describe, speak, listen, and answer pipeline
cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret:
        print("Failed to read from the webcam.")
        break
    cv2.imshow('Press space to capture, and Esc to exit', frame)
    key = cv2.waitKey(1)
    if key == 27:  # Esc key
        break
    elif key == 32:  # Spacebar
        _, buffer = cv2.imencode('.jpg', frame)
        byte_img = buffer.tobytes()
        image_files = {'file': ('image.jpg', byte_img, 'image/jpeg')}
        try:
            # Send the frame to the image-captioning endpoint
            image_response = requests.post('https://3942-64-247-206-81.ngrok-free.app/uploadfile', files=image_files)
            # yolo_response = requests.post('http://th7mydolfy.loclx.io/uploadfile', files=image_files)
            image_response.raise_for_status()
            response_json = image_response.json()
            print("IMAGE RESPONSE JSON", response_json)
            # yolo_response_json = yolo_response.json()
            # print("YOLO RESPONSE JSON", yolo_response_json)
            description = response_json.get('text', "No description provided by the API")
            print(f"Description from image-to-text model: {description}")

            # Speak the description via ElevenLabs
            audio_bytes = get_tts(description)
            if audio_bytes:
                play_audio(audio_bytes)
                # Transcribe the spoken description via Deepgram
                transcription = asyncio.run(transcribe_audio(audio_bytes))
                if transcription:
                    print("============")
                    print(f"Environment description: {transcription}")
                    print("============")
                    print(">>>")
                    # Get a guidance response from the OpenAI API
                    openai_response = call_openai_api(transcription, ["", response_json["text"]])
                    # openai_response = call_openai_api(transcription, [yolo_response_json["text"], response_json["text"]])
                    # openai_response = get_openai_response(transcription)
                    # print(f"OpenAI GPT response: {openai_response}")

            # Record the user's spoken question on a background thread
            record_thread = threading.Thread(target=record_audio)
            record_thread.start()
            print("Press the spacebar again to stop recording...")
            while True:
                if cv2.waitKey(1) == 32:  # Spacebar
                    is_recording = False
                    record_thread.join()  # ensure the recording thread finishes
                    break

            # Save the recorded audio to a .wav file
            wf = wave.open('recorded_audio.wav', 'wb')
            wf.setnchannels(1)
            wf.setsampwidth(pyaudio.PyAudio().get_sample_size(pyaudio.paInt16))
            wf.setframerate(44100)
            wf.writeframes(b''.join(audio_frames))
            wf.close()

            # Transcribe the recording, labeling it as WAV rather than MP3
            with open('recorded_audio.wav', 'rb') as audio_file:
                transcription = asyncio.run(transcribe_audio(audio_file.read(), mimetype='audio/wav'))
            if transcription:
                print("============")
                print(f"Transcription of user: {transcription}")
                print("============")
                # Additional handling/logic as needed...
        except requests.RequestException as e:
            print("Failed to communicate with the description API.")
            print(e)

cap.release()
cv2.destroyAllWindows()