-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaudio_recorder.py
259 lines (217 loc) · 10 KB
/
audio_recorder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# -*- coding: utf-8 -*-
"""
Filename: audio_recorder.py
Author: Iliya Vereshchagin
Copyright (c) 2023. All rights reserved.
Created: 25.08.2023
Last Modified: 17.10.2023
Description:
This file contains implementation for Audio Recorder
"""
import os
import struct
import tempfile
import time
import uuid
import wave
import math
import pyaudio
import sounddevice as sd
import soundfile as sf
from pydub import AudioSegment
def record_and_convert_audio(duration: int = 5, frequency_sample: int = 16000):
"""
Records audio for a specified duration and converts it to MP3 format.
This function records audio for a given duration (in seconds) with a specified frequency sample.
The audio is then saved as a temporary .wav file, converted to .mp3 format, and the .wav file is deleted.
The function returns the path to the .mp3 file.
:param duration: (int) The duration of the audio recording in seconds. Default is 5 seconds.
:param frequency_sample: (int) The frequency sample rate of the audio recording. Default is 16000 Hz.
:return: The path to the saved .mp3 file.
"""
print(f"Listening beginning for {duration}s...")
recording = sd.rec(int(duration * frequency_sample), samplerate=frequency_sample, channels=1)
sd.wait() # Wait until recording is finished
print("Recording complete!")
temp_dir = tempfile.gettempdir()
wave_file = f"{temp_dir}/{str(uuid.uuid4())}.wav"
sf.write(wave_file, recording, frequency_sample)
print(f"Temp audiofile saved: {wave_file}")
audio = AudioSegment.from_wav(wave_file)
os.remove(wave_file)
mp3_file = f"{temp_dir}/{str(uuid.uuid4())}.mp3"
audio.export(mp3_file, format="mp3")
print(f"Audio converted to MP3 and stored into {mp3_file}")
return mp3_file
# pylint: disable=too-many-instance-attributes
class AudioRecorder:
"""
The AudioRecorder class is for managing an instance of the audio recording and conversion process.
Parameters:
pyaudio_obj (PyAudio): Instance of PyAudio. Default is pyaudio.PyAudio().
threshold (int): The RMS threshold for starting the recording. Default is 15.
channels (int): The number of channels in the audio stream. Default is 1.
chunk (int): The number of frames per buffer. Default is 1024.
f_format (int): The format of the audio stream. Default is pyaudio.paInt16.
rate (int): The sample rate of the audio stream. Default is 16000 Hz.
sample_width (int): The sample width (in bytes) of the audio stream. Default is 2.
timeout_length (int): The length of the timeout for the recording (in seconds). Default is 2 seconds.
temp_dir (str): The directory for storing the temporary .wav and .mp3 files. Default is the system's temporary dir.
normalize (float): The normalization factor for the audio samples. Default is 1.0 / 32768.0.
pa_input (bool): Specifies whether the stream is an input stream. Default is True.
pa_output (bool): Specifies whether the stream is an output stream. Default is True.
"""
def __init__(
self,
pyaudio_obj=pyaudio.PyAudio(),
threshold=15,
channels=1,
chunk=1024,
f_format=pyaudio.paInt16,
rate=16000,
sample_width=2,
timeout_length=2,
temp_dir=tempfile.gettempdir(),
normalize=(1.0 / 32768.0),
pa_input=True,
pa_output=True,
):
"""
General init.
This method initializes an instance of the AudioRecorder class with the specified parameters.
The default values are used for any parameters that are not provided.
:param pyaudio_obj: Instance of PyAudio. Default is pyaudio.PyAudio().
:param threshold: The RMS threshold for starting the recording. Default is 15.
:param channels: The number of channels in the audio stream. Default is 1.
:param chunk: The number of frames per buffer. Default is 1024.
:param f_format: The format of the audio stream. Default is pyaudio.paInt16.
:param rate: The sample rate of the audio stream. Default is 16000 Hz.
:param sample_width: The sample width (in bytes) of the audio stream. Default is 2.
:param timeout_length: The length of the timeout for the recording (in seconds). Default is 2 seconds.
:param temp_dir: The directory for storing the temporary .wav and .mp3 files. Default is temp dir.
:param normalize: The normalization factor for the audio samples. Default is 1.0 / 32768.0.
:param pa_input: Specifies whether the stream is an input stream. Default is True.
:param pa_output: Specifies whether the stream is an output stream. Default is True.
"""
self.___pyaudio = pyaudio_obj
self.___threshold = threshold
self.___channels = channels
self.___chunk = chunk
self.___format = f_format
self.___rate = rate
self.___sample_width = sample_width
self.___timeout_length = timeout_length
self.___temp_dir = temp_dir
self.___normalize = normalize
self.___input = pa_input
self.___output = pa_output
self.stream = self.init_stream(
f_format=self.___format,
channels=self.___channels,
rate=self.___rate,
pa_input=self.___input,
pa_output=self.___output,
frames_per_buffer=self.___chunk,
)
def init_stream(self, f_format, channels, rate, pa_input, pa_output, frames_per_buffer):
"""
Initializes an audio stream with the specified parameters.
This function uses PyAudio to open an audio stream with the given format, channels, rate, input, output,
and frames per buffer.
:param f_format: The format of the audio stream.
:param channels: The number of channels in the audio stream.
:param rate: The sample rate of the audio stream.
:param pa_input: Specifies whether the stream is an input stream. A true value indicates an input stream.
:param pa_output: Specifies whether the stream is an output stream. A true value indicates an output stream.
:param frames_per_buffer: The number of frames per buffer.
:type frames_per_buffer: int
:return: The initialized audio stream.
"""
return self.___pyaudio.open(
format=f_format,
channels=channels,
rate=rate,
input=pa_input,
output=pa_output,
frames_per_buffer=frames_per_buffer,
)
def record(self):
"""
Starts recording audio when noise is detected.
This function starts recording audio when noise above a certain threshold is detected.
The recording continues for a specified timeout length.
The recorded audio is then saved as a .wav file, converted to .mp3 format, and the .wav file is deleted.
The function returns the path to the .mp3 file.
:return: The path to the saved .mp3 file.
"""
print("Noise detected, recording beginning")
rec = []
current = time.time()
end = time.time() + self.___timeout_length
while current <= end:
data = self.stream.read(self.___chunk)
if self.rms(data) >= self.___threshold:
end = time.time() + self.___timeout_length
current = time.time()
rec.append(data)
filename = self.write(b"".join(rec))
return self.convert_to_mp3(filename)
def write(self, recording):
"""
Saves the recorded audio to a .wav file.
This function saves the recorded audio to a .wav file with a unique filename.
The .wav file is saved in the specified temporary directory.
:param recording: The recorded audio data.
:return: The path to the saved .wav file.
"""
filename = os.path.join(self.___temp_dir, f"{str(uuid.uuid4())}.wav")
wave_form = wave.open(filename, "wb")
wave_form.setnchannels(self.___channels)
wave_form.setsampwidth(self.___pyaudio.get_sample_size(self.___format))
wave_form.setframerate(self.___rate)
wave_form.writeframes(recording)
wave_form.close()
return filename
def convert_to_mp3(self, filename):
"""
Converts a .wav file to .mp3 format.
This function converts a .wav file to .mp3 format. The .wav file is deleted after the conversion.
The .mp3 file is saved with a unique filename in the specified temporary directory.
:param filename: The path to the .wav file to be converted.
:return: The path to the saved .mp3 file.
"""
audio = AudioSegment.from_wav(filename)
mp3_file_path = os.path.join(self.___temp_dir, f"{str(uuid.uuid4())}.mp3")
audio.export(mp3_file_path, format="mp3")
os.remove(filename)
return mp3_file_path
def listen(self):
"""
Starts listening for audio.
This function continuously listens for audio and starts recording when the
RMS value of the audio exceeds a certain threshold.
:return: The path to the saved .mp3 file if recording was triggered.
"""
print("Listening beginning...")
while True:
mic_input = self.stream.read(self.___chunk)
rms_val = self.rms(mic_input)
if rms_val > self.___threshold:
return self.record()
def rms(self, frame):
"""
Calculates the Root Mean Square (RMS) value of the audio frame.
This function calculates the RMS value of the audio frame, which is a measure of the power in the audio signal.
:param frame: The audio frame for which to calculate the RMS value.
:return: The RMS value of the audio frame.
"""
count = len(frame) / self.___sample_width
# pylint: disable=C0209
f_format = "%dh" % count
shorts = struct.unpack(f_format, frame)
sum_squares = 0.0
for sample in shorts:
normal_sample = sample * self.___normalize
sum_squares += normal_sample * normal_sample
rms = math.pow(sum_squares / count, 0.5)
return rms * 1000