fix AVS audio streaming

renekliment · renekliment · commit 5ce8b0e5e4ce · 2017-03-25T09:58:01.000+01:00
* Audio is now recorded separately, in real time, into a queue; requests read their data from this queue instead of controlling the recording themselves.
diff --git a/src/alexapi/capture.py b/src/alexapi/capture.py
@@ -2,6 +2,7 @@
 import time
 import threading
 import os
+import queue
 
 import webrtcvad
 
@@ -57,6 +58,9 @@ class Capture(object):
 	_handle_chunk_size = None
 
 	_device_info = None
+	_stream = None
+	_callback_data = None
+	_queue = None
 	_vad = None
 	_config = None
 	_tmp_path = None
@@ -69,6 +73,7 @@ def __init__(self, config, tmp_path):
 		self._tmp_path = tmp_path
 
 		self._pa = pyaudio.PyAudio()
+		self._queue = queue.Queue()
 		self._device_info = DeviceInfo()
 
 		self._recording_lock_inverted = threading.Event()
@@ -116,100 +121,113 @@ def handle_read(self):
 	def handle_release(self):
 		self._handle.close()
 
-	def silence_listener(self, throwaway_frames=None, force_record=None):
+	def _callback(self, in_data, frame_count, time_info, status):  # pylint: disable=unused-argument
 
-		self._recording_lock_inverted.clear()
+		debug = logging.getLogger('alexapi').getEffectiveLevel() == logging.DEBUG
 
-		throwaway_frames = throwaway_frames or self.VAD_THROWAWAY_FRAMES
+		if not in_data:
+			self._queue.put(False)
+			return None, pyaudio.paAbort
 
-		logger.debug("Setting up recording")
+		do_VAD = True
+		if self._callback_data['force_record'] and not self._callback_data['force_record'][1]:
+			do_VAD = False
 
-		stream = self._pa.open(
-			input=True,
-			input_device_index=self._device_info.get_device_index(self._config['sound']['input_device']),
-			format=pyaudio.paInt16,
-			channels=1,
-			rate=self.VAD_SAMPLERATE,
-			frames_per_buffer=self.VAD_PERIOD
-		)
+		# do not count first 10 frames when doing VAD
+		if do_VAD and (self._callback_data['frames'] < self._callback_data['throwaway_frames']):
+			self._callback_data['frames'] += 1
 
-		debug = logging.getLogger('alexapi').getEffectiveLevel() == logging.DEBUG
+		# now do VAD
+		elif (self._callback_data['force_record'] and self._callback_data['force_record'][0]()) \
+				or (do_VAD and (self._callback_data['thresholdSilenceMet'] is False)
+					and ((time.time() - self._callback_data['start']) < self.MAX_RECORDING_LENGTH)):
 
-		logger.debug("Start recording")
+			if do_VAD:
 
-		if self._state_callback:
-			self._state_callback()
+				if int(len(in_data) / 2) == self.VAD_PERIOD:
+					isSpeech = self._vad.is_speech(in_data, self.VAD_SAMPLERATE)
 
-		def _listen():
-			start = time.time()
+					if not isSpeech:
+						self._callback_data['silenceRun'] += 1
+					else:
+						self._callback_data['silenceRun'] = 0
+						self._callback_data['numSilenceRuns'] += 1
 
-			do_VAD = True
-			if force_record and not force_record[1]:
-				do_VAD = False
+				# only count silence runs after the first one
+				# (allow user to speak for total of max recording length if they haven't said anything yet)
+				if (self._callback_data['numSilenceRuns'] != 0) \
+						and ((self._callback_data['silenceRun'] * self.VAD_FRAME_MS) > self.VAD_SILENCE_TIMEOUT):
+					self._callback_data['thresholdSilenceMet'] = True
 
-			# Buffer as long as we haven't heard enough silence or the total size is within max size
-			thresholdSilenceMet = False
-			frames = 0
-			numSilenceRuns = 0
-			silenceRun = 0
+		else:
+			self._queue.put(False)
+			return None, pyaudio.paComplete
 
-			if debug:
-				audio = b''
+		self._queue.put(in_data)
+		if debug:
+			self._callback_data['audio'] += in_data
 
-			if do_VAD:
-				# do not count first 10 frames when doing VAD
-				while frames < throwaway_frames:
+		return None, pyaudio.paContinue
 
-					if self._interrupt:
-						break
+	def silence_listener(self, throwaway_frames=None, force_record=None):
 
-					data = stream.read(self.VAD_PERIOD, exception_on_overflow=self._pa_exception_on_overflow)
-					frames += 1
-					if data:
-						yield data
+		logger.debug("Recording: Setting up")
 
-						if debug:
-							audio += data
+		self._recording_lock_inverted.clear()
 
-			# now do VAD
-			while (force_record and force_record[0]()) \
-					or (do_VAD and (thresholdSilenceMet is False) and ((time.time() - start) < self.MAX_RECORDING_LENGTH)):
+		debug = logging.getLogger('alexapi').getEffectiveLevel() == logging.DEBUG
 
-				if self._interrupt:
-					break
+		if self._state_callback:
+			self._state_callback()
 
-				data = stream.read(self.VAD_PERIOD, exception_on_overflow=self._pa_exception_on_overflow)
-				if data:
-					yield data
+		self._queue.queue.clear()
 
-					if debug:
-						audio += data
+		self._callback_data = {
+			'start': time.time(),
+			'thresholdSilenceMet': False,  # Buffer as long as we haven't heard enough silence or the total size is within max size
+			'frames': 0,
+			'throwaway_frames': throwaway_frames or self.VAD_THROWAWAY_FRAMES,
+			'numSilenceRuns': 0,
+			'silenceRun': 0,
+			'force_record': force_record,
+			'audio': b'' if debug else False,
+		}
 
-					if do_VAD and (int(len(data)/2) == self.VAD_PERIOD):
-						isSpeech = self._vad.is_speech(data, self.VAD_SAMPLERATE)
+		stream = self._pa.open(
+			input=True,
+			input_device_index=self._device_info.get_device_index(self._config['sound']['input_device']),
+			format=pyaudio.paInt16,
+			channels=1,
+			rate=self.VAD_SAMPLERATE,
+			frames_per_buffer=self.VAD_PERIOD,
+			stream_callback=self._callback,
+			start=False
+		)
 
-						if not isSpeech:
-							silenceRun += 1
-						else:
-							silenceRun = 0
-							numSilenceRuns += 1
+		logger.debug("Recording: Start")
+		stream.start_stream()
 
-				if do_VAD:
-					# only count silence runs after the first one
-					# (allow user to speak for total of max recording length if they haven't said anything yet)
-					if (numSilenceRuns != 0) and ((silenceRun * self.VAD_FRAME_MS) > self.VAD_SILENCE_TIMEOUT):
-						thresholdSilenceMet = True
+		def _listen():
+			while True:
+				try:
+					data = self._queue.get(block=True, timeout=2)
+					if not data or self._interrupt:
+						break
 
-			logger.debug("End recording")
+					yield data
+				except queue.Empty:
+					break
 
+			stream.stop_stream()
+			logger.debug("Recording: End")
 			stream.close()
 
 			if self._state_callback:
 				self._state_callback(False)
 
 			if debug:
 				with open(self._tmp_path + 'recording.wav', 'wb') as rf:
-					rf.write(audio)
+					rf.write(self._callback_data['audio'])
 
 			self._recording_lock_inverted.set()