From a1a2a20c1d380c9654eaaa9e158f1da94952ca8b Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Mon, 13 May 2024 11:14:32 -0400
Subject: [PATCH] chore: Refactor cleanstream-filter-data.h and update
 timestamps in whisper-processing.cpp

---
 src/cleanstream-filter-data.h            |  4 +-
 src/cleanstream-filter.cpp               | 84 ++++++++++++++----------
 src/whisper-utils/whisper-processing.cpp | 27 +++++---
 3 files changed, 71 insertions(+), 44 deletions(-)
diff --git a/src/cleanstream-filter-data.h b/src/cleanstream-filter-data.h
index d7bbfe3..d7d1928 100644
--- a/src/cleanstream-filter-data.h
+++ b/src/cleanstream-filter-data.h
@@ -38,9 +38,9 @@ struct cleanstream_data {
 	uint32_t sample_rate;  // input sample rate
 	// How many input frames (in input sample rate) are needed for the next whisper frame
 	size_t frames;
-	// How many frames were processed in the last whisper frame (this is dynamic)
-	size_t last_num_frames;
 	int current_result;
+	uint64_t current_result_end_timestamp;
+	uint64_t current_result_start_timestamp;
 	uint32_t delay_ms;
 
 	/* Silero VAD */
diff --git a/src/cleanstream-filter.cpp b/src/cleanstream-filter.cpp
index 75bab5e..5cd9159 100644
--- a/src/cleanstream-filter.cpp
+++ b/src/cleanstream-filter.cpp
@@ -58,48 +58,56 @@ struct obs_audio_data *cleanstream_filter_audio(void *data, struct obs_audio_dat
 		return audio;
 	}
 
-	std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex); // scoped lock
+	size_t input_buffer_size = 0;
+	{
+		std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex); // scoped lock
 
-	if (audio != nullptr && audio->frames > 0) {
-		// push back current audio data to input circlebuf
-		for (size_t c = 0; c < gf->channels; c++) {
-			circlebuf_push_back(&gf->input_buffers[c], audio->data[c],
-					    audio->frames * sizeof(float));
+		if (audio != nullptr && audio->frames > 0) {
+			// push back current audio data to input circlebuf
+			for (size_t c = 0; c < gf->channels; c++) {
+				circlebuf_push_back(&gf->input_buffers[c], audio->data[c],
+						    audio->frames * sizeof(float));
+			}
+			// push audio packet info (timestamp/frame count) to info circlebuf
+			struct cleanstream_audio_info info = {0};
+			info.frames = audio->frames;       // number of frames in this packet
+			info.timestamp = audio->timestamp; // timestamp of this packet
+			circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
 		}
-		// push audio packet info (timestamp/frame count) to info circlebuf
-		struct cleanstream_audio_info info = {0};
-		info.frames = audio->frames;       // number of frames in this packet
-		info.timestamp = audio->timestamp; // timestamp of this packet
-		circlebuf_push_back(&gf->info_buffer, &info, sizeof(info));
+		input_buffer_size = gf->input_buffers[0].size;
 	}
 
 	// check the size of the input buffer - if it's more than <delay>ms worth of audio, start playback
-	if (gf->input_buffers[0].size > gf->delay_ms * gf->sample_rate * sizeof(float) / 1000) {
+	if (input_buffer_size > gf->delay_ms * gf->sample_rate * sizeof(float) / 1000) {
 		// find needed number of frames from the incoming audio
 		size_t num_frames_needed = audio->frames;
 
 		std::vector<float> temporary_buffers[MAX_AUDIO_CHANNELS];
 		uint64_t timestamp = 0;
 
-		while (temporary_buffers[0].size() < num_frames_needed) {
-			struct cleanstream_audio_info info_out = {0};
+		{
+			std::lock_guard<std::mutex> lock(gf->whisper_buf_mutex);
 			// pop from input buffers to get audio packet info
-			circlebuf_pop_front(&gf->info_buffer, &info_out, sizeof(info_out));
-			if (timestamp == 0) {
-				timestamp = info_out.timestamp;
-			}
+			while (temporary_buffers[0].size() < num_frames_needed) {
+				struct cleanstream_audio_info info_out = {0};
+				// pop from input buffers to get audio packet info
+				circlebuf_pop_front(&gf->info_buffer, &info_out, sizeof(info_out));
+				if (timestamp == 0) {
+					timestamp = info_out.timestamp;
+				}
 
-			// pop from input circlebuf to audio data
-			for (size_t i = 0; i < gf->channels; i++) {
-				// increase the size of the temporary buffer to hold the incoming audio in addition
-				// to the existing audio on the temporary buffer
-				temporary_buffers[i].resize(temporary_buffers[i].size() +
-							    info_out.frames);
-				circlebuf_pop_front(&gf->input_buffers[i],
-						    temporary_buffers[i].data() +
-							    temporary_buffers[i].size() -
-							    info_out.frames,
-						    info_out.frames * sizeof(float));
+				// pop from input circlebuf to audio data
+				for (size_t i = 0; i < gf->channels; i++) {
+					// increase the size of the temporary buffer to hold the incoming audio in addition
+					// to the existing audio on the temporary buffer
+					temporary_buffers[i].resize(temporary_buffers[i].size() +
+								    info_out.frames);
+					circlebuf_pop_front(&gf->input_buffers[i],
+							    temporary_buffers[i].data() +
+								    temporary_buffers[i].size() -
+								    info_out.frames,
+							    info_out.frames * sizeof(float));
+				}
 			}
 		}
 		const size_t num_frames = temporary_buffers[0].size();
@@ -110,13 +118,17 @@ struct obs_audio_data *cleanstream_filter_audio(void *data, struct obs_audio_dat
 		memset(gf->output_data.array, 0, frames_size_bytes * gf->channels);
 
 		int inference_result = DetectionResult::DETECTION_RESULT_UNKNOWN;
+		uint64_t inference_result_start_timestamp = 0;
+		uint64_t inference_result_end_timestamp = 0;
 		{
-			std::lock_guard<std::mutex> lock(gf->whisper_outbuf_mutex);
+			std::lock_guard<std::mutex> outbuf_lock(gf->whisper_outbuf_mutex);
 			inference_result = gf->current_result;
+			inference_result_start_timestamp = gf->current_result_start_timestamp;
+			inference_result_end_timestamp = gf->current_result_end_timestamp;
 		}
 
-		if (inference_result == DetectionResult::DETECTION_RESULT_BEEP) {
-			obs_log(LOG_INFO, "Beep detected, timestamp: %llu", timestamp);
+		if (timestamp > inference_result_start_timestamp &&
+		    timestamp < inference_result_end_timestamp) {
 			if (gf->replace_sound == REPLACE_SOUNDS_SILENCE) {
 				// set the audio to 0
 				for (size_t i = 0; i < gf->channels; i++) {
@@ -220,6 +232,10 @@ void cleanstream_update(void *data, obs_data_t *s)
 	gf->log_level = (int)obs_data_get_int(s, "log_level");
 	gf->vad_enabled = obs_data_get_bool(s, "vad_enabled");
 	gf->log_words = obs_data_get_bool(s, "log_words");
+	gf->delay_ms = BUFFER_SIZE_MSEC + INITIAL_DELAY_MSEC;
+	gf->current_result = DetectionResult::DETECTION_RESULT_UNKNOWN;
+	gf->current_result_start_timestamp = 0;
+	gf->current_result_end_timestamp = 0;
 
 	obs_log(gf->log_level, "update whisper model");
 	update_whisper_model(gf, s);
@@ -270,8 +286,10 @@ void *cleanstream_create(obs_data_t *settings, obs_source_t *filter)
 	gf->channels = audio_output_get_channels(obs_get_audio());
 	gf->sample_rate = audio_output_get_sample_rate(obs_get_audio());
 	gf->frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)BUFFER_SIZE_MSEC));
-	gf->last_num_frames = 0;
 	gf->delay_ms = BUFFER_SIZE_MSEC + INITIAL_DELAY_MSEC;
+	gf->current_result = DetectionResult::DETECTION_RESULT_UNKNOWN;
+	gf->current_result_start_timestamp = 0;
+	gf->current_result_end_timestamp = 0;
 
 	for (size_t i = 0; i < MAX_AUDIO_CHANNELS; i++) {
 		circlebuf_init(&gf->input_buffers[i]);
diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp
index 9bb91a6..3322f43 100644
--- a/src/whisper-utils/whisper-processing.cpp
+++ b/src/whisper-utils/whisper-processing.cpp
@@ -305,6 +305,7 @@ int run_whisper_inference(struct cleanstream_data *gf, const float *pcm32f_data,
 long long process_audio_from_buffer(struct cleanstream_data *gf)
 {
 	uint64_t start_timestamp = 0;
+	uint64_t end_timestamp = 0;
 
 	{
 		// scoped lock the buffer mutex
@@ -312,19 +313,23 @@ long long process_audio_from_buffer(struct cleanstream_data *gf)
 
 		// copy gf->frames from the end of the input buffer to the copy_buffers
 		for (size_t c = 0; c < gf->channels; c++) {
-			circlebuf_peek_front(&gf->input_buffers[c], gf->copy_buffers[c],
-					     gf->frames * sizeof(float));
+			circlebuf_peek_back(&gf->input_buffers[c], gf->copy_buffers[c],
+					    gf->frames * sizeof(float));
 		}
 
-		// peek at the info_buffer to get the timestamp of the first info
+		// peek at the info_buffer to get the timestamp of the last info
 		struct cleanstream_audio_info info_from_buf = {0};
-		circlebuf_peek_front(&gf->info_buffer, &info_from_buf,
-				     sizeof(struct cleanstream_audio_info));
-		start_timestamp = info_from_buf.timestamp;
+		circlebuf_peek_back(&gf->info_buffer, &info_from_buf,
+				    sizeof(struct cleanstream_audio_info));
+		end_timestamp = info_from_buf.timestamp;
+		start_timestamp =
+			end_timestamp - (int)(gf->frames * 1000 / gf->sample_rate) * 1000000;
 	}
 
-	obs_log(gf->log_level, "processing %lu frames (%d ms), start timestamp %llu ", gf->frames,
-		(int)(gf->frames * 1000 / gf->sample_rate), start_timestamp);
+	obs_log(gf->log_level,
+		"processing %lu frames (%d ms), start timestamp %llu, end timestamp %llu ",
+		gf->frames, (int)(gf->frames * 1000 / gf->sample_rate), start_timestamp,
+		end_timestamp);
 
 	// time the audio processing
 	auto start = std::chrono::high_resolution_clock::now();
@@ -361,6 +366,10 @@ long long process_audio_from_buffer(struct cleanstream_data *gf)
 		{
 			std::lock_guard<std::mutex> lock(gf->whisper_outbuf_mutex);
 			gf->current_result = inference_result;
+			if (gf->current_result == DETECTION_RESULT_BEEP) {
+				gf->current_result_start_timestamp = start_timestamp;
+				gf->current_result_end_timestamp = end_timestamp;
+			}
 		}
 	} else {
 		gf->current_result = DETECTION_RESULT_SILENCE;
@@ -377,7 +386,7 @@ long long process_audio_from_buffer(struct cleanstream_data *gf)
 	obs_log(gf->log_level, "audio processing of %u ms new data took %d ms", audio_processed_ms,
 		(int)duration);
 
-	if ((duration + 300) > (gf->delay_ms - audio_processed_ms)) {
+	if (duration > (gf->delay_ms - audio_processed_ms)) {
 		obs_log(gf->log_level,
 			"audio processing (%d ms) longer than delay (%lu ms), increase delay",
 			(int)duration, gf->delay_ms);