detect silence #1837
-
Hello Everyone, copierMicToVolumeMeter.copy();
int currentVolume = volumeMeter.volume();
updateVolumeBuffer(currentVolume);
With calculateVolumeVariance() I can check whether the volume has varied a lot over the most recent readings. Here is the full code:
#include "audio.h"
#include <Arduino.h>
#include <SPI.h>
#include <WiFi.h>
#include <Wire.h>
#include "AudioTools.h"
#include "AudioTools/AudioCodecs/CodecMP3Helix.h"
#include "AudioTools/AudioCodecs/CodecWAV.h"
#include "AudioTools/AudioLibs/MemoryManager.h"
#include "AudioTools/Concurrency/All.h"
namespace audio {
// FreeRTOS task handles. Nothing in this file assigns them
// (presumably filled in by whoever creates the tasks — confirm).
TaskHandle_t xControllerHandle = NULL;
TaskHandle_t xSpeechDetectHandle = NULL;
// Network client handed to the HttpRequest below.
WiFiClient client;
// Audio format shared by the encoder, volume stream, volume meter and microphone config.
// NOTE(review): arguments look like sample rate / channels / bits per sample — confirm against AudioInfo.
AudioInfo info(16000, 1, 16);
// Recording chain: microphone -> volumeStream -> encoder (WAV) -> queue, with `buffer` backing the queue.
BufferRTOS<uint8_t> buffer(200 * 1024);
QueueStream<uint8_t> queue(buffer);
EncodedAudioStream encoder(&queue, new WAVEncoder());
VolumeStream volumeStream(encoder);
I2SStream microphone;
StreamCopy copierMicToQueue(volumeStream, microphone);
// Upload chain: queue -> http.
HttpRequest http(client);
StreamCopy copierQueueToHttp(http, queue);
// Metering chain: microphone -> volumeMeter.
// NOTE(review): this copier and copierMicToQueue are both constructed on the same
// `microphone` object — confirm how the samples are shared between the two readers.
VolumeMeter volumeMeter;
StreamCopy copierMicToVolumeMeter(volumeMeter, microphone);
// Worker tasks started in startRecord().
// NOTE(review): arguments are presumably (name, stack size, priority, core) — confirm against Task.
Task writeTask("write", 7 * 1024, 1, 1);
Task readTask("read", 7 * 1024, 1, 1);
Task monitorTask("monitor", 7 * 1024, 1, 1);
// Timer created in controller(); its callback is silenceTimerCallback().
TimerHandle_t silenceTimer = nullptr;
// Timer period in milliseconds (converted with pdMS_TO_TICKS in controller()).
const int silenceTimeout = 300;
const int volumeBufferSize = 5; // Adjust based on your desired time window
// Ring of the most recent volume readings, written by updateVolumeBuffer().
int volumeBuffer[volumeBufferSize] = {0};
// Next slot of volumeBuffer that updateVolumeBuffer() will overwrite.
int volumeBufferIndex = 0;
// Average volume computed by calibrateTask().
volatile int32_t initialVolume = 0;
int varianceThreshold = 1000; // Adjust this threshold for sensitivity
// Set by startRecord() and toggled by the monitor task depending on the variance threshold.
volatile bool isSpeaking = false;
/**
 * FreeRTOS task entry point that averages volumeMeter readings.
 *
 * Polls volumeMeter.volume() every `sampleInterval` ms for
 * `calibrationDuration` ms, stores the integer mean in `initialVolume`,
 * logs the result and then deletes the calling task.
 *
 * @param pvParameters unused task argument.
 */
void calibrateTask(void *pvParameters) {
  const int calibrationDuration = 3000; // Calibration time in milliseconds
  const int sampleInterval = 100;       // Sample interval in milliseconds
  int readings = 0;
  int32_t total = 0;

  Serial.println("Calibration started...");

  for (const uint32_t begunAt = millis(); millis() - begunAt < calibrationDuration;) {
    const int level = volumeMeter.volume();
    total += level;
    ++readings;

    Serial.print("Calibrating... Volume: ");
    Serial.println(level);

    vTaskDelay(sampleInterval / portTICK_PERIOD_MS);
  }

  // Leave initialVolume untouched if no reading was taken.
  if (readings > 0) {
    initialVolume = total / readings;
  }

  Serial.print("Calibration complete. Initial Volume: ");
  Serial.println(initialVolume);

  vTaskDelete(NULL); // Terminate the calibration task
}
/**
 * Clears the volume history: zeroes every slot of volumeBuffer and
 * rewinds the write position to the first slot.
 */
void resetVolumeBuffer() {
  for (int &slot : volumeBuffer) {
    slot = 0;
  }
  volumeBufferIndex = 0;
}
/**
 * Stores one reading in the volume history ring.
 *
 * Writes `currentVolume` at the current write position and advances the
 * position by one, wrapping at volumeBufferSize.
 *
 * @param currentVolume the reading to record.
 */
void updateVolumeBuffer(int currentVolume) {
  const int slot = volumeBufferIndex;
  volumeBuffer[slot] = currentVolume;

  const int following = slot + 1;
  volumeBufferIndex = following % volumeBufferSize;
}
/**
 * Population variance of the readings currently held in volumeBuffer.
 *
 * All volumeBufferSize slots are used, including slots that still hold the
 * initial/reset value 0.
 *
 * The sums are accumulated in 64 bits: with 32-bit accumulators a handful of
 * readings near 16-bit full scale (32767^2 is about 1.07e9 each) already
 * overflows. The variance is evaluated as (n * sum(x^2) - sum(x)^2) / n^2,
 * which avoids squaring a truncated mean and is never negative.
 *
 * @return the variance truncated to an integer, saturated to INT32_MAX.
 */
int32_t calculateVolumeVariance() {
  int64_t sum = 0;
  int64_t sumOfSquares = 0;

  for (int i = 0; i < volumeBufferSize; i++) {
    const int64_t reading = volumeBuffer[i];
    sum += reading;
    sumOfSquares += reading * reading;
  }

  const int64_t count = volumeBufferSize;
  const int64_t variance = (count * sumOfSquares - sum * sum) / (count * count);

  // Keep the int32_t return contract even for unexpectedly large readings.
  if (variance > INT32_MAX) {
    return INT32_MAX;
  }
  return variance > 0 ? static_cast<int32_t>(variance) : 0;
}
// Starts a recording session: resets the audio buffer, starts the WAV encoder
// and the queue, opens a chunked HTTP POST, and launches the read, write and
// monitor tasks.
void startRecord() {
Serial.println("start");
isSpeaking = true;
// Drop whatever is still buffered, then (re)start the encoder and the queue.
buffer.clear();
encoder.begin(info);
queue.begin();
// Read task body: one copy step from the microphone into the recording chain.
// NOTE(review): presumably invoked repeatedly by Task — confirm.
readTask.begin([]() {
// Serial.print(".");
copierMicToQueue.copy();
});
Url url("http://ffaerber-ubuntu:8000/stream?chatId=xxxxx");
http.addRequestHeader("Authorization", "Bearer xxxx");
http.header().put(TRANSFER_ENCODING, CHUNKED);
if (!http.processBegin(POST, url, "audio/wav")) {
Serial.println("post failed");
// NOTE(review): stop() is not defined in this file (presumably a library helper — confirm).
// If it returns, execution continues and the write/monitor tasks below are still started.
stop();
}
// Write task body: copies the queue contents to the HTTP request.
writeTask.begin([]() {
Serial.println("writeTask start");
copierQueueToHttp.copyAll();
Serial.println("writeTask end");
});
// Monitor task body: meters the microphone, records the reading in the volume
// buffer, and starts/stops the silence timer when the variance crosses
// varianceThreshold.
monitorTask.begin([]() {
copierMicToVolumeMeter.copy();
int currentVolume = volumeMeter.volume();
updateVolumeBuffer(currentVolume);
int volumeVariance = calculateVolumeVariance();
if (volumeVariance > varianceThreshold) {
// Variance above the threshold: treat as speech and cancel a pending silence timeout.
if (!isSpeaking) {
isSpeaking = true;
xTimerStop(silenceTimer, 0); // Stop the silence timer
}
} else {
// Variance at or below the threshold: on the speaking -> silent transition, arm the timeout.
if (isSpeaking) {
isSpeaking = false;
xTimerStart(silenceTimer, 0); // Start the silence timer
}
}
Serial.print("Current Volume: ");
Serial.print(currentVolume);
Serial.print(", Variance: ");
Serial.println(volumeVariance);
vTaskDelay(100 / portTICK_PERIOD_MS);
});
}
// Ends a recording session: suspends the monitor and read tasks, waits until
// the audio buffer is empty, finishes the HTTP request and shuts down the
// encoder, the queue and all three tasks.
void stopRecord() {
Serial.println("stop");
// Suspend the monitor and read tasks before draining.
monitorTask.suspend();
readTask.suspend();
// Poll every 100 ms until the buffer is empty.
// NOTE(review): there is no timeout here — if the buffer never empties, this loop never exits.
while (!buffer.isEmpty()) {
vTaskDelay(100 / portTICK_PERIOD_MS); // Wait and check again
Serial.print('>');
}
http.processEnd();
writeTask.suspend();
// Clear the volume history.
resetVolumeBuffer();
http.end();
encoder.end();
queue.end();
writeTask.end();
readTask.end();
monitorTask.end();
}
void silenceTimerCallback(TimerHandle_t xTimer) {
stopRecord();
}
void controller(void *pvParameter) {
uint32_t ulNotificationValue;
const TickType_t xTicksToWait = portMAX_DELAY;
AudioLogger::instance().begin(Serial, AudioLogger::Warning);
// xiao esp32s3 sense microphone Configuration
auto micConfig = microphone.defaultConfig(RX_MODE);
micConfig.copyFrom(info);
micConfig.signal_type = PDM;
micConfig.i2s_format = I2S_PCM;
micConfig.pin_bck = I2S_PIN_NO_CHANGE;
micConfig.pin_ws = 42;
micConfig.pin_data = 41;
microphone.begin(micConfig);
auto volumeStream_cfg = volumeStream.defaultConfig();
volumeStream_cfg.copyFrom(info);
volumeStream_cfg.allow_boost = true;
volumeStream.begin(volumeStream_cfg);
volumeStream.setVolume(12);
volumeMeter.begin(info);
silenceTimer = xTimerCreate("SilenceTimer", pdMS_TO_TICKS(silenceTimeout), pdFALSE, (void *)0, silenceTimerCallback);
if (silenceTimer == nullptr) {
Serial.println("Failed to create silence timer");
vTaskDelete(NULL);
return;
}
xTaskCreatePinnedToCore(calibrateTask, "Calibrate", 4 * 1024, NULL, 1, NULL, APP_CPU_NUM);
while (true) {
if (xTaskNotifyWait(pdFALSE, ULONG_MAX, &ulNotificationValue, xTicksToWait) == pdPASS) {
EventType currentEvent = static_cast<EventType>(ulNotificationValue);
switch (currentEvent) {
case START_RECORD:
startRecord();
break;
case STOP_RECORD:
break;
default:
break;
}
}
vTaskDelay(1000 / portTICK_PERIOD_MS);
}
}
} // namespace audio |
Beta Was this translation helpful? Give feedback.
Replies: 2 comments
-
I think the following logic would make more sense to me:
Regarding your approach: with your current settings you are generating 32000 bytes/second. If you use the default copy size (which is 1024 bytes), each volume reading covers only the last 32 ms. If you look at 5 readings, you are still only looking at the last 160 ms.
Beta Was this translation helpful? Give feedback.
-
Yes, you are right. Here is a much simpler approach:
#include "audio.h"
#include <Arduino.h>
#include <SPI.h>
#include <WiFi.h>
#include <Wire.h>
#include "AudioTools.h"
#include "AudioTools/AudioCodecs/CodecMP3Helix.h"
#include "AudioTools/AudioCodecs/CodecWAV.h"
#include "AudioTools/AudioLibs/MemoryManager.h"
#include "AudioTools/Concurrency/All.h"
namespace audio {
// Task handle; nothing in this file assigns it
// (presumably filled in by whoever creates the controller task — confirm).
TaskHandle_t xControllerHandle = NULL;
// Timer created in controller(); its callback is silenceTimerCallback().
TimerHandle_t silenceTimer = nullptr;
// copier1 = microphone > multiOutput
// > volumeStream > encoder > queue
// > volumeMeter
// copier2 = queue > http
WiFiClient client;
// Audio format shared by the encoder, the multi-output and the stream configs.
// NOTE(review): arguments look like sample rate / channels / bits per sample — confirm against AudioInfo.
AudioInfo info(16000, 1, 16);
// `buffer` backs the queue that sits between the encoder and the HTTP upload.
BufferRTOS<uint8_t> buffer(200 * 1024);
QueueStream<uint8_t> queue(buffer);
EncodedAudioStream encoder(&queue, new WAVEncoder());
VolumeStream volumeStream(encoder);
VolumeMeter volumeMeter;
I2SStream microphone;
// Outputs are attached to multiOutput in controller().
MultiOutput multiOutput;
StreamCopy copier1(multiOutput, microphone);
HttpRequest http(client);
StreamCopy copier2(http, queue);
// Worker tasks; their bodies are installed in controller().
// NOTE(review): arguments are presumably (name, stack size, priority, core) — confirm against Task.
Task writeTask("write", 7 * 1024, 1, 1);
Task readTask("read", 7 * 1024, 1, 1);
const int volumeThreshold = 1500; // Volume level below which silence is considered
const int silenceTimeoutMs = 1000; // Timeout in milliseconds for stopping recording
void startRecord() {
Serial.println("startRecord");
xTimerStart(silenceTimer, 0);
buffer.clear();
encoder.begin(info);
queue.begin();
readTask.resume();
Url url("http://ffaerber-ubuntu:8000/stream?chatId=xxxx");
http.addRequestHeader("Authorization", "Bearer xxxx");
http.header().put(TRANSFER_ENCODING, CHUNKED);
http.processBegin(POST, url, "audio/wav");
writeTask.resume();
}
// Ends a recording session: stops the silence timer, suspends the read task,
// waits until the audio buffer is empty, finishes the HTTP request and shuts
// down the encoder and the queue.
void stopRecord() {
Serial.println("stopRecord");
xTimerStop(silenceTimer, 0);
// Suspend the read task before draining.
readTask.suspend();
// Poll every 100 ms until the buffer is empty.
// NOTE(review): there is no timeout here — if the buffer never empties, this loop never exits.
while (!buffer.isEmpty()) {
vTaskDelay(100 / portTICK_PERIOD_MS); // Wait and check again
Serial.print('.');
}
http.processEnd();
writeTask.suspend();
http.end();
encoder.end();
queue.end();
}
void silenceTimerCallback(TimerHandle_t xTimer) {
Serial.println("Silence timeout exceeded. Stopping recording...");
stopRecord();
}
void controller(void *pvParameter) {
uint32_t ulNotificationValue;
const TickType_t xTicksToWait = portMAX_DELAY;
AudioLogger::instance().begin(Serial, AudioLogger::Warning);
multiOutput.add(volumeMeter);
multiOutput.add(volumeStream);
multiOutput.begin(info);
// xiao esp32s3 sense microphone Configuration
auto micConfig = microphone.defaultConfig(RX_MODE);
micConfig.copyFrom(info);
micConfig.signal_type = PDM;
micConfig.i2s_format = I2S_PCM;
micConfig.pin_bck = I2S_PIN_NO_CHANGE;
micConfig.pin_ws = 42;
micConfig.pin_data = 41;
microphone.begin(micConfig);
auto volumeStream_cfg = volumeStream.defaultConfig();
volumeStream_cfg.copyFrom(info);
volumeStream_cfg.allow_boost = true;
volumeStream.begin(volumeStream_cfg);
volumeStream.setVolume(12);
readTask.begin([]() {
copier1.copy();
int currentVolume = volumeMeter.volume();
if (currentVolume > volumeThreshold) {
xTimerReset(silenceTimer, 0);
}
});
readTask.suspend();
writeTask.begin([]() {
copier2.copyAll();
});
writeTask.suspend();
silenceTimer = xTimerCreate("SilenceTimer", pdMS_TO_TICKS(silenceTimeoutMs), pdFALSE, (void *)0, silenceTimerCallback);
if (silenceTimer == nullptr) {
Serial.println("Failed to create silence timer");
vTaskDelete(NULL);
return;
}
while (true) {
if (xTaskNotifyWait(pdFALSE, ULONG_MAX, &ulNotificationValue, xTicksToWait) == pdPASS) {
EventType currentEvent = static_cast<EventType>(ulNotificationValue);
switch (currentEvent) {
case START_RECORD:
startRecord();
break;
default:
break;
}
}
vTaskDelay(1000 / portTICK_PERIOD_MS);
}
}
} // namespace audio
|
Beta Was this translation helpful? Give feedback.
I think the following logic would make more sense to me:
Regarding your approach: with your current settings you are generating 32000 bytes/second. If you use the default copy size (which is 1024 bytes), each volume reading covers only the last 32 ms. If you look at 5 readings, you are still only looking at the last 160 ms.