Merge pull request #5 from QXIP/whisper

Switch to Whisper.cpp
QXIP · Jul 4, 2024 · 978e45c · 978e45c
2 parents 79efc98 + 54f67f1
commit 978e45c
Show file tree

Hide file tree

Showing 11 changed files with 1,785 additions and 145 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+node_modules
+recording/*
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,12 @@
+FROM node:20
+ENV REC_PATH=/recording
+ENV META_PATH=/meta
+COPY . /app
+WORKDIR /app
+RUN  npm install
+WORKDIR /app/node_modules/whisper-node/lib/whisper.cpp/models
+RUN ./download-ggml-model.sh small.en-tdrz
+WORKDIR /app/node_modules/whisper-node/lib/whisper.cpp/
+RUN make
+WORKDIR /app
+CMD ["nodejs", "sentiment2hep.mjs"]
diff --git a/README.md b/README.md
@@ -1,33 +1,43 @@
 <img src="https://avatars1.githubusercontent.com/u/956313?v=4&s=50">
 
 # RTP:Engine Speech-to-Text Spooler
-Simple RTPEngine Speech-to-Text Spooler using the [Bing Speech API](https://azure.microsoft.com/en-us/services/cognitive-services)
+Simple RTPEngine Speech-to-Text Spooler using Whisper on CPU(s)
 
 ### Usage
-This simple tool assumes a fully working RTPEngine WAV recorder setup and relies on its natural metadata removal pattern to pick, process and clear recording files. A valid Bing Speech-to-Text API key is also required for the demo to work as-is.
-
-### Debug Usage
-```
-nodejs index.js 
-File /recording/0827ab93e5636d54-7310c8bc193850b5-mix.wav has been added
-File /recording/0827ab93e5636d54-7310c8bc193850b5.meta has been removed
-Meta Hit! Seeking Audio at:  /recording/0827ab93e5636d54-7310c8bc193850b5-mix.wav
-service started
-{ RecognitionStatus: 'Success',
-  DisplayText: 'You are currently the only person in this conference I\'ll be assisting you with your increase today please be informed that this call is being recorded and monitored.',
-  Offset: 1800000,
-  Duration: 97700000 }
-File /recording/0827ab93e5636d54-7310c8bc193850b5-mix.wav has been removed
-```
+This simple tool assumes a fully working RTPEngine WAV recorder setup and relies on its natural metadata removal pattern to pick, process and clear recording files.
 
 -----------
 
 ### HEP Usage
 Speech Recognition results can be streamed to **HOMER** or **HEPIC** using the **HEP** Type 100 container.
 
-* Fill in the API and HEP Server details in ```config.js```
-* Run the HEP-enabled version ```nodejs speech2hep.js```
+Both transcription and sentiment analysis are available. Transcription is always on, while sentiment analysis is opt-in via the `SENTIMENT` environment variable.
+
+Below is an overview of the supported environment variables and their defaults:
+
+```js
+/**
+ * Environment Variables
+ */
+const HEP_SERVER = process.env.HEP_SERVER || '127.0.0.1'
+const HEP_TRANS = process.env.HEP_TRANS || 'udp4'
+const HEP_PORT = process.env.HEP_PORT || 9060
+const HEP_PASS = process.env.HEP_PASS || '123'
+const HEP_ID = process.env.HEP_ID || 44567
+const sentimentEnabled = process.env.SENTIMENT || 'false'
+const timeout = process.env.TIMEOUT || 8000
+const offset = process.env.OFFSET || 1000
+const debug = process.env.DEBUG || false
+```
+
+
+```bash
+OFFSET=5000 META_PATH=/var/spool/rtpengine REC_PATH=/path/to/RTPEngine/recording_dir HEP_TRANS='udp4' HEP_SERVER='capture.homer.com' HEP_PORT=9060 node sentiment2hep.mjs
+```
+
+* Wait for RTP traffic
 * Watch HEP logs fly out!
+
 ```
 U 172.18.0.2:52593 -> x.x.x.x:9060
 HEP3.%...................
@@ -41,6 +51,3 @@ Y.._...
 * Check your Session for Logs
 ![image](https://user-images.githubusercontent.com/1423657/31454437-b896f4e6-aeb5-11e7-8535-5d8069e0ef86.png)
 
-#### Todo
-* [ ] Use Proc buffer samples in real-time
-* [ ] Integrate more Speech-to-Text APIs
diff --git a/config.js b/config.js
@@ -1,8 +1,11 @@
 var config = {
+  rec_path: process.env.REC_PATH || __dirname + '/recording',
+  meta_path: process.env.META_PATH || __dirname + '/meta',
   hep_config: {
-    debug: true,
-    HEP_SERVER: '127.0.0.1',
-    HEP_PORT: 9060
+    debug: process.env.DEBUG || false,
+    HEP_TRANS: process.env.HEP_TRANS || 'udp4',
+    HEP_SERVER: process.env.HEP_SERVER || '127.0.0.1',
+    HEP_PORT: process.env.HEP_PORT || 9060,
   },
   bing_options: {
     language: 'en-US',

diff --git a/hep.js b/hep.js
@@ -6,27 +6,29 @@ For License details, see LICENSE
 
 var HEPjs = require('hep-js');
 var dgram = require('dgram');
-var socket = dgram.createSocket("udp4");
+const net = require('net')
+const tls = require('node:tls')
 
 var debug = false; 
-var stats = {rcvd: 0, parsed: 0, hepsent: 0, err: 0, heperr: 0 }; 
+var stats = {rcvd: 0, parsed: 0, hepsent: 0, err: 0, heperr: 0 };
+var socketUsers = 0;
+var socket;
 
 var hep_server;
 var hep_port;
 var hep_pass;
 var hep_id;
-
-var socket;
+var transport
 
 module.exports = {
   init:function(config) {
     hep_server = config.HEP_SERVER;
     hep_port = config.HEP_PORT;
     hep_pass = config.HEP_PASS;
     hep_id = config.HEP_ID;
+    transport = config.HEP_TRANS;
     debug = config.debug;
-    socket = dgram.createSocket("udp4");
-    socket = getSocket('udp4'); 
+    socket = getSocket(transport); 
   },
   preHep:function(message) {
     var rcinfo = message.rcinfo;
@@ -44,7 +46,7 @@ module.exports = {
 	    var hrTime = process.hrtime();
 	    var datenow = new Date().getTime();
 	    rcinfo.time_sec = Math.floor( datenow / 1000);
-	    rcinfo.time_usec = datenow - (rcinfo.time_sec*1000);
+	    rcinfo.time_usec = (datenow - (rcinfo.time_sec*1000))*1000;
     }
     // force sequence for split second sequences
     rcinfo.time_usec = new Date().getTime() - (rcinfo.time_sec*1000) + 1;
@@ -58,37 +60,62 @@ module.exports = {
 };
 
 var getSocket = function (type) {
-  if (undefined === socket) {
-    socket = dgram.createSocket(type);
-    socket.on('error', socketErrorHandler);
+  if(debug)console.log('Socket Type =', type);
+  if (undefined === socket && type === 'udp4') {
+      socket = dgram.createSocket(type);
+  } else if (type === 'tcp') {
+    socket = net.connect(hep_port, hep_server)
+  } else if (type === 'tls') {
+  socket = tls.connect(hep_port, hep_server)
+  console.log('TLS Socket', socket)
+}
+
+  var socketErrorHandler = (err)=>{
+    console.log(err);
+    throw(err);
+  }
 
-    /**
-    * Handles socket's 'close' event,
-    * recover socket in case of unplanned closing.
-    */
-    var socketCloseHandler = function () {
+  socket.on('error', socketErrorHandler);
+  /**
+   * Handles socket's 'close' event,
+   * recover socket in case of unplanned closing.
+   */
+  var socketCloseHandler = function () {
       if (socketUsers > 0) {
-        socket = undefined;
-        --socketUsers;
-        getSocket(type);
+          socket = undefined;
+          --socketUsers;
+          getSocket(type);
       }
-    };
-    socket.on('close', socketCloseHandler);
-  }
+  };
+
+  socket.on('close', socketCloseHandler);
+
+
   return socket;
 }
 
-var sendHEP3 = function(msg,rcinfo){
+var sendHEP3 = function(msg,rcinfo) {
   if (rcinfo && msg) {
     try {
       if (debug) console.log('Sending HEP3 Packet to '+ hep_server + ':' + hep_port + '...');
       if (! typeof msg === 'string' || ! msg instanceof String) msg = JSON.stringify(msg);
       var hep_message = HEPjs.encapsulate(msg.toString(),rcinfo);
       stats.parsed++;
       if (hep_message && hep_message.length) {
-        socket.send(hep_message, 0, hep_message.length, hep_port, hep_server, function(err) {
-          stats.hepsent++;
-        });
+        if(socket && transport == 'udp4') {
+          socket.send(hep_message, 0, hep_message.length, hep_port, hep_server, function(err) {
+           stats.hepsent++;
+           });
+        } else {
+          socket.write(hep_message, function(err) {
+            if(!err){
+              stats.hepsent++;
+            } else {
+              if(debug) console.log('tcp socket err: ', err);
+              stats.err++;
+            }
+  				});
+        }
       } else { console.log('HEP Parsing error!'); stats.heperr++; }
     } 
     catch (e) {

diff --git a/index.js b/index.js
@@ -3,30 +3,30 @@
 
 const fs = require('fs');
 const chokidar = require('chokidar');
-const speechService = require('ms-bing-speech-service');
-
+const whisper = require('whisper-node').whisper;
 const options = {
-  language: 'en-US',
-  subscriptionKey: 'YOUR_OWN_KEY_HERE' // https://azure.microsoft.com/en-us/services/cognitive-services
-};
-
-const socket = new speechService(options);
-const watcher = chokidar.watch('/recording', {ignored: /^\./, persistent: true });
-  watcher
-    .on('error', function(error) {console.error('Error happened', error);})
-    .on('add', function(path) {console.log('File', path, 'has been added');  })
-    .on('unlink', function(path) {console.log('File', path, 'has been removed');
-	   if(path.endsWith('.meta')){ 
-	      var newpath = path.replace(/\.meta/i, '-mix.wav');
-	      console.log('Meta Hit! Seeking Audio at: ',newpath);
-	      socket.start((error, service) => {
-		console.log('Speech service started');
-		service.on('recognition', (e) => {
-		  if (e.RecognitionStatus === 'Success') console.log(e);
-		});
-		service.sendFile(newpath, function(e){ console.log(e);
-		  setTimeout(function(){ fs.unlink(newpath); }, 1000);
-		});
-	      });
-	   }
-  });
+	modelName: "base.en",       // default
+	// modelPath: "/custom/path/to/model.bin", // use model in a custom directory (cannot use along with 'modelName')
+	whisperOptions: {
+	  language: 'auto',          // default (use 'auto' for auto detect)
+	  gen_file_txt: false,      // outputs .txt file
+	  gen_file_subtitle: false, // outputs .srt file
+	  gen_file_vtt: false,      // outputs .vtt file
+	  word_timestamps: false,     // timestamp for every word
+	  timestamp_size: 0      // cannot use along with word_timestamps:true
+	}
+  }
+console.log('Starting watcher', whisper)
+const watcher = chokidar.watch(__dirname + '/recording', {ignored: /^\./, persistent: true });
+watcher
+.on('error', function(error) {console.error('Error happened', error);})
+.on('add', function(path) {console.log('File', path, 'has been added');  })
+.on('unlink', async function(path) {
+	console.log('File', path, 'has been removed');
+	if(path.endsWith('.meta')){ 
+		var newpath = path.replace(/\.meta/i, '-mix.wav');
+		console.log('Meta Hit! Seeking Audio at: ',newpath);
+		const transcript = await whisper(newpath, options);
+		console.log('Meta Hit! Transcript: ', transcript);
+	}
+});