Merge pull request #19 from ros-ai/dev-whisper_cpp-1.7.2

Release 1.4.0
ros-ai · Dec 13, 2024 · 59b2d73 · 59b2d73
2 parents e6909bf + 4d09999
commit 59b2d73
Show file tree

Hide file tree

Showing 17 changed files with 85 additions and 71 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,8 +1,38 @@
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 Changelog for package ROS 2 Whisper
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+1.4.0 (2024-12-11)
+------------------
+
+* `whisper_cpp_vendor`: `whisper.cpp` 1.6.2 to 1.7.2 release, build changes
+* Added live audio transcription streaming
+* `whisper_server` changes:
+   * Holding incoming Audio data in a Ring Buffer (removed BatchBuffer, drop oldest audio).
+   * Transcribing the entire buffer of audio data with whisper.cpp on a timer interrupt
+   * Publishing the resulting tokens + probabilities on topic `/whisper/tokens`
+   * Removing the Action Server
+   * New Node Parameters:
+       * `active` -- Boolean to control if whisper.cpp should be run or not.
+       * `callback_ms` -- Integer controlling how often whisper.cpp is called.
+       * `buffer_capacity` -- Integer number of seconds of past audio that is kept in the buffer and transcribed.
+* `transcript_manager` package added to:
+   * Store record of what was previously transcribed.
+   * Track what is currently being transcribed. Align and update the text from subscribed topic `/whisper/tokens`.
+       * Updates done on timer interrupt
+   * Host the Action Server which was previously part of `whisper_server`
+   * Publish the entire transcript (previous and current) under `/whisper/transcript_stream`
+       * Published transcript contains text and estimated segment markings, segment timestamps
+* `whisper_demos`: Add `stream` node
+* `whisper_idl`:
+   * Added `msg/WhisperTokens.msg`, `msg/AudioTranscript.msg`
+   * Added `launch/replay.launch.py` which does not bring up `audio_listener`
+* `whisper_util`: Changed to run inference directly and then serialize the whisper.cpp model output, including probability data.
+
+
 1.3.1 (2024-07-01)
 ------------------
+
 * `whisper_msgs`: Changed to `whisper_idl` package
 * `whisper_bringup`: Changed executor to `MultiThreadedExecutor` so audio and inference can run in parallel on `whisper_server`
 

diff --git a/README.md b/README.md
@@ -13,7 +13,7 @@ This example shows live transcription of first minute of the 6'th chapter in ***
 ```shell
 mkdir -p ros-ai/src && cd ros-ai/src && \
 git clone https://github.com/ros-ai/ros2_whisper.git && cd .. && \
-colcon build --symlink-install --cmake-args -DWHISPER_CUDA=On --no-warn-unused-cli
+colcon build --symlink-install --cmake-args -DGGML_CUDA=On --no-warn-unused-cli
 ```
 
 ## Demos

diff --git a/audio_listener/package.xml b/audio_listener/package.xml
@@ -2,7 +2,7 @@
 <?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
 <package format="3">
   <name>audio_listener</name>
-  <version>1.3.1</version>
+  <version>1.4.0</version>
   <description>Audio common replica.</description>
   <maintainer email="[email protected]">mhubii</maintainer>
   <license>MIT</license>

diff --git a/audio_listener/setup.py b/audio_listener/setup.py
@@ -4,7 +4,7 @@
 
 setup(
     name=package_name,
-    version="1.3.1",
+    version="1.4.0",
     packages=find_packages(exclude=["test"]),
     data_files=[
         ("share/ament_index/resource_index/packages", ["resource/" + package_name]),

diff --git a/transcript_manager/include/transcript_manager/tokens.hpp b/transcript_manager/include/transcript_manager/tokens.hpp
@@ -13,7 +13,6 @@ class SingleToken {
 private:
   std::string data_;
   float prob_;
-  int token_id_;
 
 public:
   std::string get_data() const {
@@ -26,34 +25,6 @@ class SingleToken {
 
   SingleToken(const std::string& data_, float prob_)
         : data_(data_), prob_(prob_) {};
-
-  // Copy constructor
-  SingleToken(const SingleToken& other)
-      : data_(other.data_), prob_(other.prob_), token_id_(other.token_id_) {};
-
-  // Move constructor
-  SingleToken(SingleToken&& other) noexcept
-      : data_(std::move(other.data_)), prob_(other.prob_), token_id_(other.token_id_) {};
-
-  // Copy assignment operator
-  SingleToken& operator=(const SingleToken& other) {
-    if ( this != &other ) {
-      data_ = other.data_;
-      prob_ = other.prob_;
-      token_id_ = other.token_id_;
-    }
-    return *this;
-  }
-
-  // Move assignment operator
-  SingleToken& operator=(SingleToken&& other) noexcept {
-    if ( this != &other ) {
-      data_ = std::move(other.data_);
-      prob_ = other.prob_;
-      token_id_ = other.token_id_;
-    }
-    return *this;
-  }
 };
 
 } // end of namespace whisper

diff --git a/transcript_manager/package.xml b/transcript_manager/package.xml
@@ -2,10 +2,10 @@
 <?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
 <package format="3">
   <name>transcript_manager</name>
-  <version>0.0.1</version>
+  <version>1.4.0</version>
   <description>Fuse overlapping whisper inference results into a single transcript.</description>
-  <maintainer email="nathanbcorral@gmail.com">nathan</maintainer>
-  <license>Apache-2.0</license>
+  <maintainer email="nathan.b.corral@gmail.com">nathan</maintainer>
+  <license>MIT</license>
 
   <buildtool_depend>ament_cmake</buildtool_depend>
 

diff --git a/whisper_bringup/launch/replay.launch.py b/whisper_bringup/launch/replay.launch.py
@@ -20,15 +20,6 @@ def generate_launch_description() -> LaunchDescription:
 
     ld = LaunchDescription()
 
-    # launch audio listener
-    # ld.add_action(
-    #     Node(
-    #         package="audio_listener",
-    #         executable="audio_listener",
-    #         output="screen",
-    #     )
-    # )
-
     # launch whisper
     whisper_config = os.path.join(
         get_package_share_directory("whisper_server"), "config", "whisper.yaml"

diff --git a/whisper_bringup/package.xml b/whisper_bringup/package.xml
@@ -2,7 +2,7 @@
 <?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
 <package format="3">
   <name>whisper_bringup</name>
-  <version>1.3.1</version>
+  <version>1.4.0</version>
   <description>TODO: Package description</description>
   <maintainer email="[email protected]">mhubii</maintainer>
   <license>MIT</license>

diff --git a/whisper_cpp_vendor/CMakeLists.txt b/whisper_cpp_vendor/CMakeLists.txt
@@ -10,7 +10,7 @@ include(FetchContent)
 find_package(ament_cmake REQUIRED)
 
 set(WHISPER_VERSION_MAJOR 1 CACHE STRING "Major whisper.cpp version.")
-set(WHISPER_VERSION_MINOR 6 CACHE STRING "Minor whisper.cpp version.")
+set(WHISPER_VERSION_MINOR 7 CACHE STRING "Minor whisper.cpp version.")
 set(WHISPER_VERSION_PATCH 2 CACHE STRING "Patch whisper.cpp version.")
 
 FetchContent_Declare(
@@ -34,20 +34,36 @@ set_target_properties(
 
 # install ggml header
 install(
-  FILES ${whisper_SOURCE_DIR}/ggml.h
+  FILES ${whisper_SOURCE_DIR}/ggml/include/ggml.h
   DESTINATION include
 )
 ##############
 # end of fixes
 ##############
+# Export header files for downstream packages
+install(
+  DIRECTORY
+    ${whisper_SOURCE_DIR}/include/
+    ${whisper_SOURCE_DIR}/ggml/include/
+  DESTINATION include
+)
+
+install(
+  TARGETS ggml
+  EXPORT export_whisper
+  LIBRARY DESTINATION lib
+  INCLUDES DESTINATION include
+)
 
 ament_export_targets(export_whisper HAS_LIBRARY_TARGET)
 
+# Install the library
 install(
   TARGETS whisper
   EXPORT export_whisper
   LIBRARY DESTINATION lib
   INCLUDES DESTINATION include
+  PUBLIC_HEADER DESTINATION include
 )
 
 ament_package()
diff --git a/whisper_cpp_vendor/package.xml b/whisper_cpp_vendor/package.xml
@@ -2,7 +2,7 @@
 <?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
 <package format="3">
   <name>whisper_cpp_vendor</name>
-  <version>1.3.1</version>
+  <version>1.4.0</version>
   <description>Vendor package for whisper.cpp.</description>
   <maintainer email="[email protected]">mhubii</maintainer>
   <license>MIT</license>

diff --git a/whisper_demos/package.xml b/whisper_demos/package.xml
@@ -2,7 +2,7 @@
 <?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
 <package format="3">
   <name>whisper_demos</name>
-  <version>1.3.1</version>
+  <version>1.4.0</version>
   <description>Demos for using the ROS 2 whisper package.</description>
   <maintainer email="[email protected]">mhubii</maintainer>
   <license>MIT</license>

diff --git a/whisper_demos/setup.py b/whisper_demos/setup.py
@@ -4,7 +4,7 @@
 
 setup(
     name=package_name,
-    version="1.3.1",
+    version="1.4.0",
     packages=find_packages(exclude=["test"]),
     data_files=[
         ("share/ament_index/resource_index/packages", ["resource/" + package_name]),

diff --git a/whisper_idl/package.xml b/whisper_idl/package.xml
@@ -2,7 +2,7 @@
 <?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
 <package format="3">
   <name>whisper_idl</name>
-  <version>1.3.1</version>
+  <version>1.4.0</version>
   <description>Messages for the ROS 2 whisper package</description>
   <maintainer email="[email protected]">mhubii</maintainer>
   <license>MIT</license>

diff --git a/whisper_server/include/whisper_server/inference.hpp b/whisper_server/include/whisper_server/inference.hpp
@@ -6,7 +6,7 @@
 #include <numeric>
 #include <stdexcept>
 #include <string>
-// #include <mutex>
+#include <mutex>
 
 #include "rcl_interfaces/msg/set_parameters_result.hpp"
 #include "rclcpp/rclcpp.hpp"

diff --git a/whisper_server/package.xml b/whisper_server/package.xml
@@ -2,7 +2,7 @@
 <?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
 <package format="3">
   <name>whisper_server</name>
-  <version>1.3.1</version>
+  <version>1.4.0</version>
   <description>ROS 2 whisper.cpp inference server.</description>
   <maintainer email="[email protected]">mhubii</maintainer>
   <license>MIT</license>

diff --git a/whisper_server/src/inference.cpp b/whisper_server/src/inference.cpp
@@ -148,24 +148,30 @@ whisper_idl::msg::WhisperTokens Inference::create_message_() {
 }
 
 bool Inference::run_inference_(whisper_idl::msg::WhisperTokens &result) {
-  const auto& [data, timestamp] = audio_ring_->peak();
-  result.stamp = chrono_to_ros_msg(timestamp);
-
-  inference_(data, result);
-
-  // Print warning if inference takes too long for audio size
-  auto duration = std::chrono::milliseconds(result.inference_duration);
-  auto max_runtime_for_audio_size = whisper::count_to_time(data.size());
-  if ( duration > max_runtime_for_audio_size ){
-        auto timeout_duration_ms = max_runtime_for_audio_size.count();
-        RCLCPP_WARN(get_logger(),
-              "Inference took longer than audio buffer size. This leads to un-inferenced audio "
-              "data. Consider increasing thread number or compile with accelerator support. \n "
-              "\t Inference Duration:   %lld,  Timeout after  %lld", 
-              static_cast<long long>(duration.count()), 
-              static_cast<long long>(timeout_duration_ms));
+  if ( whisper_mutex_.try_lock() ) {
+    const auto& [data, timestamp] = audio_ring_->peak();
+    result.stamp = chrono_to_ros_msg(timestamp);
+
+    inference_(data, result);
+
+    // Print warning if inference takes too long for audio size
+    auto duration = std::chrono::milliseconds(result.inference_duration);
+    auto max_runtime_for_audio_size = whisper::count_to_time(data.size());
+    if ( duration > max_runtime_for_audio_size ){
+          auto timeout_duration_ms = max_runtime_for_audio_size.count();
+          RCLCPP_WARN(get_logger(),
+                "Inference took longer than audio buffer size. This leads to un-inferenced audio "
+                "data. Consider increasing thread number or compile with accelerator support. \n "
+                "\t Inference Duration:   %lld,  Timeout after  %lld", 
+                static_cast<long long>(duration.count()), 
+                static_cast<long long>(timeout_duration_ms));
+    }
+    whisper_mutex_.unlock();
+    return true;
+  } else {
+    RCLCPP_INFO(get_logger(), "Whisper.cpp busy, skipping inference");
+    return false;
   }
-  return true;
 }
 
 void Inference::on_audio_debug_print_(const std_msgs::msg::Int16MultiArray::SharedPtr msg) {

diff --git a/whisper_util/package.xml b/whisper_util/package.xml
@@ -2,7 +2,7 @@
 <?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
 <package format="3">
   <name>whisper_util</name>
-  <version>1.3.1</version>
+  <version>1.4.0</version>
   <description>ROS 2 wrapper for whisper.cpp.</description>
   <maintainer email="[email protected]">mhubii</maintainer>
   <license>MIT</license>