From 6eb45d551e8964fba5e2907ae15d8986927d33b6 Mon Sep 17 00:00:00 2001 From: Nathan Corral Date: Wed, 11 Dec 2024 10:51:07 +0100 Subject: [PATCH 1/7] Update whisper cpp to version 1.7.2 --- whisper_cpp_vendor/CMakeLists.txt | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/whisper_cpp_vendor/CMakeLists.txt b/whisper_cpp_vendor/CMakeLists.txt index e73bc0d..8253525 100644 --- a/whisper_cpp_vendor/CMakeLists.txt +++ b/whisper_cpp_vendor/CMakeLists.txt @@ -10,7 +10,7 @@ include(FetchContent) find_package(ament_cmake REQUIRED) set(WHISPER_VERSION_MAJOR 1 CACHE STRING "Major whisper.cpp version.") -set(WHISPER_VERSION_MINOR 6 CACHE STRING "Minor whisper.cpp version.") +set(WHISPER_VERSION_MINOR 7 CACHE STRING "Minor whisper.cpp version.") set(WHISPER_VERSION_PATCH 2 CACHE STRING "Patch whisper.cpp version.") FetchContent_Declare( @@ -34,20 +34,38 @@ set_target_properties( # install ggml header install( - FILES ${whisper_SOURCE_DIR}/ggml.h + FILES ${whisper_SOURCE_DIR}/ggml/include/ggml.h DESTINATION include ) ############## # end of fixes ############## +# Export header files for downstream packages +install( + DIRECTORY + ${whisper_SOURCE_DIR}/include/ + ${whisper_SOURCE_DIR}/ggml/include/ + DESTINATION include +) + +install( + TARGETS ggml + EXPORT export_whisper + LIBRARY DESTINATION lib + INCLUDES DESTINATION include +) ament_export_targets(export_whisper HAS_LIBRARY_TARGET) +# Install the library install( TARGETS whisper EXPORT export_whisper LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin INCLUDES DESTINATION include + PUBLIC_HEADER DESTINATION include ) ament_package() From 1fae935d0eceb5c4d82c7243efa5957ad8d7fb7b Mon Sep 17 00:00:00 2001 From: Nathan Corral Date: Wed, 11 Dec 2024 10:58:43 +0100 Subject: [PATCH 2/7] Update whisper cpp to version 1.7.2 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 63033ef..70aa0ee 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ This example shows live transcription of first minute of the 6'th chapter in *** ```shell mkdir -p ros-ai/src && cd ros-ai/src && \ git clone https://github.com/ros-ai/ros2_whisper.git && cd .. && \ -colcon build --symlink-install --cmake-args -DWHISPER_CUDA=On --no-warn-unused-cli +colcon build --symlink-install --cmake-args -DGGML_CUDA=On --no-warn-unused-cli ``` ## Demos From cdbcb0e8b3c3d15f94c6ccf1b9cb9276cef4db59 Mon Sep 17 00:00:00 2001 From: Nathan Corral Date: Wed, 11 Dec 2024 11:22:55 +0100 Subject: [PATCH 3/7] Added necessary whisper.cpp mutex --- .../include/whisper_server/inference.hpp | 2 +- whisper_server/src/inference.cpp | 40 +++++++++++-------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/whisper_server/include/whisper_server/inference.hpp b/whisper_server/include/whisper_server/inference.hpp index d1510ff..ecb1916 100644 --- a/whisper_server/include/whisper_server/inference.hpp +++ b/whisper_server/include/whisper_server/inference.hpp @@ -6,7 +6,7 @@ #include #include #include -// #include +#include #include "rcl_interfaces/msg/set_parameters_result.hpp" #include "rclcpp/rclcpp.hpp" diff --git a/whisper_server/src/inference.cpp b/whisper_server/src/inference.cpp index 109244b..1edbbef 100644 --- a/whisper_server/src/inference.cpp +++ b/whisper_server/src/inference.cpp @@ -148,24 +148,30 @@ whisper_idl::msg::WhisperTokens Inference::create_message_() { } bool Inference::run_inference_(whisper_idl::msg::WhisperTokens &result) { - const auto& [data, timestamp] = audio_ring_->peak(); - result.stamp = chrono_to_ros_msg(timestamp); - - inference_(data, result); - - // Print warning if inference takes too long for audio size - auto duration = std::chrono::milliseconds(result.inference_duration); - auto max_runtime_for_audio_size = whisper::count_to_time(data.size()); - if ( duration > max_runtime_for_audio_size ){ - auto timeout_duration_ms = max_runtime_for_audio_size.count(); - RCLCPP_WARN(get_logger(), - "Inference took longer than audio buffer size. This leads to un-inferenced audio " - "data. Consider increasing thread number or compile with accelerator support. \n " - "\t Inference Duration: %lld, Timeout after %lld", - static_cast(duration.count()), - static_cast(timeout_duration_ms)); + if ( whisper_mutex_.try_lock() ) { + const auto& [data, timestamp] = audio_ring_->peak(); + result.stamp = chrono_to_ros_msg(timestamp); + + inference_(data, result); + + // Print warning if inference takes too long for audio size + auto duration = std::chrono::milliseconds(result.inference_duration); + auto max_runtime_for_audio_size = whisper::count_to_time(data.size()); + if ( duration > max_runtime_for_audio_size ){ + auto timeout_duration_ms = max_runtime_for_audio_size.count(); + RCLCPP_WARN(get_logger(), + "Inference took longer than audio buffer size. This leads to un-inferenced audio " + "data. Consider increasing thread number or compile with accelerator support. \n " + "\t Inference Duration: %lld, Timeout after %lld", + static_cast(duration.count()), + static_cast(timeout_duration_ms)); + } + whisper_mutex_.unlock(); + return true; + } else { + RCLCPP_INFO(get_logger(), "Whisper.cpp busy, skipping inference"); + return false; } - return true; } void Inference::on_audio_debug_print_(const std_msgs::msg::Int16MultiArray::SharedPtr msg) { From 3983d1bfb1e198475308a9aacdbc465578ff6443 Mon Sep 17 00:00:00 2001 From: Nathan Corral Date: Wed, 11 Dec 2024 11:34:06 +0100 Subject: [PATCH 4/7] Removed un-needed constructors --- .../include/transcript_manager/tokens.hpp | 29 ------------------- 1 file changed, 29 deletions(-) diff --git a/transcript_manager/include/transcript_manager/tokens.hpp b/transcript_manager/include/transcript_manager/tokens.hpp index eb21ef2..a6576a8 100644 --- a/transcript_manager/include/transcript_manager/tokens.hpp +++ b/transcript_manager/include/transcript_manager/tokens.hpp @@ -13,7 +13,6 @@ class SingleToken { private: std::string data_; float prob_; - int token_id_; public: std::string get_data() const { @@ -26,34 +25,6 @@ class SingleToken { SingleToken(const std::string& data_, float prob_) : data_(data_), prob_(prob_) {}; - - // Copy constructor - SingleToken(const SingleToken& other) - : data_(other.data_), prob_(other.prob_), token_id_(other.token_id_) {}; - - // Move constructor - SingleToken(SingleToken&& other) noexcept - : data_(std::move(other.data_)), prob_(other.prob_), token_id_(other.token_id_) {}; - - // Copy assignment operator - SingleToken& operator=(const SingleToken& other) { - if ( this != &other ) { - data_ = other.data_; - prob_ = other.prob_; - token_id_ = other.token_id_; - } - return *this; - } - - // Move assignment operator - SingleToken& operator=(SingleToken&& other) noexcept { - if ( this != &other ) { - data_ = std::move(other.data_); - prob_ = other.prob_; - token_id_ = other.token_id_; - } - return *this; - } }; } // end of namespace whisper From d7c1ac49dc7befc6c64aef0de2499f83a2237502 Mon Sep 17 00:00:00 2001 From: Nathan Corral Date: Wed, 11 Dec 2024 15:13:04 +0100 Subject: [PATCH 5/7] Update to version 1.4.0 --- CHANGELOG.rst | 35 +++++++++++++++++++++++++ audio_listener/package.xml | 2 +- audio_listener/setup.py | 2 +- transcript_manager/package.xml | 6 ++--- whisper_bringup/launch/replay.launch.py | 9 ------- whisper_bringup/package.xml | 2 +- whisper_cpp_vendor/package.xml | 2 +- whisper_demos/package.xml | 2 +- whisper_demos/setup.py | 2 +- whisper_idl/package.xml | 2 +- whisper_server/package.xml | 2 +- whisper_util/package.xml | 2 +- 12 files changed, 47 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c7a3c45..b24ec4f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,8 +1,43 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Changelog for package ROS 2 Whisper ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +## 1.4.0 (2024-12-11) + +- `whisper_cpp_vendor`: `whisper.cpp` 1.6.2 to 1.7.2 release, build changes + +- Added live audio transcription streaming + +- `whisper_server`: changes: + - Holding incoming Audio data in a Ring Buffer (removed BatchBuffer, drop oldest audio). + - Transcribing the entire buffer of audio data with whisper.cpp on a timer interrupt + - Publishing the resulting tokens + probabilities on topic `/whisper/tokens` + - Removing the Action Server + - New Node Parameters: + - `active` -- Boolean to control if whisper.cpp should be run or not. + - `callback_ms` -- Integer controlling how often whisper.cpp is called. + - `buffer_capacity` -- Integer number of seconds previous where audio is transcribed. + +- `transcript_manager`: Package added to: + + - Store record of what was previously transcribed. + - Track what is currently being transcribed. Align and update the text from subscribed topic `/whisper/tokens`. + - Updates done on timer interrupt + - Host the Action Server which was previously part of `whisper_server` + - Publish the entire transcript (previous and current) under `/whisper/transcript_stream` + - Published transcript contains text and estimated segment markings, segment timestamps + +- `whisper_demos`: Add `stream` node + +- `whisper_idl`: Added `msg/WhisperTokens.msg`, `msg/AudioTranscript.msg` + +- `whisper_idl`: Added `launch/replay.launch.py` which does not bring up `audio_listener` + +- `whisper_util`: Changes to directly inference and then serialize whisper.cpp model output, also containing probability data. + 1.3.1 (2024-07-01) ------------------ + * `whisper_msgs`: Changed to `whisper_idl` package * `whisper_bringup`: Changed executor to `MultiThreadedExecutor` so audio and inference can run in parallel on `whisper_server` diff --git a/audio_listener/package.xml b/audio_listener/package.xml index d830bf3..b31b2bb 100644 --- a/audio_listener/package.xml +++ b/audio_listener/package.xml @@ -2,7 +2,7 @@ audio_listener - 1.3.1 + 1.4.0 Audio common replica. mhubii MIT diff --git a/audio_listener/setup.py b/audio_listener/setup.py index 37cc54d..7a8ee6c 100644 --- a/audio_listener/setup.py +++ b/audio_listener/setup.py @@ -4,7 +4,7 @@ setup( name=package_name, - version="1.3.1", + version="1.4.0", packages=find_packages(exclude=["test"]), data_files=[ ("share/ament_index/resource_index/packages", ["resource/" + package_name]), diff --git a/transcript_manager/package.xml b/transcript_manager/package.xml index fb7bd29..9b99cf4 100644 --- a/transcript_manager/package.xml +++ b/transcript_manager/package.xml @@ -2,10 +2,10 @@ transcript_manager - 0.0.1 + 1.4.0 Fuse overlapping whisper inference results into a single transcirpt. - nathan - Apache-2.0 + nathan + MIT ament_cmake diff --git a/whisper_bringup/launch/replay.launch.py b/whisper_bringup/launch/replay.launch.py index 4efd7b4..30598c1 100644 --- a/whisper_bringup/launch/replay.launch.py +++ b/whisper_bringup/launch/replay.launch.py @@ -20,15 +20,6 @@ def generate_launch_description() -> LaunchDescription: ld = LaunchDescription() - # launch audio listener - # ld.add_action( - # Node( - # package="audio_listener", - # executable="audio_listener", - # output="screen", - # ) - # ) - # launch whisper whisper_config = os.path.join( get_package_share_directory("whisper_server"), "config", "whisper.yaml" diff --git a/whisper_bringup/package.xml b/whisper_bringup/package.xml index 2d9d12f..3a71952 100644 --- a/whisper_bringup/package.xml +++ b/whisper_bringup/package.xml @@ -2,7 +2,7 @@ whisper_bringup - 1.3.1 + 1.4.0 TODO: Package description mhubii MIT diff --git a/whisper_cpp_vendor/package.xml b/whisper_cpp_vendor/package.xml index b5d90b2..79c6077 100644 --- a/whisper_cpp_vendor/package.xml +++ b/whisper_cpp_vendor/package.xml @@ -2,7 +2,7 @@ whisper_cpp_vendor - 1.3.1 + 1.4.0 Vendor package for whisper.cpp. mhubii MIT diff --git a/whisper_demos/package.xml b/whisper_demos/package.xml index aa8b19e..e1316ec 100644 --- a/whisper_demos/package.xml +++ b/whisper_demos/package.xml @@ -2,7 +2,7 @@ whisper_demos - 1.3.1 + 1.4.0 Demos for using the ROS 2 whisper package. mhubii MIT diff --git a/whisper_demos/setup.py b/whisper_demos/setup.py index 06f9059..78fe9b3 100644 --- a/whisper_demos/setup.py +++ b/whisper_demos/setup.py @@ -4,7 +4,7 @@ setup( name=package_name, - version="1.3.1", + version="1.4.0", packages=find_packages(exclude=["test"]), data_files=[ ("share/ament_index/resource_index/packages", ["resource/" + package_name]), diff --git a/whisper_idl/package.xml b/whisper_idl/package.xml index 79a0331..730ceda 100644 --- a/whisper_idl/package.xml +++ b/whisper_idl/package.xml @@ -2,7 +2,7 @@ whisper_idl - 1.3.1 + 1.4.0 Messages for the ROS 2 whisper package mhubii MIT diff --git a/whisper_server/package.xml b/whisper_server/package.xml index 27fdefa..62b770c 100644 --- a/whisper_server/package.xml +++ b/whisper_server/package.xml @@ -2,7 +2,7 @@ whisper_server - 1.3.1 + 1.4.0 ROS 2 whisper.cpp inference server. mhubii MIT diff --git a/whisper_util/package.xml b/whisper_util/package.xml index bb5d697..b781658 100644 --- a/whisper_util/package.xml +++ b/whisper_util/package.xml @@ -2,7 +2,7 @@ whisper_util - 1.3.1 + 1.4.0 ROS 2 wrapper for whisper.cpp. mhubii MIT From c64e31a33e376baf13aa1fe3843ca033702c16cf Mon Sep 17 00:00:00 2001 From: Nathan Corral Date: Wed, 11 Dec 2024 16:01:01 +0100 Subject: [PATCH 6/7] Fix changelog from markdown to reStructured text --- CHANGELOG.rst | 55 +++++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b24ec4f..cf471ed 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,38 +2,33 @@ Changelog for package ROS 2 Whisper ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -## 1.4.0 (2024-12-11) - -- `whisper_cpp_vendor`: `whisper.cpp` 1.6.2 to 1.7.2 release, build changes - -- Added live audio transcription streaming - -- `whisper_server`: changes: - - Holding incoming Audio data in a Ring Buffer (removed BatchBuffer, drop oldest audio). - - Transcribing the entire buffer of audio data with whisper.cpp on a timer interrupt - - Publishing the resulting tokens + probabilities on topic `/whisper/tokens` - - Removing the Action Server - - New Node Parameters: - - `active` -- Boolean to control if whisper.cpp should be run or not. - - `callback_ms` -- Integer controlling how often whisper.cpp is called. - - `buffer_capacity` -- Integer number of seconds previous where audio is transcribed. - -- `transcript_manager`: Package added to: - - - Store record of what was previously transcribed. - - Track what is currently being transcribed. Align and update the text from subscribed topic `/whisper/tokens`. - - Updates done on timer interrupt - - Host the Action Server which was previously part of `whisper_server` - - Publish the entire transcript (previous and current) under `/whisper/transcript_stream` - - Published transcript contains text and estimated segment markings, segment timestamps - -- `whisper_demos`: Add `stream` node - -- `whisper_idl`: Added `msg/WhisperTokens.msg`, `msg/AudioTranscript.msg` +1.4.0 (2024-12-11) +------------------ -- `whisper_idl`: Added `launch/replay.launch.py` which does not bring up `audio_listener` +* `whisper_cpp_vendor`: `whisper.cpp` 1.6.2 to 1.7.2 release, build changes +* Added live audio transcription streaming +* `whisper_server` changes: + * Holding incoming Audio data in a Ring Buffer (removed BatchBuffer, drop oldest audio). + * Transcribing the entire buffer of audio data with whisper.cpp on a timer interrupt + * Publishing the resulting tokens + probabilities on topic `/whisper/tokens` + * Removing the Action Server + * New Node Parameters: + * `active` -- Boolean to control if whisper.cpp should be run or not. + * `callback_ms` -- Integer controlling how often whisper.cpp is called. + * `buffer_capacity` -- Integer number of seconds previous where audio is transcribed. +* `transcript_manager` package added to: + * Store record of what was previously transcribed. + * Track what is currently being transcribed. Align and update the text from subscribed topic `/whisper/tokens`. + * Updates done on timer interrupt + * Host the Action Server which was previously part of `whisper_server` + * Publish the entire transcript (previous and current) under `/whisper/transcript_stream` + * Published transcript contains text and estimated segment markings, segment timestamps +* `whisper_demos`: Add `stream` node +* `whisper_idl`: + * Added `msg/WhisperTokens.msg`, `msg/AudioTranscript.msg` + * Added `launch/replay.launch.py` which does not bring up `audio_listener` +* `whisper_util`: Changes to directly inference and then serialize whisper.cpp model output, also containing probability data. -- `whisper_util`: Changes to directly inference and then serialize whisper.cpp model output, also containing probability data. 1.3.1 (2024-07-01) ------------------ From 4d09999ac934b4e43c83ed0824bdd7c13424e8ab Mon Sep 17 00:00:00 2001 From: Nathan Corral Date: Thu, 12 Dec 2024 15:16:50 +0100 Subject: [PATCH 7/7] Unnecessary compiler link to runtime executables and static libraries --- whisper_cpp_vendor/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/whisper_cpp_vendor/CMakeLists.txt b/whisper_cpp_vendor/CMakeLists.txt index 8253525..110cb37 100644 --- a/whisper_cpp_vendor/CMakeLists.txt +++ b/whisper_cpp_vendor/CMakeLists.txt @@ -62,8 +62,6 @@ install( TARGETS whisper EXPORT export_whisper LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib - RUNTIME DESTINATION bin INCLUDES DESTINATION include PUBLIC_HEADER DESTINATION include )