
Commit 7faec99
feat: sync whisper.cpp
jhen0409 committed Nov 7, 2023
1 parent 20c4401 commit 7faec99
Showing 7 changed files with 56 additions and 31 deletions.
12 changes: 6 additions & 6 deletions cpp/ggml.c
@@ -143,12 +143,6 @@ void wsp_ggml_print_backtrace(void) {
 }
 #endif
 
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 /*#define WSP_GGML_PERF*/
 #define WSP_GGML_DEBUG 0
 #define WSP_GGML_GELU_FP16
@@ -277,6 +271,12 @@ inline static void * wsp_ggml_aligned_malloc(size_t size) {
 // floating point type used to accumulate sums
 typedef double wsp_ggml_float;
 
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 //
 // global data
 //
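A note on this hunk: it only moves the MIN/MAX definitions below the wsp_ggml_float typedef. Since macros expand textually at the point of use, relocating them is behavior-preserving as long as they still precede the first use. The #undef guard matters because some platform headers (e.g. sys/param.h on BSD/glibc) already define MIN/MAX. A minimal standalone sketch of the pattern (illustrative, not part of the diff):

    #include <stdio.h>

    // Redefine unconditionally: #undef first, so a prior definition
    // from a system header cannot trigger a redefinition warning.
    #undef MIN
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
        printf("%d\n", MIN(3, 7)); // prints 3
        return 0;
    }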
54 changes: 40 additions & 14 deletions cpp/whisper.cpp
@@ -193,6 +193,15 @@ enum e_model {
     MODEL_LARGE,
 };
 
+static const std::map<e_model, std::string> g_model_name = {
+    { MODEL_UNKNOWN,  "unknown"  },
+    { MODEL_TINY,     "tiny"     },
+    { MODEL_BASE,     "base"     },
+    { MODEL_SMALL,    "small"    },
+    { MODEL_MEDIUM,   "medium"   },
+    { MODEL_LARGE,    "large"    },
+};
+
 static const std::map<std::string, std::pair<int, std::string>> g_lang = {
     { "en", { 0, "english", } },
     { "zh", { 1, "chinese", } },
@@ -293,6 +302,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
     { "ba", { 96, "bashkir", } },
     { "jw", { 97, "javanese", } },
     { "su", { 98, "sundanese", } },
+    { "yue", { 99, "cantonese", } },
 };
 
 static const size_t MB = 1ull*1024*1024;
@@ -402,7 +412,11 @@ struct whisper_vocab {
     id token_beg = 50363; // begin timestamps
 
     bool is_multilingual() const {
-        return n_vocab == 51865;
+        return n_vocab >= 51865;
     }
+
+    int num_languages() const {
+        return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
+    }
 };
 
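The arithmetic here is worth spelling out. Using the vocab sizes these models actually ship (51865 for earlier multilingual checkpoints, 51866 for large-v3, per the model-load hunks further down), a quick standalone check of num_languages(), mirroring the two methods above:

    #include <cstdio>

    // Mirrors whisper_vocab::is_multilingual() / num_languages() above.
    static bool is_multilingual(int n_vocab) { return n_vocab >= 51865; }
    static int  num_languages(int n_vocab)   { return n_vocab - 51765 - (is_multilingual(n_vocab) ? 1 : 0); }

    int main() {
        printf("%d\n", num_languages(51865)); // earlier multilingual models: 99 languages
        printf("%d\n", num_languages(51866)); // large-v3: 100 languages ("yue" is id 99 above)
        return 0;
    }

This is also why the check relaxes from == to >=: large-v3 grows the vocabulary by one language token, so an exact comparison would have misclassified it as non-multilingual.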
@@ -922,6 +936,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
         assert(hparams.n_text_state == hparams.n_audio_state);
 
+        std::string mver = "";
+
         if (hparams.n_audio_layer == 4) {
             model.type = e_model::MODEL_TINY;
         }
@@ -940,6 +956,10 @@
 
         if (hparams.n_audio_layer == 32) {
             model.type = e_model::MODEL_LARGE;
+
+            if (hparams.n_vocab == 51866) {
+                mver = " v3";
+            }
         }
 
         const int32_t qntvr = hparams.ftype / WSP_GGML_QNT_VERSION_FACTOR;
@@ -968,7 +988,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         log("%s: n_mels = %d\n", __func__, hparams.n_mels);
         log("%s: ftype = %d\n", __func__, model.hparams.ftype);
         log("%s: qntvr = %d\n", __func__, qntvr);
-        log("%s: type = %d\n", __func__, model.type);
+        log("%s: type = %d (%s%s)\n", __func__, model.type, g_model_name.at(model.type).c_str(), mver.c_str());
 
         // print memory requirements
         {
@@ -1039,13 +1059,17 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         if (vocab.is_multilingual()) {
             vocab.token_eot++;
             vocab.token_sot++;
-            vocab.token_translate++;
-            vocab.token_transcribe++;
-            vocab.token_solm++;
-            vocab.token_prev++;
-            vocab.token_nosp++;
-            vocab.token_not++;
-            vocab.token_beg++;
+
+            // account for variable number of language tokens
+            const int dt = vocab.num_languages() - 98;
+
+            vocab.token_translate  += dt;
+            vocab.token_transcribe += dt;
+            vocab.token_solm       += dt;
+            vocab.token_prev       += dt;
+            vocab.token_nosp       += dt;
+            vocab.token_not        += dt;
+            vocab.token_beg        += dt;
         }
 
         if (n_vocab < model.hparams.n_vocab) {
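A worked instance of the new offset, using token_beg = 50363 from the whisper_vocab hunk above: pre-v3 multilingual models have num_languages() == 99, so dt == 1 and the += dt lines reproduce the old ++ behavior exactly; large-v3 has num_languages() == 100, so dt == 2, shifting every special token past the language block by one extra slot. A standalone sketch:

    #include <cstdio>

    int main() {
        const int token_beg = 50363;     // English-only default from whisper_vocab
        for (int n_langs : {99, 100}) {  // pre-v3 multilingual vs large-v3
            const int dt = n_langs - 98; // formula from the hunk: 1 for v2, 2 for v3
            printf("n_langs=%d -> token_beg=%d\n", n_langs, token_beg + dt);
        }
        return 0;
    }

which prints 50364 for the older multilingual models and 50365 for large-v3.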
@@ -1074,6 +1098,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                 vocab.id_to_token[i] = word;
             }
         }
+
+        log("%s: n_langs = %d\n", __func__, vocab.num_languages());
     }
 
     size_t ctx_size = 0;
@@ -3285,7 +3311,7 @@ void whisper_free_params(struct whisper_full_params * params) {
 }
 
 int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
         log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
@@ -3299,7 +3325,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
 
 // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
 int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
         log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
@@ -3322,13 +3348,13 @@ int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float *
 // TODO
 
 int whisper_set_mel_with_state(
-        struct whisper_context * /*ctx*/,
+        struct whisper_context * ctx,
         struct whisper_state * state,
         const float * data,
         int n_len,
         int n_mel) {
-    if (n_mel != WHISPER_N_MEL) {
-        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
+    if (n_mel != ctx->model.filters.n_mel) {
+        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, ctx->model.filters.n_mel);
         return -1;
     }
 
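The common thread in the last three hunks: the mel band count stops being the compile-time constant WHISPER_N_MEL (deleted from whisper.h below) and becomes a property of the loaded model, ctx->model.filters.n_mel, because large-v3 uses 128 mel bands where earlier models use 80. Callers that precompute their own spectrogram should therefore size it from the model rather than hardcode 80; a sketch, assuming the whisper_model_n_mels() and whisper_set_mel() functions from the public header (hypothetical usage, not part of this commit):

    #include <cstdio>
    #include <vector>
    #include "whisper.h"

    // Feed a precomputed log-mel spectrogram, sized from the loaded model
    // instead of the removed WHISPER_N_MEL constant.
    static int set_precomputed_mel(whisper_context * ctx, const std::vector<float> & mel, int n_len) {
        const int n_mel = whisper_model_n_mels(ctx); // matches ctx->model.filters.n_mel
        if ((int) mel.size() != n_len * n_mel) {
            fprintf(stderr, "mel buffer is %zu floats, expected %d\n", mel.size(), n_len * n_mel);
            return -1;
        }
        return whisper_set_mel(ctx, mel.data(), n_len, n_mel);
    }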
1 change: 0 additions & 1 deletion cpp/whisper.h
@@ -29,7 +29,6 @@
 
 #define WHISPER_SAMPLE_RATE 16000
 #define WHISPER_N_FFT 400
-#define WHISPER_N_MEL 80
 #define WHISPER_HOP_LENGTH 160
 #define WHISPER_CHUNK_SIZE 30
 
10 changes: 5 additions & 5 deletions scripts/whisper.cpp.patch
@@ -1,6 +1,6 @@
---- whisper.cpp.orig 2023-11-07 08:15:04
-+++ whisper.cpp 2023-11-07 08:20:02
-@@ -2855,7 +2855,9 @@
+--- whisper.cpp.orig 2023-11-08 05:39:06
++++ whisper.cpp 2023-11-08 05:39:07
+@@ -2881,7 +2881,9 @@
      log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
  }
 
@@ -10,15 +10,15 @@
      const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
 
      log("%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
-@@ -2870,6 +2872,7 @@
+@@ -2896,6 +2898,7 @@
  #endif
      } else {
          log("%s: Core ML model loaded\n", __func__);
 +    }
  }
  #endif
 
-@@ -3048,6 +3051,7 @@
+@@ -3074,6 +3077,7 @@
  struct whisper_context_params whisper_context_default_params() {
      struct whisper_context_params result = {
          /*.use_gpu =*/ true,
6 changes: 3 additions & 3 deletions scripts/whisper.h.patch
@@ -1,6 +1,6 @@
---- whisper.h.orig 2023-11-07 08:14:57
-+++ whisper.h 2023-11-07 08:16:07
-@@ -81,6 +81,7 @@
+--- whisper.h.orig 2023-11-08 05:39:06
++++ whisper.h 2023-11-08 05:39:07
+@@ -80,6 +80,7 @@
 
      struct whisper_context_params {
          bool use_gpu;
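Reading the two patch-file diffs together: nothing about what the fork injects changes, only the anchors move. Upstream whisper.cpp grew by 26 lines above the patched Core ML region (2855 -> 2881 and 3048 -> 3074 in the inner hunk headers), the whisper_context_params block in whisper.h moved up one line (81 -> 80), and the timestamps were regenerated, so both patches were simply re-recorded against the new line numbers.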
2 changes: 1 addition & 1 deletion src/version.json
@@ -1 +1 @@
{"version":"1.4.2"}
{"version":"1.4.3"}
2 changes: 1 addition & 1 deletion whisper.cpp
