diff --git a/cpp/ggml.c b/cpp/ggml.c
index deb746d..9c079d8 100644
--- a/cpp/ggml.c
+++ b/cpp/ggml.c
@@ -143,12 +143,6 @@ void wsp_ggml_print_backtrace(void) {
 }
 #endif
 
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 /*#define WSP_GGML_PERF*/
 #define WSP_GGML_DEBUG 0
 #define WSP_GGML_GELU_FP16
@@ -277,6 +271,12 @@ inline static void * wsp_ggml_aligned_malloc(size_t size) {
 // floating point type used to accumulate sums
 typedef double wsp_ggml_float;
 
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 //
 // global data
 //
diff --git a/cpp/whisper.cpp b/cpp/whisper.cpp
index e5130ee..a84e7dd 100644
--- a/cpp/whisper.cpp
+++ b/cpp/whisper.cpp
@@ -193,6 +193,15 @@ enum e_model {
     MODEL_LARGE,
 };
 
+static const std::map<e_model, const char *> g_model_name = {
+    { MODEL_UNKNOWN,  "unknown"  },
+    { MODEL_TINY,     "tiny"     },
+    { MODEL_BASE,     "base"     },
+    { MODEL_SMALL,    "small"    },
+    { MODEL_MEDIUM,   "medium"   },
+    { MODEL_LARGE,    "large"    },
+};
+
 static const std::map<std::string, std::pair<int, std::string>> g_lang = {
     { "en",  { 0,  "english",        } },
     { "zh",  { 1,  "chinese",        } },
@@ -293,6 +302,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
     { "ba",  { 96, "bashkir",        } },
     { "jw",  { 97, "javanese",       } },
     { "su",  { 98, "sundanese",      } },
+    { "yue", { 99, "cantonese",      } },
 };
 
 static const size_t MB = 1ull*1024*1024;
@@ -402,7 +412,11 @@ struct whisper_vocab {
     id token_beg        = 50363; // begin timestamps
 
     bool is_multilingual() const {
-        return n_vocab == 51865;
+        return n_vocab >= 51865;
+    }
+
+    int num_languages() const {
+        return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
     }
 };
 
@@ -922,6 +936,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
         assert(hparams.n_text_state == hparams.n_audio_state);
 
+        std::string mver = "";
+
         if (hparams.n_audio_layer == 4) {
             model.type = e_model::MODEL_TINY;
         }
@@ -940,6 +956,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
         if (hparams.n_audio_layer == 32) {
             model.type = e_model::MODEL_LARGE;
+
+            if (hparams.n_vocab == 51866) {
+                mver = " v3";
+            }
         }
 
         const int32_t qntvr = hparams.ftype / WSP_GGML_QNT_VERSION_FACTOR;
@@ -968,7 +988,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         log("%s: n_mels        = %d\n", __func__, hparams.n_mels);
         log("%s: ftype         = %d\n", __func__, model.hparams.ftype);
         log("%s: qntvr         = %d\n", __func__, qntvr);
-        log("%s: type          = %d\n", __func__, model.type);
+        log("%s: type          = %d (%s%s)\n", __func__, model.type, g_model_name.at(model.type).c_str(), mver.c_str());
 
         // print memory requirements
         {
@@ -1039,13 +1059,17 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         if (vocab.is_multilingual()) {
             vocab.token_eot++;
             vocab.token_sot++;
-            vocab.token_translate++;
-            vocab.token_transcribe++;
-            vocab.token_solm++;
-            vocab.token_prev++;
-            vocab.token_nosp++;
-            vocab.token_not++;
-            vocab.token_beg++;
+
+            // account for variable number of language tokens
+            const int dt = vocab.num_languages() - 98;
+
+            vocab.token_translate  += dt;
+            vocab.token_transcribe += dt;
+            vocab.token_solm       += dt;
+            vocab.token_prev       += dt;
+            vocab.token_nosp       += dt;
+            vocab.token_not        += dt;
+            vocab.token_beg        += dt;
         }
 
         if (n_vocab < model.hparams.n_vocab) {
@@ -1074,6 +1098,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                 vocab.id_to_token[i] = word;
             }
         }
+
+        log("%s: n_langs       = %d\n", __func__, vocab.num_languages());
     }
 
     size_t ctx_size = 0;
@@ -3285,7 +3311,7 @@ void whisper_free_params(struct whisper_full_params * params) {
 }
 
 int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
         log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
@@ -3299,7 +3325,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
 
 // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
 int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
         log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
@@ -3322,13 +3348,13 @@ int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float *
 
 // TODO
 int whisper_set_mel_with_state(
-        struct whisper_context * /*ctx*/,
+        struct whisper_context * ctx,
        struct whisper_state * state,
        const float * data,
        int n_len,
        int n_mel) {
-    if (n_mel != WHISPER_N_MEL) {
-        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
+    if (n_mel != ctx->model.filters.n_mel) {
+        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, ctx->model.filters.n_mel);
         return -1;
     }
 
diff --git a/cpp/whisper.h b/cpp/whisper.h
index cf61955..eea563c 100644
--- a/cpp/whisper.h
+++ b/cpp/whisper.h
@@ -29,7 +29,6 @@
 
 #define WHISPER_SAMPLE_RATE 16000
 #define WHISPER_N_FFT       400
-#define WHISPER_N_MEL       80
 #define WHISPER_HOP_LENGTH  160
 #define WHISPER_CHUNK_SIZE  30
 
diff --git a/scripts/whisper.cpp.patch b/scripts/whisper.cpp.patch
index d2b7130..44e6316 100644
--- a/scripts/whisper.cpp.patch
+++ b/scripts/whisper.cpp.patch
@@ -1,6 +1,6 @@
---- whisper.cpp.orig	2023-11-07 08:15:04
-+++ whisper.cpp	2023-11-07 08:20:02
-@@ -2855,7 +2855,9 @@
+--- whisper.cpp.orig	2023-11-08 05:39:06
++++ whisper.cpp	2023-11-08 05:39:07
+@@ -2881,7 +2881,9 @@
          log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
      }
  
@@ -10,7 +10,7 @@
      const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
  
      log("%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
-@@ -2870,6 +2872,7 @@
+@@ -2896,6 +2898,7 @@
  #endif
      } else {
          log("%s: Core ML model loaded\n", __func__);
@@ -18,7 +18,7 @@
      }
  #endif
  
-@@ -3048,6 +3051,7 @@
+@@ -3074,6 +3077,7 @@
  struct whisper_context_params whisper_context_default_params() {
      struct whisper_context_params result = {
          /*.use_gpu =*/ true,
diff --git a/scripts/whisper.h.patch b/scripts/whisper.h.patch
index b21ef44..9df401a 100644
--- a/scripts/whisper.h.patch
+++ b/scripts/whisper.h.patch
@@ -1,6 +1,6 @@
---- whisper.h.orig	2023-11-07 08:14:57
-+++ whisper.h	2023-11-07 08:16:07
-@@ -81,6 +81,7 @@
+--- whisper.h.orig	2023-11-08 05:39:06
++++ whisper.h	2023-11-08 05:39:07
+@@ -80,6 +80,7 @@
  
  struct whisper_context_params {
      bool use_gpu;
diff --git a/src/version.json b/src/version.json
index f564e33..9b64601 100644
--- a/src/version.json
+++ b/src/version.json
@@ -1 +1 @@
-{"version":"1.4.2"}
\ No newline at end of file
+{"version":"1.4.3"}
\ No newline at end of file
diff --git a/whisper.cpp b/whisper.cpp
index 11b5030..6a5d195 160000
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1 +1 @@
-Subproject commit 11b503055e1810afd45127b626d823fa7d15d531
+Subproject commit 6a5d195109994b865e1c92a88258ac182399eb64