
Commit 7faec99
feat: sync whisper.cpp
jhen0409 committed Nov 7, 2023
1 parent 20c4401 commit 7faec99
Showing 7 changed files with 56 additions and 31 deletions.
12 changes: 6 additions & 6 deletions cpp/ggml.c
@@ -143,12 +143,6 @@ void wsp_ggml_print_backtrace(void) {
 }
 #endif
 
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 /*#define WSP_GGML_PERF*/
 #define WSP_GGML_DEBUG 0
 #define WSP_GGML_GELU_FP16
@@ -277,6 +271,12 @@ inline static void * wsp_ggml_aligned_malloc(size_t size) {
 // floating point type used to accumulate sums
 typedef double wsp_ggml_float;
 
+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 //
 // global data
 //
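A note on this hunk: it only moves the MIN/MAX definitions below the wsp_ggml_float typedef. Since macros expand textually at the point of use, relocating them is behavior-preserving as long as they still precede the first use. The #undef guard matters because some platform headers (e.g. sys/param.h on BSD/glibc) already define MIN/MAX. A minimal standalone sketch of the pattern (illustrative, not part of the diff):

    #include <stdio.h>

    // Redefine unconditionally: #undef first, so a prior definition
    // from a system header cannot trigger a redefinition warning.
    #undef MIN
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
        printf("%d\n", MIN(3, 7)); // prints 3
        return 0;
    }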
54 changes: 40 additions & 14 deletions cpp/whisper.cpp
@@ -193,6 +193,15 @@ enum e_model {
     MODEL_LARGE,
 };
 
+static const std::map<e_model, std::string> g_model_name = {
+    { MODEL_UNKNOWN,  "unknown"  },
+    { MODEL_TINY,     "tiny"     },
+    { MODEL_BASE,     "base"     },
+    { MODEL_SMALL,    "small"    },
+    { MODEL_MEDIUM,   "medium"   },
+    { MODEL_LARGE,    "large"    },
+};
+
 static const std::map<std::string, std::pair<int, std::string>> g_lang = {
     { "en", { 0, "english", } },
     { "zh", { 1, "chinese", } },
@@ -293,6 +302,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
     { "ba", { 96, "bashkir", } },
     { "jw", { 97, "javanese", } },
     { "su", { 98, "sundanese", } },
+    { "yue", { 99, "cantonese", } },
 };
 
 static const size_t MB = 1ull*1024*1024;
@@ -402,7 +412,11 @@ struct whisper_vocab {
     id token_beg = 50363; // begin timestamps
 
     bool is_multilingual() const {
-        return n_vocab == 51865;
+        return n_vocab >= 51865;
     }
+
+    int num_languages() const {
+        return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
+    }
 };
 
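The arithmetic here is worth spelling out. Using the vocab sizes these models actually ship (51865 for earlier multilingual checkpoints, 51866 for large-v3, per the model-load hunks further down), a quick standalone check of num_languages(), mirroring the two methods above:

    #include <cstdio>

    // Mirrors whisper_vocab::is_multilingual() / num_languages() above.
    static bool is_multilingual(int n_vocab) { return n_vocab >= 51865; }
    static int  num_languages(int n_vocab)   { return n_vocab - 51765 - (is_multilingual(n_vocab) ? 1 : 0); }

    int main() {
        printf("%d\n", num_languages(51865)); // earlier multilingual models: 99 languages
        printf("%d\n", num_languages(51866)); // large-v3: 100 languages ("yue" is id 99 above)
        return 0;
    }

This is also why the check relaxes from == to >=: large-v3 grows the vocabulary by one language token, so an exact comparison would have misclassified it as non-multilingual.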
@@ -922,6 +936,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
         assert(hparams.n_text_state == hparams.n_audio_state);
 
+        std::string mver = "";
+
         if (hparams.n_audio_layer == 4) {
             model.type = e_model::MODEL_TINY;
         }
@@ -940,6 +956,10 @@
 
         if (hparams.n_audio_layer == 32) {
             model.type = e_model::MODEL_LARGE;
+
+            if (hparams.n_vocab == 51866) {
+                mver = " v3";
+            }
         }
 
         const int32_t qntvr = hparams.ftype / WSP_GGML_QNT_VERSION_FACTOR;
@@ -968,7 +988,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         log("%s: n_mels = %d\n", __func__, hparams.n_mels);
         log("%s: ftype = %d\n", __func__, model.hparams.ftype);
         log("%s: qntvr = %d\n", __func__, qntvr);
-        log("%s: type = %d\n", __func__, model.type);
+        log("%s: type = %d (%s%s)\n", __func__, model.type, g_model_name.at(model.type).c_str(), mver.c_str());
 
         // print memory requirements
         {
@@ -1039,13 +1059,17 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         if (vocab.is_multilingual()) {
             vocab.token_eot++;
             vocab.token_sot++;
-            vocab.token_translate++;
-            vocab.token_transcribe++;
-            vocab.token_solm++;
-            vocab.token_prev++;
-            vocab.token_nosp++;
-            vocab.token_not++;
-            vocab.token_beg++;
+
+            // account for variable number of language tokens
+            const int dt = vocab.num_languages() - 98;
+
+            vocab.token_translate  += dt;
+            vocab.token_transcribe += dt;
+            vocab.token_solm       += dt;
+            vocab.token_prev       += dt;
+            vocab.token_nosp       += dt;
+            vocab.token_not        += dt;
+            vocab.token_beg        += dt;
         }
 
         if (n_vocab < model.hparams.n_vocab) {
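A worked instance of the new offset, using token_beg = 50363 from the whisper_vocab hunk above: pre-v3 multilingual models have num_languages() == 99, so dt == 1 and the += dt lines reproduce the old ++ behavior exactly; large-v3 has num_languages() == 100, so dt == 2, shifting every special token past the language block by one extra slot. A standalone sketch:

    #include <cstdio>

    int main() {
        const int token_beg = 50363;     // English-only default from whisper_vocab
        for (int n_langs : {99, 100}) {  // pre-v3 multilingual vs large-v3
            const int dt = n_langs - 98; // formula from the hunk: 1 for v2, 2 for v3
            printf("n_langs=%d -> token_beg=%d\n", n_langs, token_beg + dt);
        }
        return 0;
    }

which prints 50364 for the older multilingual models and 50365 for large-v3.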
@@ -1074,6 +1098,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                 vocab.id_to_token[i] = word;
             }
         }
+
+        log("%s: n_langs = %d\n", __func__, vocab.num_languages());
     }
 
     size_t ctx_size = 0;
@@ -3285,7 +3311,7 @@ void whisper_free_params(struct whisper_full_params * params) {
 }
 
 int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
         log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
@@ -3299,7 +3325,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
 
 // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
 int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
         log("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
@@ -3322,13 +3348,13 @@ int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float *
 // TODO
 
 int whisper_set_mel_with_state(
-        struct whisper_context * /*ctx*/,
+        struct whisper_context * ctx,
         struct whisper_state * state,
         const float * data,
         int n_len,
         int n_mel) {
-    if (n_mel != WHISPER_N_MEL) {
-        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
+    if (n_mel != ctx->model.filters.n_mel) {
+        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, ctx->model.filters.n_mel);
         return -1;
     }
 
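The common thread in the last three hunks: the mel band count stops being the compile-time constant WHISPER_N_MEL (deleted from whisper.h below) and becomes a property of the loaded model, ctx->model.filters.n_mel, because large-v3 uses 128 mel bands where earlier models use 80. Callers that precompute their own spectrogram should therefore size it from the model rather than hardcode 80; a sketch, assuming the whisper_model_n_mels() and whisper_set_mel() functions from the public header (hypothetical usage, not part of this commit):

    #include <cstdio>
    #include <vector>
    #include "whisper.h"

    // Feed a precomputed log-mel spectrogram, sized from the loaded model
    // instead of the removed WHISPER_N_MEL constant.
    static int set_precomputed_mel(whisper_context * ctx, const std::vector<float> & mel, int n_len) {
        const int n_mel = whisper_model_n_mels(ctx); // matches ctx->model.filters.n_mel
        if ((int) mel.size() != n_len * n_mel) {
            fprintf(stderr, "mel buffer is %zu floats, expected %d\n", mel.size(), n_len * n_mel);
            return -1;
        }
        return whisper_set_mel(ctx, mel.data(), n_len, n_mel);
    }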
1 change: 0 additions & 1 deletion cpp/whisper.h
@@ -29,7 +29,6 @@
 
 #define WHISPER_SAMPLE_RATE 16000
 #define WHISPER_N_FFT 400
-#define WHISPER_N_MEL 80
 #define WHISPER_HOP_LENGTH 160
 #define WHISPER_CHUNK_SIZE 30
 
10 changes: 5 additions & 5 deletions scripts/whisper.cpp.patch
@@ -1,6 +1,6 @@
---- whisper.cpp.orig 2023-11-07 08:15:04
-+++ whisper.cpp 2023-11-07 08:20:02
-@@ -2855,7 +2855,9 @@
+--- whisper.cpp.orig 2023-11-08 05:39:06
++++ whisper.cpp 2023-11-08 05:39:07
+@@ -2881,7 +2881,9 @@
      log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
  }
 
@@ -10,15 +10,15 @@
      const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
 
      log("%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
-@@ -2870,6 +2872,7 @@
+@@ -2896,6 +2898,7 @@
  #endif
      } else {
          log("%s: Core ML model loaded\n", __func__);
 +    }
  }
  #endif
 
-@@ -3048,6 +3051,7 @@
+@@ -3074,6 +3077,7 @@
  struct whisper_context_params whisper_context_default_params() {
      struct whisper_context_params result = {
          /*.use_gpu =*/ true,
6 changes: 3 additions & 3 deletions scripts/whisper.h.patch
@@ -1,6 +1,6 @@
---- whisper.h.orig 2023-11-07 08:14:57
-+++ whisper.h 2023-11-07 08:16:07
-@@ -81,6 +81,7 @@
+--- whisper.h.orig 2023-11-08 05:39:06
++++ whisper.h 2023-11-08 05:39:07
+@@ -80,6 +80,7 @@
 
      struct whisper_context_params {
          bool use_gpu;
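Reading the two patch-file diffs together: nothing about what the fork injects changes, only the anchors move. Upstream whisper.cpp grew by 26 lines above the patched Core ML region (2855 -> 2881 and 3048 -> 3074 in the inner hunk headers), the whisper_context_params block in whisper.h moved up one line (81 -> 80), and the timestamps were regenerated, so both patches were simply re-recorded against the new line numbers.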
2 changes: 1 addition & 1 deletion src/version.json
@@ -1 +1 @@
{"version":"1.4.2"}
{"version":"1.4.3"}
2 changes: 1 addition & 1 deletion whisper.cpp
