From 2a23eedc50b9955bd718de6198f2d91580572bf6 Mon Sep 17 00:00:00 2001
From: PAB
Date: Sun, 1 Oct 2023 14:51:41 +0200
Subject: [PATCH] feat: custom allocator + Encodec context (#7)

---
 encodec.cpp | 234 ++++++++++++++++++++++++++++++++++++++++++++++++----
 encodec.h   | 118 +++++---------------------
 2 files changed, 236 insertions(+), 116 deletions(-)

diff --git a/encodec.cpp b/encodec.cpp
index 65a5216..697731d 100644
--- a/encodec.cpp
+++ b/encodec.cpp
@@ -1,3 +1,4 @@
+#include <cassert>
 #include
 #include
 #include
@@ -8,12 +9,134 @@
 #include "encodec.h"
 #include "ggml.h"
+#include "ggml-alloc.h"
+
+
+static const size_t TENSOR_ALIGNMENT = 32;
+
+// res + downsample block at some ratio
+struct encodec_encoder_block {
+    // conv1
+    struct ggml_tensor * conv_1_w;
+    struct ggml_tensor * conv_1_b;
+
+    // conv2
+    struct ggml_tensor * conv_2_w;
+    struct ggml_tensor * conv_2_b;
+
+    // shortcut
+    struct ggml_tensor * conv_sc_w;
+    struct ggml_tensor * conv_sc_b;
+
+    // downsampling layers
+    struct ggml_tensor * ds_conv_w;
+    struct ggml_tensor * ds_conv_b;
+};
+
+struct encodec_lstm {
+    struct ggml_tensor * l0_ih_w;
+    struct ggml_tensor * l0_hh_w;
+
+    struct ggml_tensor * l0_ih_b;
+    struct ggml_tensor * l0_hh_b;
+
+    struct ggml_tensor * l1_ih_w;
+    struct ggml_tensor * l1_hh_w;
+
+    struct ggml_tensor * l1_ih_b;
+    struct ggml_tensor * l1_hh_b;
+};
+
+struct encodec_encoder {
+    struct ggml_tensor * init_conv_w;
+    struct ggml_tensor * init_conv_b;
+
+    encodec_lstm lstm;
+
+    struct ggml_tensor * final_conv_w;
+    struct ggml_tensor * final_conv_b;
+
+    std::vector<encodec_encoder_block> blocks;
+};
+
+struct encodec_quant_block {
+    struct ggml_tensor * inited;
+    struct ggml_tensor * cluster_size;
+    struct ggml_tensor * embed;
+    struct ggml_tensor * embed_avg;
+};
+
+struct encodec_quantizer {
+    std::vector<encodec_quant_block> blocks;
+};
+
+struct encodec_decoder_block {
+    //upsampling layers
+    struct ggml_tensor * us_conv_w;
+    struct ggml_tensor * us_conv_b;
+
+    // conv1
+    struct ggml_tensor * conv_1_w;
+    struct ggml_tensor * conv_1_b;
+
+    // conv2
+    struct ggml_tensor * conv_2_w;
+    struct ggml_tensor * conv_2_b;
+
+    // shortcut
+    struct ggml_tensor * conv_sc_w;
+    struct ggml_tensor * conv_sc_b;
+};
+
+struct encodec_decoder {
+    struct ggml_tensor * init_conv_w;
+    struct ggml_tensor * init_conv_b;
+
+    encodec_lstm lstm;
+
+    struct ggml_tensor * final_conv_w;
+    struct ggml_tensor * final_conv_b;
+
+    std::vector<encodec_decoder_block> blocks;
+};
+
+struct encodec_model {
+    encodec_hparams hparams;
+
+    encodec_encoder encoder;
+    encodec_quantizer quantizer;
+    encodec_decoder decoder;
+
+    // context
+    struct ggml_context * ctx;
+    int n_loaded;
+
+    std::map<std::string, struct ggml_tensor *> tensors;
+};
 
 template<typename T>
 static void read_safe(std::ifstream& infile, T& dest) {
     infile.read((char*)& dest, sizeof(T));
 }
 
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
+static void ggml_disconnect_node_from_graph(ggml_tensor * t) {
+    t->op = GGML_OP_NONE;
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        t->src[i] = NULL;
+    }
+}
+
 static void encodec_sigmoid_impl(struct ggml_tensor * dst, const struct ggml_tensor * src, int ith, int nth, void * userdata) {
     GGML_ASSERT(userdata == NULL);
     GGML_ASSERT(ggml_are_same_shape(dst, src));
@@ -72,9 +195,9 @@ static struct ggml_tensor * unpad_1d(ggml_context * ctx0, ggml_tensor * inp, int padding_left, int padding_right) {
     int length = inp->ne[0];
    int dim = inp->ne[1];
 
-    ENCODEC_ASSERT(padding_left >= 0);
-    ENCODEC_ASSERT(padding_right >= 0);
-    ENCODEC_ASSERT(padding_left + padding_right <= length);
+    assert(padding_left >= 0);
+    assert(padding_right >= 0);
+    assert(padding_left + padding_right <= length);
 
     int end = length - padding_right;
 
@@ -515,20 +638,27 @@ bool encodec_model_load(const std::string& fname, encodec_model& model) {
     return true;
 }
 
-static void encodec_model_eval(
-        std::vector<float>& raw_audio,
-        encodec_model& model,
-        int n_threads) {
-    static size_t buf_size = 512u*MB;
-    static void * buf = malloc(buf_size);
+static struct ggml_cgraph * encodec_build_graph(
+        encodec_context & ectx,
+        const std::vector<float> & inp_audio) {
+    const int32_t audio_length = inp_audio.size();
 
-    struct ggml_init_params params = { buf_size, buf, false };
+    const auto & model = ectx.model;
 
-    struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {};
+    struct ggml_init_params ggml_params = {
+        /*.mem_size   =*/ ectx.buf_compute.size(),
+        /*.mem_buffer =*/ ectx.buf_compute.data(),
+        /*.no_alloc   =*/ true, // skip allocating as we use ggml_alloc to allocate exact memory requirements
+    };
 
-    struct ggml_tensor * inp = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, raw_audio.size());
-    memcpy(inp->data, raw_audio.data(), raw_audio.size()*ggml_element_size(inp));
+    struct ggml_context * ctx0 = ggml_init(ggml_params);
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * inp = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, audio_length);
+    ggml_allocr_alloc(ectx.allocr, inp);
+    if (!ggml_allocr_is_measure(ectx.allocr)) {
+        memcpy(inp->data, inp_audio.data(), audio_length*ggml_element_size(inp));
+    }
 
     // encoder
     struct ggml_tensor * encoded_inp;
@@ -662,6 +792,7 @@ static void encodec_model_eval(
     const int n_q = codes->ne[1];
 
     quantized_out = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hidden_dim, seq_length);
+    quantized_out = ggml_set_zero(quantized_out);
 
     for (int i = 0; i < n_q; i++) {
         encodec_quant_block block = model.quantizer.blocks[i];
@@ -716,11 +847,11 @@ static void encodec_model_eval(
             ctx0, inpL, block.us_conv_w, block.us_conv_b, ratios[layer_ix]);
 
         struct ggml_tensor * current = inpL;
-        
+
        // shortcut
         struct ggml_tensor * shortcut = strided_conv_1d(
             ctx0, inpL, block.conv_sc_w, block.conv_sc_b, stride);
-        
+
        // conv1
         current = ggml_elu(ctx0, current);
@@ -748,8 +879,75 @@ static void encodec_model_eval(
         out = decoded_inp;
     }
 
-    ggml_build_forward_expand (&gf, out);
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+    out = ggml_cpy(ectx.ctx_audio, out, ectx.reconstructed_audio);
+
+    ggml_build_forward_expand(gf, out);
+    ggml_disconnect_node_from_graph(ectx.reconstructed_audio);
 
     ggml_free(ctx0);
+
+    return gf;
+}
+
+static bool encodec_model_eval(
+        std::vector<float> & raw_audio,
+        encodec_context & ectx,
+        int n_threads) {
+    const int64_t t_start_ms = ggml_time_ms();
+
+    fprintf(stderr, "%s: raw audio (t=%zu)\n", __func__, raw_audio.size());
+
+    static const size_t buf_size = 256u*1024*1024;
+
+    if (ectx.ctx_audio) {
+        ggml_free(ectx.ctx_audio);
+        ectx.ctx_audio = {};
+    }
+
+    struct ggml_init_params ggml_params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+
+    ectx.ctx_audio = ggml_init(ggml_params);
+
+    ectx.reconstructed_audio = ggml_new_tensor_1d(ectx.ctx_audio, GGML_TYPE_F32, raw_audio.size());
+
+    // reconstruct the audio
+    ectx.buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+    ectx.allocr = ggml_allocr_new_measure(TENSOR_ALIGNMENT);
+    struct ggml_cgraph * gf_measure = encodec_build_graph(ectx, raw_audio);
+    if (!gf_measure) {
+        fprintf(stderr, "%s: failed to build graph\n", __func__);
+        return false;
+    }
+
+    size_t alloc_size = ggml_allocr_alloc_graph(ectx.allocr, gf_measure) + TENSOR_ALIGNMENT;
+    ggml_allocr_free(ectx.allocr);
+
+    // recreate allocator with exact memory requirements
+    ectx.buf_alloc.resize(alloc_size);
+    ectx.allocr = ggml_allocr_new(ectx.buf_alloc.data(), ectx.buf_alloc.size(), TENSOR_ALIGNMENT);
+
+    // compute the graph with the measured exact memory requirements from above
+    ggml_allocr_reset(ectx.allocr);
+
+    struct ggml_cgraph * gf = encodec_build_graph(ectx, raw_audio);
+    if (!gf) {
+        fprintf(stderr, "%s: failed to build graph\n", __func__);
+        return false;
+    }
+
+    ggml_allocr_alloc_graph(ectx.allocr, gf);
+
+    ggml_graph_compute_helper(ectx.work_buffer, gf, n_threads);
+
+    ggml_allocr_free(ectx.allocr);
+    ectx.allocr = NULL;
+    ectx.work_buffer.clear();
+
+    ectx.t_compute_ms = ggml_time_ms() - t_start_ms;
+
+    return true;
 }
diff --git a/encodec.h b/encodec.h
index 854aeec..a7314d3 100644
--- a/encodec.h
+++ b/encodec.h
@@ -13,14 +13,6 @@
 #define ENCODEC_FILE_MAGIC 'ggml'
 #define ENCODEC_FILE_VERSION 1
 
-#define ENCODEC_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "ENCODEC_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
 static const size_t MB = 4*1024*1024;
 
 struct encodec_hparams {
@@ -41,102 +33,32 @@ struct encodec_hparams {
     int32_t sr = 24000;
 };
 
-// res + downsample block at some ratio
-struct encodec_encoder_block {
-    // conv1
-    struct ggml_tensor * conv_1_w;
-    struct ggml_tensor * conv_1_b;
-
-    // conv2
-    struct ggml_tensor * conv_2_w;
-    struct ggml_tensor * conv_2_b;
-
-    // shortcut
-    struct ggml_tensor * conv_sc_w;
-    struct ggml_tensor * conv_sc_b;
-
-    // downsampling layers
-    struct ggml_tensor * ds_conv_w;
-    struct ggml_tensor * ds_conv_b;
-};
-
-struct encodec_lstm {
-    struct ggml_tensor * l0_ih_w;
-    struct ggml_tensor * l0_hh_w;
-
-    struct ggml_tensor * l0_ih_b;
-    struct ggml_tensor * l0_hh_b;
-
-    struct ggml_tensor * l1_ih_w;
-    struct ggml_tensor * l1_hh_w;
-
-    struct ggml_tensor * l1_ih_b;
-    struct ggml_tensor * l1_hh_b;
-};
-
-struct encodec_encoder {
-    struct ggml_tensor * init_conv_w;
-    struct ggml_tensor * init_conv_b;
-
-    encodec_lstm lstm;
-
-    struct ggml_tensor * final_conv_w;
-    struct ggml_tensor * final_conv_b;
-
-    std::vector<encodec_encoder_block> blocks;
-};
-
-struct encodec_quant_block {
-    struct ggml_tensor * inited;
-    struct ggml_tensor * cluster_size;
-    struct ggml_tensor * embed;
-    struct ggml_tensor * embed_avg;
-};
-
-struct encodec_quantizer {
-    std::vector<encodec_quant_block> blocks;
-};
-
-struct encodec_decoder_block {
-    //upsampling layers
-    struct ggml_tensor * us_conv_w;
-    struct ggml_tensor * us_conv_b;
-
-    // conv1
-    struct ggml_tensor * conv_1_w;
-    struct ggml_tensor * conv_1_b;
-
-    // conv2
-    struct ggml_tensor * conv_2_w;
-    struct ggml_tensor * conv_2_b;
+struct encodec_model;
 
-    // shortcut
-    struct ggml_tensor * conv_sc_w;
-    struct ggml_tensor * conv_sc_b;
-};
-
-struct encodec_decoder {
-    struct ggml_tensor * init_conv_w;
-    struct ggml_tensor * init_conv_b;
+struct encodec_context {
+    encodec_context(encodec_model & model) : model(model) {}
 
-    encodec_lstm lstm;
+    ~encodec_context() {
+        if (model_owner) {
+            delete &model;
+        }
+    }
 
-    struct ggml_tensor * final_conv_w;
-    struct ggml_tensor * final_conv_b;
+    encodec_model & model;
+    bool model_owner = false;
 
-    std::vector<encodec_decoder_block> blocks;
-};
+    struct ggml_context * ctx_audio;
+    struct ggml_tensor * reconstructed_audio;
 
-struct encodec_model {
-    encodec_hparams hparams;
+    // buffer for `ggml_graph_plan.work_data`
+    std::vector<uint8_t> work_buffer;
 
-    encodec_encoder encoder;
-    encodec_quantizer quantizer;
-    encodec_decoder decoder;
+    // buffers to evaluate the model
+    std::vector<uint8_t> buf_alloc;
+    std::vector<uint8_t> buf_compute;
 
-    // context
-    struct ggml_context * ctx;
-    int n_loaded;
+    struct ggml_allocr * allocr = {};
 
-    std::map<std::string, struct ggml_tensor *> tensors;
+    // statistics
+    int64_t t_compute_ms = 0;
 };
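
Usage sketch (not part of the patch). After this change the call sequence is: load the model, wrap it in an encodec_context, then call encodec_model_eval(), which builds the ggml graph twice, first under a measure allocator to size buf_alloc and then against the exact-size ggml_allocr that backs the real compute. The sketch below is hypothetical and only illustrates that sequence; it assumes it sits at the bottom of encodec.cpp (encodec_model is only forward-declared in encodec.h and encodec_model_eval() is static, so an external caller would need a public wrapper), and "ggml-model.bin" plus n_threads=4 are placeholders.

static int encodec_reconstruct_demo(std::vector<float> raw_audio) {
    // hypothetical driver: relies on the includes already present in encodec.cpp
    encodec_model model;
    if (!encodec_model_load("ggml-model.bin", model)) {   // placeholder path
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    encodec_context ectx(model);   // borrows the model; model_owner stays false
    ectx.ctx_audio = NULL;         // the struct does not default-initialize this pointer

    // measure pass, exact-size allocation and graph compute all happen inside
    if (!encodec_model_eval(raw_audio, ectx, /*n_threads=*/4)) {
        fprintf(stderr, "eval failed\n");
        return 1;
    }

    fprintf(stderr, "compute took %lld ms\n", (long long) ectx.t_compute_ms);

    // reconstructed samples: one float per input sample, GGML_TYPE_F32
    const float * audio_out = (const float *) ectx.reconstructed_audio->data;
    (void) audio_out;
    return 0;
}

The two encodec_build_graph() calls inside encodec_model_eval() are the heart of the ggml-alloc change: the first, under ggml_allocr_new_measure(), only records how much tensor memory the graph needs; the second runs against a buf_alloc of exactly alloc_size bytes before ggml_graph_compute_helper() executes the graph.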