Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: encodec forward pass #9

Merged
merged 4 commits into from
Oct 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,4 @@ endif()

target_link_libraries(${ENCODEC_LIB} PUBLIC ggml)
target_include_directories(${ENCODEC_LIB} PUBLIC .)
target_compile_features(${ENCODEC_LIB} PUBLIC cxx_std_11)
target_compile_features(${ENCODEC_LIB} PUBLIC cxx_std_14)
33 changes: 25 additions & 8 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,23 @@
- Name (char[name_length])
- Data (float[n_dims])

NOTE
Note
----
Encodec uses weight normalization for its convolutional layers. Each convolution weight
is decomposed into two tensors, identified by the suffixes weight_v and weight_g. A single
call to the hook torch._weight_norm recovers the final weight tensor of the
convolution from weight_v and weight_g. To drastically reduce the number of operations
at inference time, the ggml weights file contains only the final convolution weights and
does not store the decomposition into weight_v and weight_g.

Usage
-----

```bash
python convert.py \
--dir-model ./ggml_weights/ \
--out-dir ./ggml_weights/
```
"""
import argparse
from pathlib import Path
Expand All @@ -32,14 +41,23 @@
parser.add_argument("--out-dir", type=str, required=True)


def parse_model(checkpoint, outfile):
def parse_codec_model(checkpoint, out_dir):
"""Load encodec model checkpoint."""
outfile = open(out_dir, "wb")
outfile.write(struct.pack("i", 0x67676d6c)) # ggml magic

for name in checkpoint.keys():
if "weight_g" in name:
# the tensor has already been parsed with the corresponding "weight_v"
# tensor to form the final weights tensor of the convolution, therefore
# we skip it
continue

if "inited" in name or "cluster_size" in name or "embed_avg" in name:
# "inited", "cluster_size" and "embed_avg" tensors in quantizer are not used
# for the forward pass
continue

var_data = checkpoint[name]

if not "weight_v" in name:
Expand All @@ -49,7 +67,7 @@ def parse_model(checkpoint, outfile):
# weight_v has its corresponding magnitude tensor to rescale the weights
# of the convolutional layers. We parse both kinds of weights jointly to
# build the final weight tensor of the convolution.
base_name = name.split(".")[:-1]
base_name = name.split(".")[:-1]
weight_g_name = ".".join(base_name + ["weight_g"])
var_data_g = checkpoint[weight_g_name]

Expand All @@ -75,6 +93,8 @@ def parse_model(checkpoint, outfile):

var_data.tofile(outfile)

outfile.close()


if __name__ == "__main__":
args = parser.parse_args()
Expand All @@ -84,12 +104,9 @@ def parse_model(checkpoint, outfile):
out_dir = Path(args.out_dir)
out_dir.mkdir(exist_ok=True, parents=True)

outfile = open(out_dir / "ggml-model.bin", "wb")
outfile.write(struct.pack("i", 0x67676d6c)) # ggml magic
outfile = Path(out_dir / "ggml-model.bin")

checkpoint = torch.load(dir_model / "encodec_24khz-d7cc33bc.th", map_location="cpu")
parse_model(checkpoint, outfile)

outfile.close()
parse_codec_model(checkpoint, outfile)

print("Done.")
63 changes: 34 additions & 29 deletions encodec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <stdexcept>
#include <fstream>
#include <map>
#include <memory>
#include <string>
#include <vector>

Expand Down Expand Up @@ -139,7 +140,8 @@ static struct ggml_tensor * forward_pass_lstm_unilayer(
struct ggml_tensor * weight_ih,
struct ggml_tensor * weight_hh,
struct ggml_tensor * bias_ih,
struct ggml_tensor * bias_hh) {
struct ggml_tensor * bias_hh,
bool is_measure) {

const int input_dim = inp->ne[1];
const int hidden_dim = weight_ih->ne[1]/4;
Expand All @@ -150,8 +152,10 @@ static struct ggml_tensor * forward_pass_lstm_unilayer(
struct ggml_tensor * c_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim);
struct ggml_tensor * h_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim);

h_t = ggml_set_zero(h_t);
c_t = ggml_set_zero(c_t);
if (is_measure) {
h_t = ggml_set_zero(h_t);
c_t = ggml_set_zero(c_t);
}

struct ggml_tensor * current = ggml_cont(ctx0, ggml_transpose(ctx0, inp));

Expand All @@ -168,7 +172,7 @@ static struct ggml_tensor * forward_pass_lstm_unilayer(

struct ggml_tensor * i_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 0*sizeof(float)*hidden_dim));
struct ggml_tensor * f_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 1*sizeof(float)*hidden_dim));
struct ggml_tensor * g_t = ggml_tanh (ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 2*sizeof(float)*hidden_dim));
struct ggml_tensor * g_t = ggml_tanh (ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 2*sizeof(float)*hidden_dim));
struct ggml_tensor * o_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 3*sizeof(float)*hidden_dim));

c_t = ggml_add(ctx0, ggml_mul(ctx0, f_t, c_t), ggml_mul(ctx0, i_t, g_t));
Expand Down Expand Up @@ -207,7 +211,7 @@ static struct ggml_tensor * strided_conv_transpose_1d(
return unpadded;
}

bool encodec_model_load(const std::string& fname, encodec_model& model) {
bool encodec_load_model_weights(const std::string& fname, encodec_model& model) {
fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str());

auto infile = std::ifstream(fname, std::ios::binary);
Expand Down Expand Up @@ -459,15 +463,9 @@ bool encodec_model_load(const std::string& fname, encodec_model& model) {
model.quantizer.blocks.resize(n_q);

for (int i = 0; i < n_q; i++) {
model.quantizer.blocks[i].inited = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
model.quantizer.blocks[i].cluster_size = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_bins);
model.quantizer.blocks[i].embed = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_dim, n_bins);
model.quantizer.blocks[i].embed_avg = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_dim, n_bins);

model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.inited"] = model.quantizer.blocks[i].inited;
model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.cluster_size"] = model.quantizer.blocks[i].cluster_size;
model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.embed"] = model.quantizer.blocks[i].embed;
model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.embed_avg"] = model.quantizer.blocks[i].embed_avg;
}
}

Expand Down Expand Up @@ -529,7 +527,7 @@ bool encodec_model_load(const std::string& fname, encodec_model& model) {

infile.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

// printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);

total_size += ggml_nbytes(tensor);
model.n_loaded++;
Expand All @@ -548,7 +546,7 @@ static struct ggml_cgraph * encodec_build_graph(
const std::vector<float> & inp_audio) {
const int32_t audio_length = inp_audio.size();

const auto & model = ectx.model;
const auto & model = *ectx.model;

struct ggml_init_params ggml_params = {
/*.mem_size =*/ ectx.buf_compute.size(),
Expand Down Expand Up @@ -617,11 +615,13 @@ static struct ggml_cgraph * encodec_build_graph(

// first lstm layer
struct ggml_tensor * hs1 = forward_pass_lstm_unilayer(
ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b);
ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b,
ggml_allocr_is_measure(ectx.allocr));

// second lstm layer
struct ggml_tensor * out = forward_pass_lstm_unilayer(
ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b);
ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b,
ggml_allocr_is_measure(ectx.allocr));

inpL = ggml_add(ctx0, inpL, out);
}
Expand Down Expand Up @@ -723,7 +723,8 @@ static struct ggml_cgraph * encodec_build_graph(
const int stride = hparams.stride;

struct ggml_tensor * inpL = strided_conv_1d(
ctx0, quantized_out, model.decoder.init_conv_w, model.decoder.init_conv_b, stride);
ctx0, quantized_out, model.decoder.init_conv_w,
model.decoder.init_conv_b, stride);

// lstm
{
Expand All @@ -733,11 +734,13 @@ static struct ggml_cgraph * encodec_build_graph(

// first lstm layer
struct ggml_tensor * hs1 = forward_pass_lstm_unilayer(
ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b);
ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b,
ggml_allocr_is_measure(ectx.allocr));

// second lstm layer
struct ggml_tensor * out = forward_pass_lstm_unilayer(
ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b);
ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b,
ggml_allocr_is_measure(ectx.allocr));

inpL = ggml_add(ctx0, inpL, out);
}
Expand Down Expand Up @@ -794,7 +797,7 @@ static struct ggml_cgraph * encodec_build_graph(
return gf;
}

bool encodec_model_eval(
bool encodec_reconstruct_audio(
encodec_context & ectx,
std::vector<float> & raw_audio,
int n_threads) {
Expand Down Expand Up @@ -855,18 +858,20 @@ bool encodec_model_eval(
return true;
}

struct encodec_context encodec_new_context_with_model(encodec_model & model) {
encodec_context ctx = encodec_context(model);
return ctx;
}
std::shared_ptr<encodec_context> encodec_load_model(const std::string & model_path) {
int64_t t_start_load_us = ggml_time_us();

encodec_context ectx;

struct encodec_model encodec_load_model_from_file(std::string fname) {
encodec_model model;
if (!encodec_model_load(fname, model)) {
fprintf(stderr, "%s: failed to load model\n", __func__);
exit(0);
ectx.model = std::make_unique<encodec_model>();
if (!encodec_load_model_weights(model_path, *ectx.model)) {
fprintf(stderr, "%s: failed to load model weights from '%s'\n", __func__, model_path.c_str());
return {};
}
return model;

ectx.t_load_us = ggml_time_us() - t_start_load_us;

return std::make_unique<encodec_context>(std::move(ectx));
}

void encodec_free(encodec_context & ectx) {
Expand Down
22 changes: 4 additions & 18 deletions encodec.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,7 @@ struct encodec_encoder {
};

struct encodec_quant_block {
struct ggml_tensor * inited;
struct ggml_tensor * cluster_size;
struct ggml_tensor * embed;
struct ggml_tensor * embed_avg;
};

struct encodec_quantizer {
Expand Down Expand Up @@ -134,16 +131,7 @@ struct encodec_model {
};

struct encodec_context {
encodec_context(encodec_model & model) : model(model) {}

~encodec_context() {
if (model_owner) {
delete &model;
}
}

encodec_model & model;
bool model_owner = false;
std::unique_ptr<encodec_model> model;

struct ggml_context * ctx_audio;
struct ggml_tensor * reconstructed_audio;
Expand All @@ -158,15 +146,13 @@ struct encodec_context {
struct ggml_allocr * allocr = {};

// statistics
int64_t t_load_us = 0;
int64_t t_compute_ms = 0;
};

std::shared_ptr<encodec_context> encodec_load_model(const std::string & model_path);

struct encodec_model encodec_load_model_from_file(std::string fname);

struct encodec_context encodec_new_context_with_model(encodec_model & model);

bool encodec_model_eval(
bool encodec_reconstruct_audio(
encodec_context & ectx,
std::vector<float> & raw_audio,
int n_threads);
Expand Down
2 changes: 1 addition & 1 deletion examples/main/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ add_executable(${TARGET} main.cpp dr_wav.h)

install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE encodec.cpp ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_14)

if(MSVC)
target_compile_definitions(${TARGET} PRIVATE -D_CRT_SECURE_NO_WARNINGS=1)
Expand Down
Loading