Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: sync llama.cpp #89

Merged
merged 1 commit into from
Nov 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion android/src/main/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@ set(
${RNLLAMA_LIB_DIR}/ggml-aarch64.c
${RNLLAMA_LIB_DIR}/ggml-alloc.c
${RNLLAMA_LIB_DIR}/ggml-backend.cpp
${RNLLAMA_LIB_DIR}/ggml-backend-reg.cpp
${RNLLAMA_LIB_DIR}/ggml.c
${RNLLAMA_LIB_DIR}/ggml-cpu.c
${RNLLAMA_LIB_DIR}/ggml-cpu.cpp
${RNLLAMA_LIB_DIR}/ggml-cpu-aarch64.c
${RNLLAMA_LIB_DIR}/ggml-cpu-quants.c
${RNLLAMA_LIB_DIR}/ggml-threading.cpp
${RNLLAMA_LIB_DIR}/ggml-quants.c
${RNLLAMA_LIB_DIR}/common.cpp
${RNLLAMA_LIB_DIR}/json.hpp
Expand Down Expand Up @@ -77,7 +83,7 @@ if (${ANDROID_ABI} STREQUAL "arm64-v8a")

# https://github.com/ggerganov/llama.cpp/blob/master/docs/android.md#cross-compile-using-android-ndk
# llama.cpp will deal with the cpu features
# build_library("rnllama_v8_7" "-march=armv8.7-a")
# build_library("rnllama_v8_7" "-march=armv8.7-a")
# TODO: Add support runtime check for cpu features
# At the moment runtime check is failing.

Expand Down
216 changes: 3 additions & 213 deletions cpp/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,9 @@ static lm_ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "f16") {
return LM_GGML_TYPE_F16;
}
if (s == "bf16") {
return LM_GGML_TYPE_BF16;
}
if (s == "q8_0") {
return LM_GGML_TYPE_Q8_0;
}
Expand Down Expand Up @@ -1898,216 +1901,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
return result;
}

//
// YAML utils
//

void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
if (data.empty()) {
fprintf(stream, "%s:\n", prop_name);
return;
}

fprintf(stream, "%s: [", prop_name);
for (size_t i = 0; i < data.size() - 1; ++i) {
fprintf(stream, "%e, ", data[i]);
}
fprintf(stream, "%e]\n", data.back());
}

void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
if (data.empty()) {
fprintf(stream, "%s:\n", prop_name);
return;
}

fprintf(stream, "%s: [", prop_name);
for (size_t i = 0; i < data.size() - 1; ++i) {
fprintf(stream, "%d, ", data[i]);
}
fprintf(stream, "%d]\n", data.back());
}

void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
std::string data_str(data == NULL ? "" : data);

if (data_str.empty()) {
fprintf(stream, "%s:\n", prop_name);
return;
}

size_t pos_start = 0;
size_t pos_found = 0;

if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
data_str = "\"" + data_str + "\"";
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
return;
}

if (data_str.find('\n') == std::string::npos) {
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
return;
}

fprintf(stream, "%s: |\n", prop_name);
while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
pos_start = pos_found + 1;
}
}

void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
const auto & sparams = params.sparams;

fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
fprintf(stream, "cpu_has_arm_fma: %s\n", lm_ggml_cpu_has_arm_fma() ? "true" : "false");
fprintf(stream, "cpu_has_avx: %s\n", lm_ggml_cpu_has_avx() ? "true" : "false");
fprintf(stream, "cpu_has_avx_vnni: %s\n", lm_ggml_cpu_has_avx_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_avx2: %s\n", lm_ggml_cpu_has_avx2() ? "true" : "false");
fprintf(stream, "cpu_has_avx512: %s\n", lm_ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", lm_ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", lm_ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_cuda: %s\n", lm_ggml_cpu_has_cuda() ? "true" : "false");
fprintf(stream, "cpu_has_vulkan: %s\n", lm_ggml_cpu_has_vulkan() ? "true" : "false");
fprintf(stream, "cpu_has_kompute: %s\n", lm_ggml_cpu_has_kompute() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", lm_ggml_cpu_has_fma() ? "true" : "false");
fprintf(stream, "cpu_has_gpublas: %s\n", lm_ggml_cpu_has_gpublas() ? "true" : "false");
fprintf(stream, "cpu_has_neon: %s\n", lm_ggml_cpu_has_neon() ? "true" : "false");
fprintf(stream, "cpu_has_sve: %s\n", lm_ggml_cpu_has_sve() ? "true" : "false");
fprintf(stream, "cpu_has_f16c: %s\n", lm_ggml_cpu_has_f16c() ? "true" : "false");
fprintf(stream, "cpu_has_fp16_va: %s\n", lm_ggml_cpu_has_fp16_va() ? "true" : "false");
fprintf(stream, "cpu_has_riscv_v: %s\n", lm_ggml_cpu_has_riscv_v() ? "true" : "false");
fprintf(stream, "cpu_has_wasm_simd: %s\n", lm_ggml_cpu_has_wasm_simd() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", lm_ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_sse3: %s\n", lm_ggml_cpu_has_sse3() ? "true" : "false");
fprintf(stream, "cpu_has_vsx: %s\n", lm_ggml_cpu_has_vsx() ? "true" : "false");
fprintf(stream, "cpu_has_matmul_int8: %s\n", lm_ggml_cpu_has_matmul_int8() ? "true" : "false");

#ifdef NDEBUG
fprintf(stream, "debug: false\n");
#else
fprintf(stream, "debug: true\n");
#endif // NDEBUG

fprintf(stream, "model_desc: %s\n", model_desc);
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));

#ifdef __OPTIMIZE__
fprintf(stream, "optimize: true\n");
#else
fprintf(stream, "optimize: false\n");
#endif // __OPTIMIZE__

fprintf(stream, "time: %s\n", timestamp.c_str());

fprintf(stream, "\n");
fprintf(stream, "###############\n");
fprintf(stream, "# User Inputs #\n");
fprintf(stream, "###############\n");
fprintf(stream, "\n");

fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");

yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());

fprintf(stream, "logit_bias:\n");
for (const auto & logit_bias : sparams.logit_bias) {
fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
}

fprintf(stream, "lora:\n");
for (auto & la : params.lora_adapters) {
if (la.scale == 1.0f) {
fprintf(stream, " - %s\n", la.path.c_str());
}
}
fprintf(stream, "lora_scaled:\n");
for (auto & la : params.lora_adapters) {
if (la.scale != 1.0f) {
fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
}
}
fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);

fprintf(stream, "reverse_prompt:\n");
for (std::string ap : params.antiprompt) {
size_t pos = 0;
while ((pos = ap.find('\n', pos)) != std::string::npos) {
ap.replace(pos, 1, "\\n");
pos += 1;
}

fprintf(stream, " - %s\n", ap.c_str());
}

fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);

const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);

fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
}
17 changes: 2 additions & 15 deletions cpp/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ struct common_sampler_params {
struct common_params {
bool vocab_only = false;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_ctx = 4096; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
Expand All @@ -190,7 +190,7 @@ struct common_params {
float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
float defrag_thold = 0.1f; // KV cache defragmentation threshold

struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
Expand Down Expand Up @@ -221,7 +221,6 @@ struct common_params {
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
std::string logdir = ""; // directory in which to save YAML log files // NOLINT
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
Expand Down Expand Up @@ -599,15 +598,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

//
// YAML utils
//

void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);

void yaml_dump_non_result_info(
FILE * stream, const common_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
Loading
Loading