
Commit

feat: sync llama cpp (#34)
* feat: sync llama.cpp

* fix: remove memory_f16 param

* chore(ios): update lockfile
jhen0409 authored Dec 12, 2023
1 parent 3f27371 commit 0410481
Showing 29 changed files with 6,952 additions and 3,163 deletions.
3 changes: 0 additions & 3 deletions android/src/main/java/com/rnllama/LlamaContext.java
@@ -52,8 +52,6 @@ public LlamaContext(int id, ReactApplicationContext reactContext, ReadableMap pa
 params.hasKey("use_mlock") ? params.getBoolean("use_mlock") : true,
 // boolean use_mmap,
 params.hasKey("use_mmap") ? params.getBoolean("use_mmap") : true,
-// boolean memory_f16,
-params.hasKey("memory_f16") ? params.getBoolean("memory_f16") : true,
 // String lora,
 params.hasKey("lora") ? params.getString("lora") : "",
 // float lora_scaled,
@@ -285,7 +283,6 @@ protected static native long initContext(
 int n_gpu_layers, // TODO: Support this
 boolean use_mlock,
 boolean use_mmap,
-boolean memory_f16,
 String lora,
 float lora_scaled,
 String lora_base,
3 changes: 0 additions & 3 deletions android/src/main/jni.cpp
@@ -129,7 +129,6 @@ Java_com_rnllama_LlamaContext_initContext(
 jint n_gpu_layers, // TODO: Support this
 jboolean use_mlock,
 jboolean use_mmap,
-jboolean memory_f16,
 jstring lora_str,
 jfloat lora_scaled,
 jstring lora_base_str,
@@ -158,8 +157,6 @@ Java_com_rnllama_LlamaContext_initContext(
 defaultParams.use_mlock = use_mlock;
 defaultParams.use_mmap = use_mmap;

-defaultParams.memory_f16 = memory_f16;
-
 const char *lora_chars = env->GetStringUTFChars(lora_str, nullptr);
 const char *lora_base_chars = env->GetStringUTFChars(lora_base_str, nullptr);
 if (lora_chars) {
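With memory_f16 removed, the JNI layer no longer selects the KV cache precision; the synced common.h (diff below) instead exposes cache_type_k / cache_type_v strings. A minimal sketch of how gpt_params might be populated after this change — the helper name make_default_params is hypothetical, and only fields visible in the surrounding diffs are assumed:

    #include "common.h"

    // Hypothetical sketch (not part of this commit): building gpt_params
    // without the removed memory_f16 field.
    static gpt_params make_default_params(bool use_mlock, bool use_mmap) {
        gpt_params params;
        params.use_mlock = use_mlock;
        params.use_mmap  = use_mmap;
        // params.memory_f16 = true;   // field no longer exists after the sync
        params.cache_type_k = "f16";   // KV cache type for K (new field, see common.h diff)
        params.cache_type_v = "f16";   // KV cache type for V (new field, see common.h diff)
        return params;
    }
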
247 changes: 240 additions & 7 deletions cpp/common.cpp

Large diffs are not rendered by default.

31 changes: 29 additions & 2 deletions cpp/common.h
@@ -86,6 +86,8 @@ struct gpt_params {
 std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 std::string logdir = ""; // directory in which to save YAML log files

+std::vector<llama_model_kv_override> kv_overrides;
+
 // TODO: avoid tuple, use struct
 std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
 std::string lora_base = ""; // base model path for the lora adapter
@@ -98,10 +100,10 @@ struct gpt_params {
 size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

 bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
-bool memory_f16 = true; // use f16 instead of f32 for memory kv
 bool random_prompt = false; // do not randomize prompt if none provided
 bool use_color = false; // use color to distinguish generations and inputs
 bool interactive = false; // interactive mode
+bool chatml = false; // chatml mode (used for models trained on chatml syntax)
 bool prompt_cache_all = false; // save user input and generations to prompt cache
 bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

@@ -121,10 +123,15 @@ struct gpt_params {
 bool numa = false; // attempt optimizations that help on some NUMA systems
 bool verbose_prompt = false; // print prompt tokens before generation
 bool infill = false; // use infill mode
+bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
+bool no_kv_offload = false; // disable KV offloading
+
+std::string cache_type_k = "f16"; // KV cache data type for the K
+std::string cache_type_v = "f16"; // KV cache data type for the V

 // multimodal models (see examples/llava)
 std::string mmproj = ""; // path to multimodal projector
-std::string image = ""; // path to an image file
+std::string image = "";  // path to an image file
 };

 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
@@ -139,6 +146,12 @@ std::string gpt_random_prompt(std::mt19937 & rng);

 void process_escapes(std::string& input);

+//
+// String parsing
+//
+
+std::string parse_samplers_input(std::string input);
+
 //
 // Model utils
 //
@@ -200,6 +213,10 @@ std::string llama_detokenize_bpe(
 llama_context * ctx,
 const std::vector<llama_token> & tokens);

+// Uses the value from the model metadata if possible, otherwise
+// defaults to true when model type is SPM, otherwise false.
+bool llama_should_add_bos_token(const llama_model * model);
+
 //
 // YAML utils
 //
@@ -213,3 +230,13 @@ std::string get_sortable_timestamp();
 void dump_non_result_info_yaml(
 FILE * stream, const gpt_params & params, const llama_context * lctx,
 const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+
+//
+// KV cache utils
+//
+
+// Dump the KV cache view with the number of sequences per cell.
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+
+// Dump the KV cache view showing individual sequences in each cell (long output).
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
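For context, a small usage sketch of the llama_should_add_bos_token helper declared in the hunk above. It assumes the llama_tokenize wrapper from common.h that takes an add_bos flag and an already-loaded model and context; this is an illustration, not code from this commit:

    #include <string>
    #include <vector>
    #include "common.h"
    #include "llama.h"

    // Sketch: decide BOS handling from model metadata before tokenizing a prompt.
    static std::vector<llama_token> tokenize_prompt(llama_context * ctx,
                                                    const llama_model * model,
                                                    const std::string & prompt) {
        const bool add_bos = llama_should_add_bos_token(model); // metadata value, SPM fallback
        return llama_tokenize(ctx, prompt, add_bos);
    }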
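Similarly, a hedged sketch of how the new KV cache debugging helpers above might be driven, assuming the llama_kv_cache_view_init / _update / _free functions from the synced llama.h; again an illustration rather than code from this commit:

    #include "common.h"
    #include "llama.h"

    // Sketch: build a KV cache view for a loaded context and dump it.
    static void debug_dump_kv_cache(llama_context * ctx) {
        llama_kv_cache_view view = llama_kv_cache_view_init(ctx, 1); // track up to 1 sequence per cell
        llama_kv_cache_view_update(ctx, &view);                      // refresh cell occupancy
        dump_kv_cache_view(view, 80);                                // compact per-cell counts
        dump_kv_cache_view_seqs(view, 40);                           // verbose per-sequence view
        llama_kv_cache_view_free(&view);
    }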
