
Commit 0b4fc09
feat: sync llama.cpp
jhen0409 committed Jul 23, 2024
1 parent 909da8e commit 0b4fc09
Showing 5 changed files with 42 additions and 27 deletions.
2 changes: 1 addition & 1 deletion cpp/ggml-quants.c
@@ -4748,7 +4748,7 @@ void lm_ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void

int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

-        sumf += (LM_GGML_FP16_TO_FP32(x[i].d)*LM_GGML_FP16_TO_FP32(y[i].d)) * sumi;
+        sumf += (LM_GGML_FP16_TO_FP32(x[ib].d)*LM_GGML_FP16_TO_FP32(y[ib].d)) * sumi;
}

#elif defined(__POWER9_VECTOR__)
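The one-line fix above corrects an index mix-up in the RISC-V vector path of the q5_0 x q8_0 dot product: the per-block FP16 scales must be read with the block counter (ib), not a lane index from an inner loop. A minimal sketch of the hazard, using a simplified block type in place of the real ggml structs (illustrative names, not the ggml API):

struct blk { float d; signed char qs[32]; };  // stand-in for block_q5_0 / block_q8_0

static float vec_dot_blocks(int nb, const blk * x, const blk * y) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        int sumi = 0;
        for (int i = 0; i < 32; ++i) {
            sumi += x[ib].qs[i] * y[ib].qs[i];  // i is only meaningful inside this loop
        }
        // the scales must be indexed by the block counter ib; using i here
        // would silently read another block's scale -- the bug fixed above
        sumf += x[ib].d * y[ib].d * (float) sumi;
    }
    return sumf;
}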
2 changes: 1 addition & 1 deletion cpp/ggml.c
@@ -21015,7 +21015,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
lm_gguf_tensor_info_sanitize(info);

// make sure there is no duplicated tensor names
-            for (uint64_t j = 0; j < i; ++j) {
+            for (uint64_t j = 0; j < i && ok; ++j) {
if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
ok = false;
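The added "&& ok" makes the duplicate-tensor-name scan exit early: once a duplicate is reported and ok is cleared, the remaining comparisons are skipped and the loader fails fast on a malformed file. The same short-circuit pattern in a self-contained sketch (plain C strings instead of the gguf info structs):

#include <cstdio>
#include <cstring>

static bool names_unique(const char * const * names, int n) {
    bool ok = true;
    for (int i = 0; i < n && ok; ++i) {
        for (int j = 0; j < i && ok; ++j) {  // "&& ok" stops the scan after the first hit
            if (strcmp(names[i], names[j]) == 0) {
                fprintf(stderr, "duplicated tensor name %s\n", names[i]);
                ok = false;
            }
        }
    }
    return ok;
}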
60 changes: 36 additions & 24 deletions cpp/llama.cpp
@@ -3718,7 +3718,7 @@ struct llama_model_loader {
}

if (param_overrides_p != nullptr) {
-        for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+        for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
kv_overrides.insert({std::string(p->key), *p});
}
}
@@ -3886,7 +3886,7 @@ struct llama_model_loader {
ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

{
-            const int kid = lm_gguf_find_key(meta, "general.file_type");
+            const int kid = lm_gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
if (kid >= 0) {
ftype = (llama_ftype) lm_gguf_get_val_u32(meta, kid);
}
@@ -5018,7 +5018,7 @@ static void llm_load_hparams(
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
-                case 42: model.type = e_model::MODEL_SMALL; break;
+                case 42: model.type = e_model::MODEL_7B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
@@ -5380,6 +5380,7 @@ static void llm_load_vocab(
if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n");
        }
+
const int n_merges = lm_gguf_get_arr_n(ctx, merges_keyidx);
for (int i = 0; i < n_merges; i++) {
const std::string word = lm_gguf_get_arr_str(ctx, merges_keyidx, i);
@@ -5418,16 +5419,6 @@ static void llm_load_vocab(
vocab.special_cls_id = -1;
vocab.special_mask_id = -1;

-        const int add_space_prefix_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = lm_gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        } // The default value of add_space_prefix is true.
-
-        const int remove_extra_whitespaces_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
-        if (remove_extra_whitespaces_keyidx != -1) {
-            vocab.tokenizer_remove_extra_whitespaces = lm_gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
-        } // The default value of remove_extra_whitespaces is false.

const int precompiled_charsmap_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
size_t n_precompiled_charsmap = lm_gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
@@ -5535,6 +5526,19 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "jais") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+            } else if (
+                tokenizer_pre == "tekken") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+                vocab.tokenizer_clean_spaces = false;
+                vocab.tokenizer_ignore_merges = true;
+                vocab.tokenizer_add_bos = true;
+            } else if (
+                tokenizer_pre == "smollm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "codeshell") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
@@ -5558,10 +5562,8 @@ static void llm_load_vocab(
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}

-        const int add_space_prefix_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = lm_gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        }
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
}

const int token_idx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
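This hunk, together with the removal earlier in the function, replaces hand-rolled lm_gguf_find_key / lm_gguf_get_val_bool lookups with the loader's get_key helper called with required = false, which leaves the destination variable, and therefore the default assigned earlier, untouched when the key is absent. A sketch of such an optional read, written as a free function under assumed types rather than the actual llama_model_loader member:

#include <stdexcept>
#include <string>

// lm_gguf_find_key and lm_gguf_get_val_bool are the real calls from the
// removed code; the wrapper itself is an assumed simplification.
static bool get_key_bool(const struct lm_gguf_context * ctx, const std::string & key,
                         bool & out, bool required) {
    const int kid = lm_gguf_find_key(ctx, key.c_str());
    if (kid < 0) {
        if (required) {
            throw std::runtime_error("key not found in model file: " + key);
        }
        return false;  // key absent: leave `out` at its default value
    }
    out = lm_gguf_get_val_bool(ctx, kid);
    return true;
}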
@@ -6142,10 +6144,10 @@ static bool llm_load_tensors(

layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

-                layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
-                layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-                layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
-                layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});

// optional bias tensors
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
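The new shapes derive the attention projections from the head dimensions instead of assuming square {n_embd, n_embd} matrices, which is what grouped-query attention and non-standard head widths require. A quick worked example under assumed hyperparameters (illustrative numbers, not taken from any particular model):

#include <cstdio>

int main() {
    const int n_embd        = 4096;
    const int n_head        = 32;
    const int n_head_kv     = 8;    // GQA: fewer K/V heads than Q heads
    const int n_embd_head_k = 128;
    const int n_embd_head_v = 128;

    const int n_embd_k_gqa = n_embd_head_k * n_head_kv;  // 1024
    const int n_embd_v_gqa = n_embd_head_v * n_head_kv;  // 1024

    printf("wq: {%d, %d}\n", n_embd, n_embd_head_k * n_head);  // {4096, 4096}
    printf("wk: {%d, %d}\n", n_embd, n_embd_k_gqa);            // {4096, 1024}
    printf("wv: {%d, %d}\n", n_embd, n_embd_v_gqa);            // {4096, 1024}
    printf("wo: {%d, %d}\n", n_embd_head_k * n_head, n_embd);  // {4096, 4096}
    return 0;
}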
@@ -15559,6 +15561,8 @@ struct llm_tokenizer_bpe {
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
+            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
regex_exprs = {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -15596,6 +15600,13 @@ struct llm_tokenizer_bpe {
"\\p{N}",
};
break;
+            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
+                // original regex from tokenizer.json
+                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                regex_exprs = {
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
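The rewritten Tekken pattern appears to work around Unicode categories the bundled regex support cannot match directly: [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}] is approximated by (?=[\p{L}])([^a-z]), a letter that is not ASCII lowercase, and [\p{Ll}\p{Lm}\p{Lo}\p{M}] by (?=[\p{L}])([^A-Z]), a letter that is not ASCII uppercase. As single-codepoint predicates the approximation is roughly the following (a hedged illustration, not code from llama.cpp):

#include <cwctype>

// "(?=[\p{L}])([^a-z])" -- any letter except ASCII lowercase ("upper-ish")
static bool upperish(wchar_t c) {
    return std::iswalpha((wint_t) c) && !(c >= L'a' && c <= L'z');
}

// "(?=[\p{L}])([^A-Z])" -- any letter except ASCII uppercase ("lower-ish")
static bool lowerish(wchar_t c) {
    return std::iswalpha((wint_t) c) && !(c >= L'A' && c <= L'Z');
}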
@@ -18286,8 +18297,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

// copy the KV pairs from the input file
lm_gguf_set_kv (ctx_out, ml.meta);
-    lm_gguf_set_val_u32(ctx_out, "general.quantization_version", LM_GGML_QNT_VERSION);
-    lm_gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+    lm_gguf_set_val_u32(ctx_out, "general.quantization_version", LM_GGML_QNT_VERSION); // TODO: use LLM_KV
+    lm_gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
+
// Remove split metadata
lm_gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
lm_gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
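A note on the surrounding metadata flow: the quantizer copies every KV pair from the input, overrides the fields that change, and then strips the split.* keys, because the source may have been loaded from multiple GGUF shards while the quantized output is a single file. Order matters, as this condensed recap of the calls above shows:

// copying ml.meta first would re-import any split.* keys,
// so the removal has to come after the copy
lm_gguf_set_kv (ctx_out, ml.meta);                                  // 1. copy all input KV pairs
lm_gguf_set_val_u32(ctx_out, "general.file_type", ftype);           // 2. override what changed
lm_gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());    // 3. drop sharding metadata
lm_gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());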
@@ -19450,7 +19462,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
-        case LLM_ARCH_CODESHELL:
case LLM_ARCH_ORION:
case LLM_ARCH_INTERNLM2:
case LLM_ARCH_MINICPM:
@@ -19480,6 +19491,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_STARCODER2:
case LLM_ARCH_OPENELM:
case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_CODESHELL:
return LLAMA_ROPE_TYPE_NEOX;

// all model arches should be listed explicitly here
3 changes: 3 additions & 0 deletions cpp/llama.h
@@ -92,6 +92,9 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
+        LLAMA_VOCAB_PRE_TYPE_TEKKEN     = 20,
+        LLAMA_VOCAB_PRE_TYPE_SMOLLM     = 21,
+        LLAMA_VOCAB_PRE_TYPE_CODESHELL  = 22,
};

// note: these values should be synchronized with lm_ggml_rope
2 changes: 1 addition & 1 deletion llama.cpp
