From 455d83fc2bb7ca0ace18f5691b2300cb4444c810 Mon Sep 17 00:00:00 2001 From: Jhen Date: Fri, 27 Oct 2023 12:54:49 +0800 Subject: [PATCH] feat: sync llama.cpp --- android/src/main/jni.cpp | 2 +- cpp/build-info.h | 4 +- cpp/common.cpp | 8 +- cpp/ggml-metal-llama.metal | 10 +- cpp/ggml-metal.m | 18 +- cpp/ggml.c | 446 ++++++++++++++++++++++++++++++------- cpp/ggml.h | 15 +- cpp/llama.cpp | 47 ++-- cpp/llama.h | 21 +- cpp/log.h | 35 +-- cpp/rn-llama.hpp | 4 +- cpp/sampling.cpp | 4 +- ios/RNLlamaContext.mm | 2 +- llama.cpp | 2 +- scripts/ggml-metal.m.patch | 6 +- scripts/log.h.patch | 8 +- 16 files changed, 465 insertions(+), 167 deletions(-) diff --git a/android/src/main/jni.cpp b/android/src/main/jni.cpp index fa71c8e..45ecc65 100644 --- a/android/src/main/jni.cpp +++ b/android/src/main/jni.cpp @@ -328,7 +328,7 @@ Java_com_rnllama_LlamaContext_doCompletion( sparams.logit_bias.clear(); if (ignore_eos) { - sparams.logit_bias[llama_token_eos(llama->ctx)] = -INFINITY; + sparams.logit_bias[llama_token_eos(llama->model)] = -INFINITY; } const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx)); diff --git a/cpp/build-info.h b/cpp/build-info.h index 832eba8..e36ff30 100644 --- a/cpp/build-info.h +++ b/cpp/build-info.h @@ -1,8 +1,8 @@ #ifndef BUILD_INFO_H #define BUILD_INFO_H -#define BUILD_NUMBER 1414 -#define BUILD_COMMIT "96981f3" +#define BUILD_NUMBER 1429 +#define BUILD_COMMIT "34b2a5e" #define BUILD_COMPILER "" #define BUILD_TARGET "unknown" diff --git a/cpp/common.cpp b/cpp/common.cpp index a975e41..131673f 100644 --- a/cpp/common.cpp +++ b/cpp/common.cpp @@ -880,13 +880,13 @@ std::tuple llama_init_from_gpt_par } if (params.ignore_eos) { - params.sparams.logit_bias[llama_token_eos(lctx)] = -INFINITY; + params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } { LOG("warming up the model with an empty run\n"); - std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; + std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); llama_kv_cache_tokens_rm(lctx, -1, -1); llama_reset_timings(lctx); @@ -941,7 +941,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t } std::string llama_detokenize_spm(llama_context * ctx, const std::vector & tokens) { - const llama_token bos_id = llama_token_bos(ctx); + const llama_token bos_id = llama_token_bos(llama_get_model(ctx)); std::string piece; std::string result; @@ -1186,7 +1186,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); - const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx)); + const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx))); const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? 
"true" : "false"); diff --git a/cpp/ggml-metal-llama.metal b/cpp/ggml-metal-llama.metal index 69fc713..f4b4605 100644 --- a/cpp/ggml-metal-llama.metal +++ b/cpp/ggml-metal-llama.metal @@ -125,9 +125,17 @@ kernel void kernel_mul_row( } kernel void kernel_scale( + device const float * src0, + device float * dst, + constant float & scale, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * scale; +} + +kernel void kernel_scale_4( device const float4 * src0, device float4 * dst, - constant float & scale, + constant float & scale, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] * scale; } diff --git a/cpp/ggml-metal.m b/cpp/ggml-metal.m index ba7e743..c858811 100644 --- a/cpp/ggml-metal.m +++ b/cpp/ggml-metal.m @@ -62,6 +62,7 @@ LM_GGML_METAL_DECL_KERNEL(mul); LM_GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast LM_GGML_METAL_DECL_KERNEL(scale); + LM_GGML_METAL_DECL_KERNEL(scale_4); LM_GGML_METAL_DECL_KERNEL(silu); LM_GGML_METAL_DECL_KERNEL(relu); LM_GGML_METAL_DECL_KERNEL(gelu); @@ -249,6 +250,7 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, LM_GGML_METAL_ADD_KERNEL(mul); LM_GGML_METAL_ADD_KERNEL(mul_row); LM_GGML_METAL_ADD_KERNEL(scale); + LM_GGML_METAL_ADD_KERNEL(scale_4); LM_GGML_METAL_ADD_KERNEL(silu); LM_GGML_METAL_ADD_KERNEL(relu); LM_GGML_METAL_ADD_KERNEL(gelu); @@ -347,6 +349,7 @@ void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { LM_GGML_METAL_DEL_KERNEL(mul); LM_GGML_METAL_DEL_KERNEL(mul_row); LM_GGML_METAL_DEL_KERNEL(scale); + LM_GGML_METAL_DEL_KERNEL(scale_4); LM_GGML_METAL_DEL_KERNEL(silu); LM_GGML_METAL_DEL_KERNEL(relu); LM_GGML_METAL_DEL_KERNEL(gelu); @@ -923,15 +926,20 @@ void lm_ggml_metal_graph_compute( const float scale = *(const float *) src1->data; - [encoder setComputePipelineState:ctx->pipeline_scale]; + int64_t n = lm_ggml_nelements(dst); + + if (n % 4 == 0) { + n /= 4; + [encoder setComputePipelineState:ctx->pipeline_scale_4]; + } else { + [encoder setComputePipelineState:ctx->pipeline_scale]; + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; - const int64_t n = lm_ggml_nelements(dst); - LM_GGML_ASSERT(n % 4 == 0); - - [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case LM_GGML_OP_UNARY: switch (lm_ggml_get_unary_op(gf->nodes[i])) { diff --git a/cpp/ggml.c b/cpp/ggml.c index d822a32..525ae1c 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -571,7 +571,6 @@ int64_t lm_ggml_cycles_per_ms(void) { #define lm_ggml_perf_cycles_per_ms() 0 #endif - // // cache line // @@ -1828,7 +1827,6 @@ lm_ggml_type_traits_t lm_ggml_internal_get_type_traits(enum lm_ggml_type type) { return type_traits[type]; } - // // simd mappings // @@ -4057,16 +4055,17 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "ALIBI", "CLAMP", "CONV_1D", + "CONV_1D_STAGE_0", + "CONV_1D_STAGE_1", "CONV_TRANSPOSE_1D", "CONV_2D", + "CONV_2D_STAGE_0", + "CONV_2D_STAGE_1", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", "UPSCALE", - "CONV_1D_STAGE_0", - "CONV_1D_STAGE_1", - "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", @@ -4092,7 +4091,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(LM_GGML_OP_COUNT == 71, "LM_GGML_OP_COUNT != 71"); 
+static_assert(LM_GGML_OP_COUNT == 73, "LM_GGML_OP_COUNT != 73"); static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "none", @@ -4143,16 +4142,17 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "alibi(x)", "clamp(x)", "conv_1d(x)", + "conv_1d_stage_0(x)", + "conv_1d_stage_1(x)", "conv_transpose_1d(x)", "conv_2d(x)", + "conv_2d_stage_0(x)", + "conv_2d_stage_1(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", - "conv_1d_stage_0(x)", - "conv_1d_stage_1(x)", - "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", @@ -4178,7 +4178,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(LM_GGML_OP_COUNT == 71, "LM_GGML_OP_COUNT != 71"); +static_assert(LM_GGML_OP_COUNT == 73, "LM_GGML_OP_COUNT != 73"); static_assert(LM_GGML_OP_POOL_COUNT == 2, "LM_GGML_OP_POOL_COUNT != 2"); @@ -4209,8 +4209,10 @@ static void lm_ggml_setup_op_has_task_pass(void) { p[LM_GGML_OP_CONV_1D ] = true; p[LM_GGML_OP_CONV_1D_STAGE_0 ] = true; p[LM_GGML_OP_CONV_1D_STAGE_1 ] = true; - p[LM_GGML_OP_CONV_2D ] = true; p[LM_GGML_OP_CONV_TRANSPOSE_1D ] = true; + p[LM_GGML_OP_CONV_2D ] = true; + p[LM_GGML_OP_CONV_2D_STAGE_0 ] = true; + p[LM_GGML_OP_CONV_2D_STAGE_1 ] = true; p[LM_GGML_OP_CONV_TRANSPOSE_2D ] = true; p[LM_GGML_OP_FLASH_ATTN_BACK ] = true; p[LM_GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -5954,7 +5956,6 @@ struct lm_ggml_tensor * lm_ggml_sqrt_inplace( return lm_ggml_sqrt_impl(ctx, a, true); } - // lm_ggml_log static struct lm_ggml_tensor * lm_ggml_log_impl( @@ -6008,7 +6009,6 @@ struct lm_ggml_tensor * lm_ggml_sum( return result; } - // lm_ggml_sum_rows struct lm_ggml_tensor * lm_ggml_sum_rows( @@ -6640,7 +6640,6 @@ struct lm_ggml_tensor * lm_ggml_set_2d_inplace( return lm_ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); } - // lm_ggml_cpy static struct lm_ggml_tensor * lm_ggml_cpy_impl( @@ -6720,7 +6719,6 @@ struct lm_ggml_tensor * lm_ggml_cont_inplace( return lm_ggml_cont_impl(ctx, a, true); } - // make contiguous, with new shape LM_GGML_API struct lm_ggml_tensor * lm_ggml_cont_1d( struct lm_ggml_context * ctx, @@ -7173,7 +7171,6 @@ struct lm_ggml_tensor * lm_ggml_diag( return result; } - // lm_ggml_diag_mask_inf static struct lm_ggml_tensor * lm_ggml_diag_mask_inf_impl( @@ -7285,7 +7282,6 @@ struct lm_ggml_tensor * lm_ggml_soft_max_inplace( return lm_ggml_soft_max_impl(ctx, a, true); } - // lm_ggml_soft_max_back static struct lm_ggml_tensor * lm_ggml_soft_max_back_impl( @@ -7702,7 +7698,11 @@ LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_1d( // lm_ggml_conv_2d -struct lm_ggml_tensor * lm_ggml_conv_2d( +// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OH, OW, IC*KH*KW] +static struct lm_ggml_tensor * lm_ggml_conv_2d_stage_0( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, @@ -7721,17 +7721,21 @@ struct lm_ggml_tensor * lm_ggml_conv_2d( is_node = true; } + const int64_t OH = lm_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); + const int64_t OW = lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const int64_t ne[4] = { - lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), - lm_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), - a->ne[3], b->ne[3], + a->ne[2] * a->ne[1] * a->ne[0], + OW, + OH, + b->ne[3], }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne); + struct lm_ggml_tensor * result = 
lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F16, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1 }; lm_ggml_set_op_params(result, params, sizeof(params)); - result->op = LM_GGML_OP_CONV_2D; + result->op = LM_GGML_OP_CONV_2D_STAGE_0; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7740,8 +7744,61 @@ struct lm_ggml_tensor * lm_ggml_conv_2d( } -// lm_ggml_conv_2d_sk_p0 +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// a: [OC, IC, KH, KW] +// b: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static struct lm_ggml_tensor * lm_ggml_conv_2d_stage_1( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b) { + + bool is_node = false; + + if (a->grad || b->grad) { + LM_GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + b->ne[1], + b->ne[2], + a->ne[3], + b->ne[3], + }; + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne); + + result->op = LM_GGML_OP_CONV_2D_STAGE_1; + result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; + +} + +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OC, OH, OW] +struct lm_ggml_tensor * lm_ggml_conv_2d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + struct lm_ggml_tensor * result = lm_ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW] + result = lm_ggml_conv_2d_stage_1(ctx, a, result); + + return result; + +} + +// lm_ggml_conv_2d_sk_p0 struct lm_ggml_tensor * lm_ggml_conv_2d_sk_p0( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -8180,7 +8237,6 @@ static struct lm_ggml_tensor * lm_ggml_add_rel_pos_impl( return result; } - struct lm_ggml_tensor * lm_ggml_add_rel_pos( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -8625,8 +8681,6 @@ struct lm_ggml_tensor * lm_ggml_map_custom3_inplace( return lm_ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); } - - // lm_ggml_cross_entropy_loss struct lm_ggml_tensor * lm_ggml_cross_entropy_loss( @@ -9828,7 +9882,6 @@ static void lm_ggml_compute_forward_add1( } } - // lm_ggml_compute_forward_acc static void lm_ggml_compute_forward_acc_f32( @@ -9968,7 +10021,6 @@ static void lm_ggml_compute_forward_sub_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - #ifdef LM_GGML_USE_ACCELERATE vDSP_vsub( (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, @@ -10149,7 +10201,6 @@ static void lm_ggml_compute_forward_div_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - #ifdef LM_GGML_USE_ACCELERATE UNUSED(lm_ggml_vec_div_f32); @@ -10287,7 +10338,6 @@ static void lm_ggml_compute_forward_sqrt( } } - // lm_ggml_compute_forward_log static void lm_ggml_compute_forward_log_f32( @@ -12120,7 +12170,6 @@ static void lm_ggml_compute_forward_out_prod_f32( } } - //int64_t t1 = lm_ggml_perf_time_us(); //static int64_t acc = 0; //acc += t1 - t0; @@ -12316,7 +12365,6 @@ static void lm_ggml_compute_forward_scale_f32( const size_t nb1 = dst->nb[1]; - for (int i1 = ir0; i1 < ir1; i1++) { if (dst->data != src0->data) { // src0 is same shape as dst => same indices @@ -12714,7 +12762,6 @@ static void lm_ggml_compute_forward_get_rows_back_f32( } } - static void lm_ggml_compute_forward_get_rows_back( const struct 
lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, @@ -13997,6 +14044,7 @@ static void lm_ggml_compute_forward_conv_1d_f32( } } +// TODO: reuse lm_ggml_mul_mat or implement lm_ggml_im2col and remove stage_0 and stage_1 static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, lm_ggml_fp16_t * A, lm_ggml_fp16_t * B, @@ -14298,6 +14346,9 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32( } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, lm_ggml_nbytes(dst)); + return; } @@ -14370,7 +14421,7 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f32( const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); float * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00]; + dst_data[i00*ne02 + i02] = src[i00]; } } } @@ -14389,6 +14440,9 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f32( } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, lm_ggml_nbytes(dst)); + return; } @@ -14450,28 +14504,190 @@ static void lm_ggml_compute_forward_conv_transpose_1d( // lm_ggml_compute_forward_conv_2d -static void lm_ggml_compute_forward_conv_2d_f16_f32( +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void lm_ggml_compute_forward_conv_2d_stage_0_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F16); + + int64_t t0 = lm_ggml_perf_time_us(); + UNUSED(t0); + + LM_GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = ne13; + const int64_t IC = ne12; + const int64_t IH = ne11; + const int64_t IW = ne10; + + // const int64_t OC = ne03; + // const int64_t IC = ne02; + const int64_t KH = ne01; + const int64_t KW = ne00; + + const int64_t OH = ne2; + const int64_t OW = ne1; + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == LM_GGML_TASK_INIT) { + memset(dst->data, 0, lm_ggml_nbytes(dst)); + return; + } + + if (params->type == LM_GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { + + // micro kernel + lm_ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + + for (int64_t ikh = 0; ikh < KH; ikh++) { + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + 
ikw] = LM_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + } + } + } + } + } + } + } + } +} + +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// src0: [OC, IC, KH, KW] +// src1: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static void lm_ggml_compute_forward_conv_2d_stage_1_f16( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F16); LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); + if (params->type == LM_GGML_TASK_INIT) { + return; + } + + if (params->type == LM_GGML_TASK_FINALIZE) { + return; + } + LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb10 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb0 == sizeof(float)); + + const int N = ne13; + const int OH = ne12; + const int OW = ne11; + + const int OC = ne03; + const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int ith = params->ith; + const int nth = params->nth; + + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; + + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + lm_ggml_fp16_t * A = (lm_ggml_fp16_t *)src0->data; // [m, k] + lm_ggml_fp16_t * B = (lm_ggml_fp16_t *)src1->data + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m, n] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + } +} + +static void lm_ggml_compute_forward_conv_2d_f16_f32( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); + + int64_t t0 = lm_ggml_perf_time_us(); + UNUSED(t0); + + LM_GGML_TENSOR_BINARY_OP_LOCALS + + // src1: image [N, IC, IH, IW] + // src0: kernel [OC, IC, KH, KW] + // dst: result [N, OC, OH, OW] + // ne12: IC + // ne0: OW + // ne1: OH + // nk0: KW + // nk1: KH + // ne13: N + + const int N = ne13; + const int IC = ne12; + const int IH = ne11; + const int IW = ne10; + + const int OC = ne03; + // const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int OH = ne1; + const int OW = ne0; + const int ith = params->ith; const int nth = params->nth; - const int nk0 = ne00; - const int nk1 = ne01; + // const int nk0 = ne00; + // const int nk1 = ne01; // size of the convolution row - the kernel size unrolled across all channels - const int ew0 = nk0*nk1*ne02; + // const int ew0 = nk0*nk1*ne02; + // ew0: IC*KH*KW const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; @@ -14487,24 +14703,27 @@ static void lm_ggml_compute_forward_conv_2d_f16_f32( memset(params->wdata, 0, params->wsize); // prepare source data (src1) + // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW] + { lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - for (int i13 = 0; i13 < ne13; i13++) { - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); - lm_ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0); - - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 
= 0; ik0 < nk0; ik0++) { - const int idx0 = i0*s0 + ik0*d0 - p0; - const int idx1 = i1*s1 + ik1*d1 - p1; - - if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - LM_GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + for (int in = 0; in < N; in++) { + for (int iic = 0; iic < IC; iic++) { + for (int ioh = 0; ioh < OH; ioh++) { + for (int iow = 0; iow < OW; iow++) { + + // micro kernel + lm_ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + + for (int ikh = 0; ikh < KH; ikh++) { + for (int ikw = 0; ikw < KW; ikw++) { + const int iiw = iow*s0 + ikw*d0 - p0; + const int iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = LM_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); } } } @@ -14521,30 +14740,22 @@ static void lm_ggml_compute_forward_conv_2d_f16_f32( return; } - // total patches in dst - const int np = ne2; - - // patches per thread - const int dp = (np + nth - 1)/nth; + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; + // wdata: [N*OH*OW, IC*KH*KW] + // dst: result [N, OC, OH, OW] + // src0: kernel [OC, IC, KH, KW] - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + lm_ggml_fp16_t * A = (lm_ggml_fp16_t *)src0->data; // [m, k] + lm_ggml_fp16_t * B = (lm_ggml_fp16_t *)wdata + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m * k] - for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = ip0; i2 < ip1; i2++) { - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2); - - for (int i1 = 0; i1 < ne1; ++i1) { - for (int i0 = 0; i0 < ne0; ++i0) { - lm_ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, - (lm_ggml_fp16_t *) ((char *) src0->data + i2*nb03), - (lm_ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0); - } - } - } + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); } } @@ -14570,6 +14781,48 @@ static void lm_ggml_compute_forward_conv_2d( } } +static void lm_ggml_compute_forward_conv_2d_stage_0( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + switch (src0->type) { + case LM_GGML_TYPE_F16: + { + lm_ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst); + } break; + case LM_GGML_TYPE_F32: + { + LM_GGML_ASSERT(false); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + +static void lm_ggml_compute_forward_conv_2d_stage_1( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + switch (src0->type) { + case LM_GGML_TYPE_F16: + { + lm_ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst); + } break; + case LM_GGML_TYPE_F32: + { + LM_GGML_ASSERT(false); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + // lm_ggml_compute_forward_conv_transpose_2d static void lm_ggml_compute_forward_conv_transpose_2d( @@ -14628,6 +14881,8 @@ static void lm_ggml_compute_forward_conv_transpose_2d( } } + memset(dst->data, 0, lm_ggml_nbytes(dst)); + 
return; } @@ -16126,7 +16381,6 @@ static void lm_ggml_compute_forward_add_rel_pos_f32( const int ip0 = dp*ith; const int ip1 = MIN(ip0 + dp, np); - for (int64_t i13 = ip0; i13 < ip1; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { @@ -16193,7 +16447,6 @@ static void lm_ggml_compute_forward_map_unary_f32( } } - static void lm_ggml_compute_forward_map_unary( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, @@ -16241,7 +16494,6 @@ static void lm_ggml_compute_forward_map_binary_f32( } } - static void lm_ggml_compute_forward_map_binary( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, @@ -16293,7 +16545,6 @@ static void lm_ggml_compute_forward_map_custom2_f32( fun(dst, a, b); } - // lm_ggml_compute_forward_map_custom3 static void lm_ggml_compute_forward_map_custom3_f32( @@ -16568,7 +16819,6 @@ static void lm_ggml_compute_forward_cross_entropy_loss_back_f32( lm_ggml_vec_sub_f32(nc, ds0, ds0, s1); lm_ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr); - #ifndef NDEBUG for (int i = 0; i < nc; ++i) { assert(!isnan(ds0[i])); @@ -16596,12 +16846,15 @@ static void lm_ggml_compute_forward_cross_entropy_loss_back( } } - ///////////////////////////////// static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, struct lm_ggml_tensor * tensor) { LM_GGML_ASSERT(params); + if (tensor->op == LM_GGML_OP_NONE) { + return; + } + #ifdef LM_GGML_USE_CUBLAS bool skip_cpu = lm_ggml_cuda_compute_forward(params, tensor); if (skip_cpu) { @@ -16804,6 +17057,14 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru { lm_ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); } break; + case LM_GGML_OP_CONV_2D_STAGE_0: + { + lm_ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor); + } break; + case LM_GGML_OP_CONV_2D_STAGE_1: + { + lm_ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + } break; case LM_GGML_OP_CONV_TRANSPOSE_2D: { lm_ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -17733,11 +17994,19 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { LM_GGML_ASSERT(false); // TODO: not implemented } break; + case LM_GGML_OP_CONV_TRANSPOSE_1D: + { + LM_GGML_ASSERT(false); // TODO: not implemented + } break; case LM_GGML_OP_CONV_2D: { LM_GGML_ASSERT(false); // TODO: not implemented } break; - case LM_GGML_OP_CONV_TRANSPOSE_1D: + case LM_GGML_OP_CONV_2D_STAGE_0: + { + LM_GGML_ASSERT(false); // TODO: not implemented + } break; + case LM_GGML_OP_CONV_2D_STAGE_1: { LM_GGML_ASSERT(false); // TODO: not implemented } break; @@ -18666,6 +18935,7 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th const int64_t ne0 = node->ne[0]; const int64_t ne1 = node->ne[1]; const int64_t ne2 = node->ne[2]; + const int64_t ne3 = node->ne[3]; const int64_t nk = ne00*ne01; const int64_t ew0 = nk * ne02; @@ -18676,7 +18946,8 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th if (node->src[0]->type == LM_GGML_TYPE_F16 && node->src[1]->type == LM_GGML_TYPE_F32) { - cur = sizeof(lm_ggml_fp16_t)*(ne0*ne1*ew0); + // im2col: [N*OH*OW, IC*KH*KW] + cur = sizeof(lm_ggml_fp16_t)*(ne3*ne0*ne1*ew0); } else if (node->src[0]->type == LM_GGML_TYPE_F32 && node->src[1]->type == LM_GGML_TYPE_F32) { cur = sizeof(float)* (ne10*ne11*ne12); @@ -18686,6 +18957,14 @@ struct lm_ggml_cplan 
lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th work_size = MAX(work_size, cur); } break; + case LM_GGML_OP_CONV_2D_STAGE_0: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_CONV_2D_STAGE_1: + { + n_tasks = n_threads; + } break; case LM_GGML_OP_CONV_TRANSPOSE_2D: { n_tasks = n_threads; @@ -19874,7 +20153,6 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam( opt->loss_after = fx; - // check convergence if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { LM_GGML_PRINT_DEBUG("converged\n"); diff --git a/cpp/ggml.h b/cpp/ggml.h index b252464..0416777 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -401,15 +401,16 @@ extern "C" { LM_GGML_OP_ALIBI, LM_GGML_OP_CLAMP, LM_GGML_OP_CONV_1D, - LM_GGML_OP_CONV_2D, + LM_GGML_OP_CONV_1D_STAGE_0, // internal + LM_GGML_OP_CONV_1D_STAGE_1, // internal LM_GGML_OP_CONV_TRANSPOSE_1D, + LM_GGML_OP_CONV_2D, + LM_GGML_OP_CONV_2D_STAGE_0, // internal + LM_GGML_OP_CONV_2D_STAGE_1, // internal LM_GGML_OP_CONV_TRANSPOSE_2D, LM_GGML_OP_POOL_1D, LM_GGML_OP_POOL_2D, - LM_GGML_OP_CONV_1D_STAGE_0, // internal - LM_GGML_OP_CONV_1D_STAGE_1, // internal - LM_GGML_OP_UPSCALE, // nearest interpolate LM_GGML_OP_FLASH_ATTN, @@ -1020,9 +1021,9 @@ extern "C" { struct lm_ggml_tensor * b, float eps); - // A: n columns, m rows - // B: n columns, p rows (i.e. we transpose it internally) - // result is m columns, p rows + // A: k columns, n rows => [ne03, ne02, n, k] + // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k] + // result is n columns, m rows => [ne03 * x, ne02 * y, m, n] LM_GGML_API struct lm_ggml_tensor * lm_ggml_mul_mat( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, diff --git a/cpp/llama.cpp b/cpp/llama.cpp index 3e214da..b1c3fd0 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -7504,7 +7504,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c } } - const llama_token eos = llama_token_eos(ctx); + const llama_token eos = llama_token_eos(&ctx->model); std::vector, llama_partial_utf8>> candidates_decoded; std::vector candidates_grammar; @@ -7714,7 +7714,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) { const int64_t t_start_sample_us = lm_ggml_time_us(); - if (token == llama_token_eos(ctx)) { + if (token == llama_token_eos(&ctx->model)) { for (const auto & stack : grammar->stacks) { if (stack.empty()) { return; @@ -8923,7 +8923,7 @@ struct llama_context * llama_new_context_with_model( // build worst-case graph int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); int n_past = cparams.n_ctx - n_tokens; - llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph lm_ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); #ifdef LM_GGML_USE_METAL @@ -9684,43 +9684,44 @@ float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } -const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].text.c_str(); +const char * llama_token_get_text(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].text.c_str(); 
} -float llama_token_get_score(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].score; +float llama_token_get_score(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].score; } -llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].type; +llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].type; } -llama_token llama_token_bos(const struct llama_context * ctx) { - return ctx->model.vocab.special_bos_id; +llama_token llama_token_bos(const struct llama_model * model) { + return model->vocab.special_bos_id; } -llama_token llama_token_eos(const struct llama_context * ctx) { - return ctx->model.vocab.special_eos_id; +llama_token llama_token_eos(const struct llama_model * model) { + return model->vocab.special_eos_id; } -llama_token llama_token_nl(const struct llama_context * ctx) { - return ctx->model.vocab.linefeed_id; +llama_token llama_token_nl(const struct llama_model * model) { + return model->vocab.linefeed_id; } -llama_token llama_token_prefix(const struct llama_context * ctx) { - return ctx->model.vocab.special_prefix_id; + +llama_token llama_token_prefix(const struct llama_model * model) { + return model->vocab.special_prefix_id; } -llama_token llama_token_middle(const struct llama_context * ctx) { - return ctx->model.vocab.special_middle_id; +llama_token llama_token_middle(const struct llama_model * model) { + return model->vocab.special_middle_id; } -llama_token llama_token_suffix(const struct llama_context * ctx) { - return ctx->model.vocab.special_suffix_id; +llama_token llama_token_suffix(const struct llama_model * model) { + return model->vocab.special_suffix_id; } -llama_token llama_token_eot(const struct llama_context * ctx) { - return ctx->model.vocab.special_eot_id; +llama_token llama_token_eot(const struct llama_model * model) { + return model->vocab.special_eot_id; } int llama_tokenize( diff --git a/cpp/llama.h b/cpp/llama.h index 04ff963..fa6afd9 100644 --- a/cpp/llama.h +++ b/cpp/llama.h @@ -494,21 +494,22 @@ extern "C" { // Vocab // - LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token); + LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token); - LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token); + LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); - LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token); + LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token); // Special tokens - LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence - LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence - LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line + LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence + LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence + LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line + // codellama infill tokens - LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning 
of infill prefix - LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle - LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix - LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx); // End of infill middle + LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix + LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle + LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix + LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle // // Tokenization diff --git a/cpp/log.h b/cpp/log.h index b99e9e7..a5084d8 100644 --- a/cpp/log.h +++ b/cpp/log.h @@ -97,22 +97,23 @@ #define LOG_TEE_TARGET stderr #endif +// NOTE: currently disabled as it produces too many log files // Utility to obtain "pid" like unique process id and use it when creating log files. -inline std::string log_get_pid() -{ - static std::string pid; - if (pid.empty()) - { - // std::this_thread::get_id() is the most portable way of obtaining a "process id" - // it's not the same as "pid" but is unique enough to solve multiple instances - // trying to write to the same log. - std::stringstream ss; - ss << std::this_thread::get_id(); - pid = ss.str(); - } - - return pid; -} +//inline std::string log_get_pid() +//{ +// static std::string pid; +// if (pid.empty()) +// { +// // std::this_thread::get_id() is the most portable way of obtaining a "process id" +// // it's not the same as "pid" but is unique enough to solve multiple instances +// // trying to write to the same log. +// std::stringstream ss; +// ss << std::this_thread::get_id(); +// pid = ss.str(); +// } +// +// return pid; +//} // Utility function for generating log file names with unique id based on thread id. 
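For orientation on the cpp/log.h change here: log_get_pid() and the per-instance id segment of generated log file names are commented out (the NOTE above cites too many log files), so log_filename_generator_impl now yields plain "basename.extension" names. A minimal restatement of the patched behaviour, assuming the same stringstream approach; this sketch is not the literal patched function.

#include <sstream>
#include <string>

// Sketch of the generator after this sync: the per-instance id segment is gone.
inline std::string log_filename_sketch(const std::string & base, const std::string & ext) {
    std::stringstream buf;
    buf << base;
    // buf << "." << log_get_pid();   // disabled: one shared log file instead of one per instance
    buf << "." << ext;
    return buf.str();                 // e.g. log_filename_sketch("llama", "log") -> "llama.log"
}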
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama..log" @@ -126,8 +127,8 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base std::stringstream buf; buf << log_file_basename; - buf << "."; - buf << log_get_pid(); + //buf << "."; + //buf << log_get_pid(); buf << "."; buf << log_file_extension; diff --git a/cpp/rn-llama.hpp b/cpp/rn-llama.hpp index aeae7eb..ba6c860 100644 --- a/cpp/rn-llama.hpp +++ b/cpp/rn-llama.hpp @@ -349,7 +349,7 @@ struct llama_rn_context if (params.n_predict == 0) { has_next_token = false; - result.tok = llama_token_eos(ctx); + result.tok = llama_token_eos(model); return result; } @@ -384,7 +384,7 @@ struct llama_rn_context // decrement remaining sampling budget --n_remain; - if (!embd.empty() && embd.back() == llama_token_eos(ctx)) + if (!embd.empty() && embd.back() == llama_token_eos(model)) { // stopping_word = llama_token_to_piece(ctx, embd.back()); has_next_token = false; diff --git a/cpp/sampling.cpp b/cpp/sampling.cpp index 6f0af3c..5258d4e 100644 --- a/cpp/sampling.cpp +++ b/cpp/sampling.cpp @@ -147,7 +147,7 @@ llama_token llama_sampling_sample( // apply penalties if (!prev.empty()) { - const float nl_logit = logits[llama_token_nl(ctx_main)]; + const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))]; llama_sample_repetition_penalties(ctx_main, &cur_p, prev.data() + prev.size() - penalty_last_n, @@ -155,7 +155,7 @@ llama_token llama_sampling_sample( if (!penalize_nl) { for (size_t idx = 0; idx < cur_p.size; idx++) { - if (cur_p.data[idx].id == llama_token_nl(ctx_main)) { + if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) { cur_p.data[idx].logit = nl_logit; break; } diff --git a/ios/RNLlamaContext.mm b/ios/RNLlamaContext.mm index 29aab44..ab8ea2c 100644 --- a/ios/RNLlamaContext.mm +++ b/ios/RNLlamaContext.mm @@ -177,7 +177,7 @@ - (NSDictionary *)completion:(NSDictionary *)params sparams.logit_bias.clear(); if (params[@"ignore_eos"] && [params[@"ignore_eos"] boolValue]) { - sparams.logit_bias[llama_token_eos(llama->ctx)] = -INFINITY; + sparams.logit_bias[llama_token_eos(llama->model)] = -INFINITY; } if (params[@"logit_bias"] && [params[@"logit_bias"] isKindOfClass:[NSArray class]]) { diff --git a/llama.cpp b/llama.cpp index 96981f3..34b2a5e 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 96981f37b1e3f450d9e63e571514217bf60f0a7f +Subproject commit 34b2a5e1ee4fe6295fb4420eb91131d743694c65 diff --git a/scripts/ggml-metal.m.patch b/scripts/ggml-metal.m.patch index ea1c549..add06b2 100644 --- a/scripts/ggml-metal.m.patch +++ b/scripts/ggml-metal.m.patch @@ -1,6 +1,6 @@ ---- ggml-metal.m.orig 2023-10-25 10:31:15 -+++ ggml-metal.m 2023-10-25 10:31:37 -@@ -208,7 +208,7 @@ +--- ggml-metal.m.orig 2023-10-27 12:54:53 ++++ ggml-metal.m 2023-10-27 12:54:54 +@@ -209,7 +209,7 @@ } else { LM_GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); diff --git a/scripts/log.h.patch b/scripts/log.h.patch index 71b16d7..f531ade 100644 --- a/scripts/log.h.patch +++ b/scripts/log.h.patch @@ -1,9 +1,9 @@ ---- log.h.orig 2023-10-12 09:37:10 -+++ log.h 2023-10-12 09:36:47 -@@ -313,6 +313,19 @@ +--- log.h.orig 2023-10-27 12:54:53 ++++ log.h 2023-10-27 12:54:54 +@@ -314,6 +314,19 @@ #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n") #endif - + +#if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING) +#include +#define LLAMA_ANDROID_LOG_TAG "RNLLAMA_LOG_ANDROID"
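A closing note on the change that recurs across jni.cpp, common.cpp, rn-llama.hpp, sampling.cpp, RNLlamaContext.mm, llama.cpp and llama.h in this sync: the vocab and special-token accessors (llama_token_get_text/score/type, llama_token_bos/eos/nl and the infill tokens) now take a const llama_model * instead of a llama_context *. Call sites that already hold the model pass it directly; call sites that only hold a context go through llama_get_model(). The sketch below shows those two call-site shapes under that assumption; it is illustrative and not code from the patch.

#include <cmath>
#include <unordered_map>
#include "llama.h"

// After this sync the special-token helpers take the model, not the context.

// 1) Only a context is at hand -> go through llama_get_model().
static bool is_eos_token(struct llama_context * ctx, llama_token tok) {
    return tok == llama_token_eos(llama_get_model(ctx)); // was: llama_token_eos(ctx)
}

// 2) The model is already at hand (as in rn-llama.hpp and jni.cpp) -> pass it directly.
//    The map here stands in for the sampling params' logit_bias field.
static void ignore_eos(const struct llama_model * model,
                       std::unordered_map<llama_token, float> & logit_bias) {
    logit_bias[llama_token_eos(model)] = -INFINITY;       // EOS can never be sampled
}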