From 455d83fc2bb7ca0ace18f5691b2300cb4444c810 Mon Sep 17 00:00:00 2001 From: Jhen Date: Fri, 27 Oct 2023 12:54:49 +0800 Subject: [PATCH] feat: sync llama.cpp --- android/src/main/jni.cpp | 2 +- cpp/build-info.h | 4 +- cpp/common.cpp | 8 +- cpp/ggml-metal-llama.metal | 10 +- cpp/ggml-metal.m | 18 +- cpp/ggml.c | 446 ++++++++++++++++++++++++++++++------- cpp/ggml.h | 15 +- cpp/llama.cpp | 47 ++-- cpp/llama.h | 21 +- cpp/log.h | 35 +-- cpp/rn-llama.hpp | 4 +- cpp/sampling.cpp | 4 +- ios/RNLlamaContext.mm | 2 +- llama.cpp | 2 +- scripts/ggml-metal.m.patch | 6 +- scripts/log.h.patch | 8 +- 16 files changed, 465 insertions(+), 167 deletions(-) diff --git a/android/src/main/jni.cpp b/android/src/main/jni.cpp index fa71c8e..45ecc65 100644 --- a/android/src/main/jni.cpp +++ b/android/src/main/jni.cpp @@ -328,7 +328,7 @@ Java_com_rnllama_LlamaContext_doCompletion( sparams.logit_bias.clear(); if (ignore_eos) { - sparams.logit_bias[llama_token_eos(llama->ctx)] = -INFINITY; + sparams.logit_bias[llama_token_eos(llama->model)] = -INFINITY; } const int n_vocab = llama_n_vocab(llama_get_model(llama->ctx)); diff --git a/cpp/build-info.h b/cpp/build-info.h index 832eba8..e36ff30 100644 --- a/cpp/build-info.h +++ b/cpp/build-info.h @@ -1,8 +1,8 @@ #ifndef BUILD_INFO_H #define BUILD_INFO_H -#define BUILD_NUMBER 1414 -#define BUILD_COMMIT "96981f3" +#define BUILD_NUMBER 1429 +#define BUILD_COMMIT "34b2a5e" #define BUILD_COMPILER "" #define BUILD_TARGET "unknown" diff --git a/cpp/common.cpp b/cpp/common.cpp index a975e41..131673f 100644 --- a/cpp/common.cpp +++ b/cpp/common.cpp @@ -880,13 +880,13 @@ std::tuple llama_init_from_gpt_par } if (params.ignore_eos) { - params.sparams.logit_bias[llama_token_eos(lctx)] = -INFINITY; + params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } { LOG("warming up the model with an empty run\n"); - std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; + std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); llama_kv_cache_tokens_rm(lctx, -1, -1); llama_reset_timings(lctx); @@ -941,7 +941,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t } std::string llama_detokenize_spm(llama_context * ctx, const std::vector & tokens) { - const llama_token bos_id = llama_token_bos(ctx); + const llama_token bos_id = llama_token_bos(llama_get_model(ctx)); std::string piece; std::string result; @@ -1186,7 +1186,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); - const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx)); + const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx))); const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? 
"true" : "false"); diff --git a/cpp/ggml-metal-llama.metal b/cpp/ggml-metal-llama.metal index 69fc713..f4b4605 100644 --- a/cpp/ggml-metal-llama.metal +++ b/cpp/ggml-metal-llama.metal @@ -125,9 +125,17 @@ kernel void kernel_mul_row( } kernel void kernel_scale( + device const float * src0, + device float * dst, + constant float & scale, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * scale; +} + +kernel void kernel_scale_4( device const float4 * src0, device float4 * dst, - constant float & scale, + constant float & scale, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] * scale; } diff --git a/cpp/ggml-metal.m b/cpp/ggml-metal.m index ba7e743..c858811 100644 --- a/cpp/ggml-metal.m +++ b/cpp/ggml-metal.m @@ -62,6 +62,7 @@ LM_GGML_METAL_DECL_KERNEL(mul); LM_GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast LM_GGML_METAL_DECL_KERNEL(scale); + LM_GGML_METAL_DECL_KERNEL(scale_4); LM_GGML_METAL_DECL_KERNEL(silu); LM_GGML_METAL_DECL_KERNEL(relu); LM_GGML_METAL_DECL_KERNEL(gelu); @@ -249,6 +250,7 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char* format, LM_GGML_METAL_ADD_KERNEL(mul); LM_GGML_METAL_ADD_KERNEL(mul_row); LM_GGML_METAL_ADD_KERNEL(scale); + LM_GGML_METAL_ADD_KERNEL(scale_4); LM_GGML_METAL_ADD_KERNEL(silu); LM_GGML_METAL_ADD_KERNEL(relu); LM_GGML_METAL_ADD_KERNEL(gelu); @@ -347,6 +349,7 @@ void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { LM_GGML_METAL_DEL_KERNEL(mul); LM_GGML_METAL_DEL_KERNEL(mul_row); LM_GGML_METAL_DEL_KERNEL(scale); + LM_GGML_METAL_DEL_KERNEL(scale_4); LM_GGML_METAL_DEL_KERNEL(silu); LM_GGML_METAL_DEL_KERNEL(relu); LM_GGML_METAL_DEL_KERNEL(gelu); @@ -923,15 +926,20 @@ void lm_ggml_metal_graph_compute( const float scale = *(const float *) src1->data; - [encoder setComputePipelineState:ctx->pipeline_scale]; + int64_t n = lm_ggml_nelements(dst); + + if (n % 4 == 0) { + n /= 4; + [encoder setComputePipelineState:ctx->pipeline_scale_4]; + } else { + [encoder setComputePipelineState:ctx->pipeline_scale]; + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; - const int64_t n = lm_ggml_nelements(dst); - LM_GGML_ASSERT(n % 4 == 0); - - [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case LM_GGML_OP_UNARY: switch (lm_ggml_get_unary_op(gf->nodes[i])) { diff --git a/cpp/ggml.c b/cpp/ggml.c index d822a32..525ae1c 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -571,7 +571,6 @@ int64_t lm_ggml_cycles_per_ms(void) { #define lm_ggml_perf_cycles_per_ms() 0 #endif - // // cache line // @@ -1828,7 +1827,6 @@ lm_ggml_type_traits_t lm_ggml_internal_get_type_traits(enum lm_ggml_type type) { return type_traits[type]; } - // // simd mappings // @@ -4057,16 +4055,17 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "ALIBI", "CLAMP", "CONV_1D", + "CONV_1D_STAGE_0", + "CONV_1D_STAGE_1", "CONV_TRANSPOSE_1D", "CONV_2D", + "CONV_2D_STAGE_0", + "CONV_2D_STAGE_1", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", "UPSCALE", - "CONV_1D_STAGE_0", - "CONV_1D_STAGE_1", - "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", @@ -4092,7 +4091,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(LM_GGML_OP_COUNT == 71, "LM_GGML_OP_COUNT != 71"); 
+static_assert(LM_GGML_OP_COUNT == 73, "LM_GGML_OP_COUNT != 73"); static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "none", @@ -4143,16 +4142,17 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "alibi(x)", "clamp(x)", "conv_1d(x)", + "conv_1d_stage_0(x)", + "conv_1d_stage_1(x)", "conv_transpose_1d(x)", "conv_2d(x)", + "conv_2d_stage_0(x)", + "conv_2d_stage_1(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", - "conv_1d_stage_0(x)", - "conv_1d_stage_1(x)", - "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", @@ -4178,7 +4178,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(LM_GGML_OP_COUNT == 71, "LM_GGML_OP_COUNT != 71"); +static_assert(LM_GGML_OP_COUNT == 73, "LM_GGML_OP_COUNT != 73"); static_assert(LM_GGML_OP_POOL_COUNT == 2, "LM_GGML_OP_POOL_COUNT != 2"); @@ -4209,8 +4209,10 @@ static void lm_ggml_setup_op_has_task_pass(void) { p[LM_GGML_OP_CONV_1D ] = true; p[LM_GGML_OP_CONV_1D_STAGE_0 ] = true; p[LM_GGML_OP_CONV_1D_STAGE_1 ] = true; - p[LM_GGML_OP_CONV_2D ] = true; p[LM_GGML_OP_CONV_TRANSPOSE_1D ] = true; + p[LM_GGML_OP_CONV_2D ] = true; + p[LM_GGML_OP_CONV_2D_STAGE_0 ] = true; + p[LM_GGML_OP_CONV_2D_STAGE_1 ] = true; p[LM_GGML_OP_CONV_TRANSPOSE_2D ] = true; p[LM_GGML_OP_FLASH_ATTN_BACK ] = true; p[LM_GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -5954,7 +5956,6 @@ struct lm_ggml_tensor * lm_ggml_sqrt_inplace( return lm_ggml_sqrt_impl(ctx, a, true); } - // lm_ggml_log static struct lm_ggml_tensor * lm_ggml_log_impl( @@ -6008,7 +6009,6 @@ struct lm_ggml_tensor * lm_ggml_sum( return result; } - // lm_ggml_sum_rows struct lm_ggml_tensor * lm_ggml_sum_rows( @@ -6640,7 +6640,6 @@ struct lm_ggml_tensor * lm_ggml_set_2d_inplace( return lm_ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); } - // lm_ggml_cpy static struct lm_ggml_tensor * lm_ggml_cpy_impl( @@ -6720,7 +6719,6 @@ struct lm_ggml_tensor * lm_ggml_cont_inplace( return lm_ggml_cont_impl(ctx, a, true); } - // make contiguous, with new shape LM_GGML_API struct lm_ggml_tensor * lm_ggml_cont_1d( struct lm_ggml_context * ctx, @@ -7173,7 +7171,6 @@ struct lm_ggml_tensor * lm_ggml_diag( return result; } - // lm_ggml_diag_mask_inf static struct lm_ggml_tensor * lm_ggml_diag_mask_inf_impl( @@ -7285,7 +7282,6 @@ struct lm_ggml_tensor * lm_ggml_soft_max_inplace( return lm_ggml_soft_max_impl(ctx, a, true); } - // lm_ggml_soft_max_back static struct lm_ggml_tensor * lm_ggml_soft_max_back_impl( @@ -7702,7 +7698,11 @@ LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_1d( // lm_ggml_conv_2d -struct lm_ggml_tensor * lm_ggml_conv_2d( +// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OH, OW, IC*KH*KW] +static struct lm_ggml_tensor * lm_ggml_conv_2d_stage_0( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, @@ -7721,17 +7721,21 @@ struct lm_ggml_tensor * lm_ggml_conv_2d( is_node = true; } + const int64_t OH = lm_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); + const int64_t OW = lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const int64_t ne[4] = { - lm_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), - lm_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), - a->ne[3], b->ne[3], + a->ne[2] * a->ne[1] * a->ne[0], + OW, + OH, + b->ne[3], }; - struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne); + struct lm_ggml_tensor * result = 
lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F16, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1 }; lm_ggml_set_op_params(result, params, sizeof(params)); - result->op = LM_GGML_OP_CONV_2D; + result->op = LM_GGML_OP_CONV_2D_STAGE_0; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7740,8 +7744,61 @@ struct lm_ggml_tensor * lm_ggml_conv_2d( } -// lm_ggml_conv_2d_sk_p0 +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// a: [OC, IC, KH, KW] +// b: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static struct lm_ggml_tensor * lm_ggml_conv_2d_stage_1( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b) { + + bool is_node = false; + + if (a->grad || b->grad) { + LM_GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + b->ne[1], + b->ne[2], + a->ne[3], + b->ne[3], + }; + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne); + + result->op = LM_GGML_OP_CONV_2D_STAGE_1; + result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; + +} + +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OC, OH, OW] +struct lm_ggml_tensor * lm_ggml_conv_2d( + struct lm_ggml_context * ctx, + struct lm_ggml_tensor * a, + struct lm_ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + struct lm_ggml_tensor * result = lm_ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW] + result = lm_ggml_conv_2d_stage_1(ctx, a, result); + + return result; + +} + +// lm_ggml_conv_2d_sk_p0 struct lm_ggml_tensor * lm_ggml_conv_2d_sk_p0( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -8180,7 +8237,6 @@ static struct lm_ggml_tensor * lm_ggml_add_rel_pos_impl( return result; } - struct lm_ggml_tensor * lm_ggml_add_rel_pos( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, @@ -8625,8 +8681,6 @@ struct lm_ggml_tensor * lm_ggml_map_custom3_inplace( return lm_ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); } - - // lm_ggml_cross_entropy_loss struct lm_ggml_tensor * lm_ggml_cross_entropy_loss( @@ -9828,7 +9882,6 @@ static void lm_ggml_compute_forward_add1( } } - // lm_ggml_compute_forward_acc static void lm_ggml_compute_forward_acc_f32( @@ -9968,7 +10021,6 @@ static void lm_ggml_compute_forward_sub_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - #ifdef LM_GGML_USE_ACCELERATE vDSP_vsub( (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, @@ -10149,7 +10201,6 @@ static void lm_ggml_compute_forward_div_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - #ifdef LM_GGML_USE_ACCELERATE UNUSED(lm_ggml_vec_div_f32); @@ -10287,7 +10338,6 @@ static void lm_ggml_compute_forward_sqrt( } } - // lm_ggml_compute_forward_log static void lm_ggml_compute_forward_log_f32( @@ -12120,7 +12170,6 @@ static void lm_ggml_compute_forward_out_prod_f32( } } - //int64_t t1 = lm_ggml_perf_time_us(); //static int64_t acc = 0; //acc += t1 - t0; @@ -12316,7 +12365,6 @@ static void lm_ggml_compute_forward_scale_f32( const size_t nb1 = dst->nb[1]; - for (int i1 = ir0; i1 < ir1; i1++) { if (dst->data != src0->data) { // src0 is same shape as dst => same indices @@ -12714,7 +12762,6 @@ static void lm_ggml_compute_forward_get_rows_back_f32( } } - static void lm_ggml_compute_forward_get_rows_back( const struct 
lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, @@ -13997,6 +14044,7 @@ static void lm_ggml_compute_forward_conv_1d_f32( } } +// TODO: reuse lm_ggml_mul_mat or implement lm_ggml_im2col and remove stage_0 and stage_1 static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, lm_ggml_fp16_t * A, lm_ggml_fp16_t * B, @@ -14298,6 +14346,9 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f16_f32( } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, lm_ggml_nbytes(dst)); + return; } @@ -14370,7 +14421,7 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f32( const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); float * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00]; + dst_data[i00*ne02 + i02] = src[i00]; } } } @@ -14389,6 +14440,9 @@ static void lm_ggml_compute_forward_conv_transpose_1d_f32( } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, lm_ggml_nbytes(dst)); + return; } @@ -14450,28 +14504,190 @@ static void lm_ggml_compute_forward_conv_transpose_1d( // lm_ggml_compute_forward_conv_2d -static void lm_ggml_compute_forward_conv_2d_f16_f32( +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void lm_ggml_compute_forward_conv_2d_stage_0_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F16); + + int64_t t0 = lm_ggml_perf_time_us(); + UNUSED(t0); + + LM_GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = ne13; + const int64_t IC = ne12; + const int64_t IH = ne11; + const int64_t IW = ne10; + + // const int64_t OC = ne03; + // const int64_t IC = ne02; + const int64_t KH = ne01; + const int64_t KW = ne00; + + const int64_t OH = ne2; + const int64_t OW = ne1; + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == LM_GGML_TASK_INIT) { + memset(dst->data, 0, lm_ggml_nbytes(dst)); + return; + } + + if (params->type == LM_GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { + + // micro kernel + lm_ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + + for (int64_t ikh = 0; ikh < KH; ikh++) { + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + 
ikw] = LM_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + } + } + } + } + } + } + } + } +} + +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// src0: [OC, IC, KH, KW] +// src1: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static void lm_ggml_compute_forward_conv_2d_stage_1_f16( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F16); LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); int64_t t0 = lm_ggml_perf_time_us(); UNUSED(t0); + if (params->type == LM_GGML_TASK_INIT) { + return; + } + + if (params->type == LM_GGML_TASK_FINALIZE) { + return; + } + LM_GGML_TENSOR_BINARY_OP_LOCALS; + LM_GGML_ASSERT(nb00 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb10 == sizeof(lm_ggml_fp16_t)); + LM_GGML_ASSERT(nb0 == sizeof(float)); + + const int N = ne13; + const int OH = ne12; + const int OW = ne11; + + const int OC = ne03; + const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int ith = params->ith; + const int nth = params->nth; + + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; + + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + lm_ggml_fp16_t * A = (lm_ggml_fp16_t *)src0->data; // [m, k] + lm_ggml_fp16_t * B = (lm_ggml_fp16_t *)src1->data + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m, n] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + } +} + +static void lm_ggml_compute_forward_conv_2d_f16_f32( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F16); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT( dst->type == LM_GGML_TYPE_F32); + + int64_t t0 = lm_ggml_perf_time_us(); + UNUSED(t0); + + LM_GGML_TENSOR_BINARY_OP_LOCALS + + // src1: image [N, IC, IH, IW] + // src0: kernel [OC, IC, KH, KW] + // dst: result [N, OC, OH, OW] + // ne12: IC + // ne0: OW + // ne1: OH + // nk0: KW + // nk1: KH + // ne13: N + + const int N = ne13; + const int IC = ne12; + const int IH = ne11; + const int IW = ne10; + + const int OC = ne03; + // const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int OH = ne1; + const int OW = ne0; + const int ith = params->ith; const int nth = params->nth; - const int nk0 = ne00; - const int nk1 = ne01; + // const int nk0 = ne00; + // const int nk1 = ne01; // size of the convolution row - the kernel size unrolled across all channels - const int ew0 = nk0*nk1*ne02; + // const int ew0 = nk0*nk1*ne02; + // ew0: IC*KH*KW const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; @@ -14487,24 +14703,27 @@ static void lm_ggml_compute_forward_conv_2d_f16_f32( memset(params->wdata, 0, params->wsize); // prepare source data (src1) + // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW] + { lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; - for (int i13 = 0; i13 < ne13; i13++) { - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); - lm_ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0); - - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 
= 0; ik0 < nk0; ik0++) { - const int idx0 = i0*s0 + ik0*d0 - p0; - const int idx1 = i1*s1 + ik1*d1 - p1; - - if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - LM_GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + for (int in = 0; in < N; in++) { + for (int iic = 0; iic < IC; iic++) { + for (int ioh = 0; ioh < OH; ioh++) { + for (int iow = 0; iow < OW; iow++) { + + // micro kernel + lm_ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + + for (int ikh = 0; ikh < KH; ikh++) { + for (int ikw = 0; ikw < KW; ikw++) { + const int iiw = iow*s0 + ikw*d0 - p0; + const int iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = LM_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); } } } @@ -14521,30 +14740,22 @@ static void lm_ggml_compute_forward_conv_2d_f16_f32( return; } - // total patches in dst - const int np = ne2; - - // patches per thread - const int dp = (np + nth - 1)/nth; + lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; + // wdata: [N*OH*OW, IC*KH*KW] + // dst: result [N, OC, OH, OW] + // src0: kernel [OC, IC, KH, KW] - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; - lm_ggml_fp16_t * const wdata = (lm_ggml_fp16_t *) params->wdata + 0; + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + lm_ggml_fp16_t * A = (lm_ggml_fp16_t *)src0->data; // [m, k] + lm_ggml_fp16_t * B = (lm_ggml_fp16_t *)wdata + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m * k] - for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = ip0; i2 < ip1; i2++) { - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2); - - for (int i1 = 0; i1 < ne1; ++i1) { - for (int i0 = 0; i0 < ne0; ++i0) { - lm_ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, - (lm_ggml_fp16_t *) ((char *) src0->data + i2*nb03), - (lm_ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0); - } - } - } + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); } } @@ -14570,6 +14781,48 @@ static void lm_ggml_compute_forward_conv_2d( } } +static void lm_ggml_compute_forward_conv_2d_stage_0( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + switch (src0->type) { + case LM_GGML_TYPE_F16: + { + lm_ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst); + } break; + case LM_GGML_TYPE_F32: + { + LM_GGML_ASSERT(false); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + +static void lm_ggml_compute_forward_conv_2d_stage_1( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + const struct lm_ggml_tensor * src1, + struct lm_ggml_tensor * dst) { + switch (src0->type) { + case LM_GGML_TYPE_F16: + { + lm_ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst); + } break; + case LM_GGML_TYPE_F32: + { + LM_GGML_ASSERT(false); + } break; + default: + { + LM_GGML_ASSERT(false); + } break; + } +} + // lm_ggml_compute_forward_conv_transpose_2d static void lm_ggml_compute_forward_conv_transpose_2d( @@ -14628,6 +14881,8 @@ static void lm_ggml_compute_forward_conv_transpose_2d( } } + memset(dst->data, 0, lm_ggml_nbytes(dst)); + 
return; } @@ -16126,7 +16381,6 @@ static void lm_ggml_compute_forward_add_rel_pos_f32( const int ip0 = dp*ith; const int ip1 = MIN(ip0 + dp, np); - for (int64_t i13 = ip0; i13 < ip1; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { @@ -16193,7 +16447,6 @@ static void lm_ggml_compute_forward_map_unary_f32( } } - static void lm_ggml_compute_forward_map_unary( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, @@ -16241,7 +16494,6 @@ static void lm_ggml_compute_forward_map_binary_f32( } } - static void lm_ggml_compute_forward_map_binary( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, @@ -16293,7 +16545,6 @@ static void lm_ggml_compute_forward_map_custom2_f32( fun(dst, a, b); } - // lm_ggml_compute_forward_map_custom3 static void lm_ggml_compute_forward_map_custom3_f32( @@ -16568,7 +16819,6 @@ static void lm_ggml_compute_forward_cross_entropy_loss_back_f32( lm_ggml_vec_sub_f32(nc, ds0, ds0, s1); lm_ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr); - #ifndef NDEBUG for (int i = 0; i < nc; ++i) { assert(!isnan(ds0[i])); @@ -16596,12 +16846,15 @@ static void lm_ggml_compute_forward_cross_entropy_loss_back( } } - ///////////////////////////////// static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, struct lm_ggml_tensor * tensor) { LM_GGML_ASSERT(params); + if (tensor->op == LM_GGML_OP_NONE) { + return; + } + #ifdef LM_GGML_USE_CUBLAS bool skip_cpu = lm_ggml_cuda_compute_forward(params, tensor); if (skip_cpu) { @@ -16804,6 +17057,14 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru { lm_ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); } break; + case LM_GGML_OP_CONV_2D_STAGE_0: + { + lm_ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor); + } break; + case LM_GGML_OP_CONV_2D_STAGE_1: + { + lm_ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + } break; case LM_GGML_OP_CONV_TRANSPOSE_2D: { lm_ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -17733,11 +17994,19 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { LM_GGML_ASSERT(false); // TODO: not implemented } break; + case LM_GGML_OP_CONV_TRANSPOSE_1D: + { + LM_GGML_ASSERT(false); // TODO: not implemented + } break; case LM_GGML_OP_CONV_2D: { LM_GGML_ASSERT(false); // TODO: not implemented } break; - case LM_GGML_OP_CONV_TRANSPOSE_1D: + case LM_GGML_OP_CONV_2D_STAGE_0: + { + LM_GGML_ASSERT(false); // TODO: not implemented + } break; + case LM_GGML_OP_CONV_2D_STAGE_1: { LM_GGML_ASSERT(false); // TODO: not implemented } break; @@ -18666,6 +18935,7 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th const int64_t ne0 = node->ne[0]; const int64_t ne1 = node->ne[1]; const int64_t ne2 = node->ne[2]; + const int64_t ne3 = node->ne[3]; const int64_t nk = ne00*ne01; const int64_t ew0 = nk * ne02; @@ -18676,7 +18946,8 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th if (node->src[0]->type == LM_GGML_TYPE_F16 && node->src[1]->type == LM_GGML_TYPE_F32) { - cur = sizeof(lm_ggml_fp16_t)*(ne0*ne1*ew0); + // im2col: [N*OH*OW, IC*KH*KW] + cur = sizeof(lm_ggml_fp16_t)*(ne3*ne0*ne1*ew0); } else if (node->src[0]->type == LM_GGML_TYPE_F32 && node->src[1]->type == LM_GGML_TYPE_F32) { cur = sizeof(float)* (ne10*ne11*ne12); @@ -18686,6 +18957,14 @@ struct lm_ggml_cplan 
lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th work_size = MAX(work_size, cur); } break; + case LM_GGML_OP_CONV_2D_STAGE_0: + { + n_tasks = n_threads; + } break; + case LM_GGML_OP_CONV_2D_STAGE_1: + { + n_tasks = n_threads; + } break; case LM_GGML_OP_CONV_TRANSPOSE_2D: { n_tasks = n_threads; @@ -19874,7 +20153,6 @@ static enum lm_ggml_opt_result lm_ggml_opt_adam( opt->loss_after = fx; - // check convergence if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { LM_GGML_PRINT_DEBUG("converged\n"); diff --git a/cpp/ggml.h b/cpp/ggml.h index b252464..0416777 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -401,15 +401,16 @@ extern "C" { LM_GGML_OP_ALIBI, LM_GGML_OP_CLAMP, LM_GGML_OP_CONV_1D, - LM_GGML_OP_CONV_2D, + LM_GGML_OP_CONV_1D_STAGE_0, // internal + LM_GGML_OP_CONV_1D_STAGE_1, // internal LM_GGML_OP_CONV_TRANSPOSE_1D, + LM_GGML_OP_CONV_2D, + LM_GGML_OP_CONV_2D_STAGE_0, // internal + LM_GGML_OP_CONV_2D_STAGE_1, // internal LM_GGML_OP_CONV_TRANSPOSE_2D, LM_GGML_OP_POOL_1D, LM_GGML_OP_POOL_2D, - LM_GGML_OP_CONV_1D_STAGE_0, // internal - LM_GGML_OP_CONV_1D_STAGE_1, // internal - LM_GGML_OP_UPSCALE, // nearest interpolate LM_GGML_OP_FLASH_ATTN, @@ -1020,9 +1021,9 @@ extern "C" { struct lm_ggml_tensor * b, float eps); - // A: n columns, m rows - // B: n columns, p rows (i.e. we transpose it internally) - // result is m columns, p rows + // A: k columns, n rows => [ne03, ne02, n, k] + // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k] + // result is n columns, m rows => [ne03 * x, ne02 * y, m, n] LM_GGML_API struct lm_ggml_tensor * lm_ggml_mul_mat( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, diff --git a/cpp/llama.cpp b/cpp/llama.cpp index 3e214da..b1c3fd0 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -7504,7 +7504,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c } } - const llama_token eos = llama_token_eos(ctx); + const llama_token eos = llama_token_eos(&ctx->model); std::vector, llama_partial_utf8>> candidates_decoded; std::vector candidates_grammar; @@ -7714,7 +7714,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) { const int64_t t_start_sample_us = lm_ggml_time_us(); - if (token == llama_token_eos(ctx)) { + if (token == llama_token_eos(&ctx->model)) { for (const auto & stack : grammar->stacks) { if (stack.empty()) { return; @@ -8923,7 +8923,7 @@ struct llama_context * llama_new_context_with_model( // build worst-case graph int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); int n_past = cparams.n_ctx - n_tokens; - llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph lm_ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); #ifdef LM_GGML_USE_METAL @@ -9684,43 +9684,44 @@ float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } -const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].text.c_str(); +const char * llama_token_get_text(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].text.c_str(); 
} -float llama_token_get_score(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].score; +float llama_token_get_score(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].score; } -llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].type; +llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].type; } -llama_token llama_token_bos(const struct llama_context * ctx) { - return ctx->model.vocab.special_bos_id; +llama_token llama_token_bos(const struct llama_model * model) { + return model->vocab.special_bos_id; } -llama_token llama_token_eos(const struct llama_context * ctx) { - return ctx->model.vocab.special_eos_id; +llama_token llama_token_eos(const struct llama_model * model) { + return model->vocab.special_eos_id; } -llama_token llama_token_nl(const struct llama_context * ctx) { - return ctx->model.vocab.linefeed_id; +llama_token llama_token_nl(const struct llama_model * model) { + return model->vocab.linefeed_id; } -llama_token llama_token_prefix(const struct llama_context * ctx) { - return ctx->model.vocab.special_prefix_id; + +llama_token llama_token_prefix(const struct llama_model * model) { + return model->vocab.special_prefix_id; } -llama_token llama_token_middle(const struct llama_context * ctx) { - return ctx->model.vocab.special_middle_id; +llama_token llama_token_middle(const struct llama_model * model) { + return model->vocab.special_middle_id; } -llama_token llama_token_suffix(const struct llama_context * ctx) { - return ctx->model.vocab.special_suffix_id; +llama_token llama_token_suffix(const struct llama_model * model) { + return model->vocab.special_suffix_id; } -llama_token llama_token_eot(const struct llama_context * ctx) { - return ctx->model.vocab.special_eot_id; +llama_token llama_token_eot(const struct llama_model * model) { + return model->vocab.special_eot_id; } int llama_tokenize( diff --git a/cpp/llama.h b/cpp/llama.h index 04ff963..fa6afd9 100644 --- a/cpp/llama.h +++ b/cpp/llama.h @@ -494,21 +494,22 @@ extern "C" { // Vocab // - LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token); + LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token); - LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token); + LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); - LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token); + LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token); // Special tokens - LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence - LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence - LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line + LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence + LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence + LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line + // codellama infill tokens - LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning 
of infill prefix - LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle - LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix - LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx); // End of infill middle + LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix + LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle + LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix + LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle // // Tokenization diff --git a/cpp/log.h b/cpp/log.h index b99e9e7..a5084d8 100644 --- a/cpp/log.h +++ b/cpp/log.h @@ -97,22 +97,23 @@ #define LOG_TEE_TARGET stderr #endif +// NOTE: currently disabled as it produces too many log files // Utility to obtain "pid" like unique process id and use it when creating log files. -inline std::string log_get_pid() -{ - static std::string pid; - if (pid.empty()) - { - // std::this_thread::get_id() is the most portable way of obtaining a "process id" - // it's not the same as "pid" but is unique enough to solve multiple instances - // trying to write to the same log. - std::stringstream ss; - ss << std::this_thread::get_id(); - pid = ss.str(); - } - - return pid; -} +//inline std::string log_get_pid() +//{ +// static std::string pid; +// if (pid.empty()) +// { +// // std::this_thread::get_id() is the most portable way of obtaining a "process id" +// // it's not the same as "pid" but is unique enough to solve multiple instances +// // trying to write to the same log. +// std::stringstream ss; +// ss << std::this_thread::get_id(); +// pid = ss.str(); +// } +// +// return pid; +//} // Utility function for generating log file names with unique id based on thread id. 
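For orientation on the cpp/log.h change here: log_get_pid() and the per-instance id segment of generated log file names are commented out (the NOTE above cites too many log files), so log_filename_generator_impl now yields plain "basename.extension" names. A minimal restatement of the patched behaviour, assuming the same stringstream approach; this sketch is not the literal patched function.

#include <sstream>
#include <string>

// Sketch of the generator after this sync: the per-instance id segment is gone.
inline std::string log_filename_sketch(const std::string & base, const std::string & ext) {
    std::stringstream buf;
    buf << base;
    // buf << "." << log_get_pid();   // disabled: one shared log file instead of one per instance
    buf << "." << ext;
    return buf.str();                 // e.g. log_filename_sketch("llama", "log") -> "llama.log"
}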
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama..log" @@ -126,8 +127,8 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base std::stringstream buf; buf << log_file_basename; - buf << "."; - buf << log_get_pid(); + //buf << "."; + //buf << log_get_pid(); buf << "."; buf << log_file_extension; diff --git a/cpp/rn-llama.hpp b/cpp/rn-llama.hpp index aeae7eb..ba6c860 100644 --- a/cpp/rn-llama.hpp +++ b/cpp/rn-llama.hpp @@ -349,7 +349,7 @@ struct llama_rn_context if (params.n_predict == 0) { has_next_token = false; - result.tok = llama_token_eos(ctx); + result.tok = llama_token_eos(model); return result; } @@ -384,7 +384,7 @@ struct llama_rn_context // decrement remaining sampling budget --n_remain; - if (!embd.empty() && embd.back() == llama_token_eos(ctx)) + if (!embd.empty() && embd.back() == llama_token_eos(model)) { // stopping_word = llama_token_to_piece(ctx, embd.back()); has_next_token = false; diff --git a/cpp/sampling.cpp b/cpp/sampling.cpp index 6f0af3c..5258d4e 100644 --- a/cpp/sampling.cpp +++ b/cpp/sampling.cpp @@ -147,7 +147,7 @@ llama_token llama_sampling_sample( // apply penalties if (!prev.empty()) { - const float nl_logit = logits[llama_token_nl(ctx_main)]; + const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))]; llama_sample_repetition_penalties(ctx_main, &cur_p, prev.data() + prev.size() - penalty_last_n, @@ -155,7 +155,7 @@ llama_token llama_sampling_sample( if (!penalize_nl) { for (size_t idx = 0; idx < cur_p.size; idx++) { - if (cur_p.data[idx].id == llama_token_nl(ctx_main)) { + if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) { cur_p.data[idx].logit = nl_logit; break; } diff --git a/ios/RNLlamaContext.mm b/ios/RNLlamaContext.mm index 29aab44..ab8ea2c 100644 --- a/ios/RNLlamaContext.mm +++ b/ios/RNLlamaContext.mm @@ -177,7 +177,7 @@ - (NSDictionary *)completion:(NSDictionary *)params sparams.logit_bias.clear(); if (params[@"ignore_eos"] && [params[@"ignore_eos"] boolValue]) { - sparams.logit_bias[llama_token_eos(llama->ctx)] = -INFINITY; + sparams.logit_bias[llama_token_eos(llama->model)] = -INFINITY; } if (params[@"logit_bias"] && [params[@"logit_bias"] isKindOfClass:[NSArray class]]) { diff --git a/llama.cpp b/llama.cpp index 96981f3..34b2a5e 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 96981f37b1e3f450d9e63e571514217bf60f0a7f +Subproject commit 34b2a5e1ee4fe6295fb4420eb91131d743694c65 diff --git a/scripts/ggml-metal.m.patch b/scripts/ggml-metal.m.patch index ea1c549..add06b2 100644 --- a/scripts/ggml-metal.m.patch +++ b/scripts/ggml-metal.m.patch @@ -1,6 +1,6 @@ ---- ggml-metal.m.orig 2023-10-25 10:31:15 -+++ ggml-metal.m 2023-10-25 10:31:37 -@@ -208,7 +208,7 @@ +--- ggml-metal.m.orig 2023-10-27 12:54:53 ++++ ggml-metal.m 2023-10-27 12:54:54 +@@ -209,7 +209,7 @@ } else { LM_GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); diff --git a/scripts/log.h.patch b/scripts/log.h.patch index 71b16d7..f531ade 100644 --- a/scripts/log.h.patch +++ b/scripts/log.h.patch @@ -1,9 +1,9 @@ ---- log.h.orig 2023-10-12 09:37:10 -+++ log.h 2023-10-12 09:36:47 -@@ -313,6 +313,19 @@ +--- log.h.orig 2023-10-27 12:54:53 ++++ log.h 2023-10-27 12:54:54 +@@ -314,6 +314,19 @@ #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n") #endif - + +#if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING) +#include +#define LLAMA_ANDROID_LOG_TAG "RNLLAMA_LOG_ANDROID"
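A closing note on the change that recurs across jni.cpp, common.cpp, rn-llama.hpp, sampling.cpp, RNLlamaContext.mm, llama.cpp and llama.h in this sync: the vocab and special-token accessors (llama_token_get_text/score/type, llama_token_bos/eos/nl and the infill tokens) now take a const llama_model * instead of a llama_context *. Call sites that already hold the model pass it directly; call sites that only hold a context go through llama_get_model(). The sketch below shows those two call-site shapes under that assumption; it is illustrative and not code from the patch.

#include <cmath>
#include <unordered_map>
#include "llama.h"

// After this sync the special-token helpers take the model, not the context.

// 1) Only a context is at hand -> go through llama_get_model().
static bool is_eos_token(struct llama_context * ctx, llama_token tok) {
    return tok == llama_token_eos(llama_get_model(ctx)); // was: llama_token_eos(ctx)
}

// 2) The model is already at hand (as in rn-llama.hpp and jni.cpp) -> pass it directly.
//    The map here stands in for the sampling params' logit_bias field.
static void ignore_eos(const struct llama_model * model,
                       std::unordered_map<llama_token, float> & logit_bias) {
    logit_bias[llama_token_eos(model)] = -INFINITY;       // EOS can never be sampled
}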