diff --git a/cpp/common.cpp b/cpp/common.cpp index f8f9e025..3a82d7c4 100644 --- a/cpp/common.cpp +++ b/cpp/common.cpp @@ -226,6 +226,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.n_ctx = std::stoi(argv[i]); + } else if (arg == "--grp-attn-n" || arg == "-gan") { + if (++i >= argc) { + invalid_param = true; + break; + } + + params.grp_attn_n = std::stoi(argv[i]); + } else if (arg == "--grp-attn-w" || arg == "-gaw") { + if (++i >= argc) { + invalid_param = true; + break; + } + + params.grp_attn_w = std::stoi(argv[i]); } else if (arg == "--rope-freq-base") { if (++i >= argc) { invalid_param = true; @@ -535,9 +549,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD params.n_gpu_layers = std::stoi(argv[i]); -#else +#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif @@ -546,9 +559,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD params.n_gpu_layers_draft = std::stoi(argv[i]); -#else +#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif @@ -557,25 +569,44 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef LM_GGML_USE_CUBLAS params.main_gpu = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); -#endif +#ifndef LM_GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n"); +#endif // LM_GGML_USE_CUBLAS + } else if (arg == "--split-mode" || arg == "-sm") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string arg_next = argv[i]; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_NONE; + } else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_LAYER; + } else if (arg_next == "row") { + params.split_mode = LLAMA_SPLIT_ROW; + } else { + invalid_param = true; + break; + } +#ifndef LM_GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); +#endif // LM_GGML_USE_CUBLAS } else if (arg == "--tensor-split" || arg == "-ts") { if (++i >= argc) { invalid_param = true; break; } -#ifdef LM_GGML_USE_CUBLAS std::string arg_next = argv[i]; // split string by , and / const std::regex regex{R"([,/]+)"}; std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; std::vector split_arg{it, {}}; - LM_GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - + if (split_arg.size() >= LLAMA_MAX_DEVICES) { + invalid_param = true; + break; + } for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { if (i < split_arg.size()) { params.tensor_split[i] = std::stof(split_arg[i]); @@ -583,14 +614,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.tensor_split[i] = 0.0f; } } -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); -#endif // LM_GGML_USE_CUBLAS - } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { -#ifdef LM_GGML_USE_CUBLAS - params.mul_mat_q = false; -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n"); +#ifndef LM_GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n"); #endif // LM_GGML_USE_CUBLAS } else if (arg == "--no-mmap") { params.use_mmap = false; @@ -598,6 +623,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.numa = true; } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; + } else if (arg == "--no-display-prompt") { + params.display_prompt = false; } else if (arg == "-r" || arg == "--reverse-prompt") { if (++i >= argc) { invalid_param = true; @@ -622,6 +649,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.ppl_stride = std::stoi(argv[i]); + } else if (arg == "-ptc" || arg == "--print-token-count") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_print = std::stoi(argv[i]); } else if (arg == "--ppl-output-type") { if (++i >= argc) { invalid_param = true; @@ -804,7 +837,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf("\n"); printf("options:\n"); printf(" -h, --help show this help message and exit\n"); - printf(" --version show version and build info\n"); + printf(" --version show version and build info\n"); printf(" -i, --interactive run in interactive mode\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); @@ -901,16 +934,22 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" number of layers to store in VRAM\n"); printf(" -ngld N, --n-gpu-layers-draft N\n"); printf(" number of layers to store in VRAM for the draft model\n"); - printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); -#ifdef LM_GGML_USE_CUBLAS - printf(" -nommq, --no-mul-mat-q\n"); - printf(" use " LM_GGML_CUBLAS_NAME " instead of custom mul_mat_q " LM_GGML_CUDA_NAME " kernels.\n"); - printf(" Not recommended since this is both slower and uses more VRAM.\n"); -#endif // LM_GGML_USE_CUBLAS + printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); + printf(" -ts SPLIT, --tensor-split SPLIT\n"); + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); #endif - printf(" --verbose-prompt print prompt before generation\n"); + printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); + printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false"); + printf(" -gan N, --grp-attn-n N\n"); + printf(" group-attention factor (default: %d)\n", params.grp_attn_n); + printf(" -gaw N, --grp-attn-w N\n"); + printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w); printf(" -dkvc, --dump-kv-cache\n"); printf(" verbose print of the KV cache\n"); printf(" -nkvo, --no-kv-offload\n"); @@ -926,12 +965,14 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); printf(" -md FNAME, --model-draft FNAME\n"); - printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str()); + printf(" draft model for speculative decoding\n"); printf(" -ld LOGDIR, --logdir LOGDIR\n"); printf(" path under which to save YAML logs (no logging if unset)\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); + printf(" -ptc N, --print-token-count N\n"); + printf(" print token count every N tokens (default: %d)\n", params.n_print); printf("\n"); #ifndef LOG_DISABLE_LOGS log_print_usage(); @@ -1021,6 +1062,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.n_gpu_layers = params.n_gpu_layers; } mparams.main_gpu = params.main_gpu; + mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; @@ -1035,6 +1077,9 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & } static lm_ggml_type kv_cache_type_from_str(const std::string & s) { + if (s == "f32") { + return LM_GGML_TYPE_F32; + } if (s == "f16") { return LM_GGML_TYPE_F16; } @@ -1400,6 +1445,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); fprintf(stream, "cpu_has_arm_fma: %s\n", lm_ggml_cpu_has_arm_fma() ? "true" : "false"); fprintf(stream, "cpu_has_avx: %s\n", lm_ggml_cpu_has_avx() ? "true" : "false"); + fprintf(stream, "cpu_has_avx_vnni: %s\n", lm_ggml_cpu_has_avx_vnni() ? "true" : "false"); fprintf(stream, "cpu_has_avx2: %s\n", lm_ggml_cpu_has_avx2() ? "true" : "false"); fprintf(stream, "cpu_has_avx512: %s\n", lm_ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", lm_ggml_cpu_has_avx512_vbmi() ? "true" : "false"); @@ -1545,6 +1591,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); + fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); } // diff --git a/cpp/common.h b/cpp/common.h index 254df736..4e1db539 100644 --- a/cpp/common.h +++ b/cpp/common.h @@ -40,7 +40,7 @@ struct gpt_params { int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 16; // number of tokens to draft during speculative decoding + int32_t n_draft = 8; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_sequences = 1; // number of sequences to decode @@ -48,9 +48,13 @@ struct gpt_params { float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. + int32_t grp_attn_n = 1; // group-attention factor + int32_t grp_attn_w = 512; // group-attention width + int32_t n_print = -1; // print token count every n tokens (-1 = disabled) float rope_freq_base = 0.0f; // RoPE base frequency float rope_freq_scale = 0.0f; // RoPE frequency scaling factor float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor @@ -111,6 +115,7 @@ struct gpt_params { bool use_mlock = false; // use mlock to keep model in memory bool numa = false; // attempt optimizations that help on some NUMA systems bool verbose_prompt = false; // print prompt tokens before generation + bool display_prompt = true; // print prompt before generation bool infill = false; // use infill mode bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool no_kv_offload = false; // disable KV offloading diff --git a/cpp/ggml-alloc.c b/cpp/ggml-alloc.c index ac293c20..2df93eea 100644 --- a/cpp/ggml-alloc.c +++ b/cpp/ggml-alloc.c @@ -72,7 +72,7 @@ static void remove_allocated_tensor(lm_ggml_tallocr_t alloc, struct lm_ggml_tens // check if a tensor is allocated by this buffer static bool lm_ggml_tallocr_is_own(lm_ggml_tallocr_t alloc, const struct lm_ggml_tensor * tensor) { - return tensor->buffer == alloc->buffer; + return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer); } static bool lm_ggml_is_view(struct lm_ggml_tensor * t) { @@ -102,8 +102,6 @@ void lm_ggml_tallocr_alloc(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tens } } - AT_PRINTF("block %d\n", best_fit_block); - if (best_fit_block == -1) { // the last block is our last resort struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; @@ -117,6 +115,7 @@ void lm_ggml_tallocr_alloc(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tens return; } } + struct free_block * block = &alloc->free_blocks[best_fit_block]; void * addr = block->addr; block->addr = (char*)block->addr + size; @@ -129,6 +128,8 @@ void lm_ggml_tallocr_alloc(lm_ggml_tallocr_t alloc, struct lm_ggml_tensor * tens } } + AT_PRINTF("block %d, addr %p\n", best_fit_block, addr); + tensor->data = addr; tensor->buffer = alloc->buffer; if (!alloc->measure) { @@ -229,6 +230,7 @@ void lm_ggml_tallocr_reset(lm_ggml_tallocr_t alloc) { alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows } else { alloc->free_blocks[0].size = lm_ggml_backend_buffer_get_size(alloc->buffer) - align_offset; + lm_ggml_backend_buffer_reset(alloc->buffer); } } @@ -263,9 +265,9 @@ lm_ggml_tallocr_t lm_ggml_tallocr_new_measure(size_t alignment) { return alloc; } -lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_backend(struct lm_ggml_backend * backend) { +lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_buft(struct lm_ggml_backend_buffer_type * buft) { // create a backend buffer to get the correct tensor allocation sizes - lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_buffer(backend, 1); + lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, 1); // TODO: move alloc initialization to a common lm_ggml_tallocr_new_impl function lm_ggml_tallocr_t alloc = lm_ggml_tallocr_new_from_buffer(buffer); @@ -275,13 +277,22 @@ lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_backend(struct lm_ggml_backen return alloc; } -lm_ggml_tallocr_t lm_ggml_tallocr_new_from_backend(struct lm_ggml_backend * backend, size_t size) { - lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_buffer(backend, size); +lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_backend(struct lm_ggml_backend * backend) { + return lm_ggml_tallocr_new_measure_from_buft(lm_ggml_backend_get_default_buffer_type(backend)); +} + +lm_ggml_tallocr_t lm_ggml_tallocr_new_from_buft(struct lm_ggml_backend_buffer_type * buft, size_t size) { + // create a backend buffer to get the correct tensor allocation sizes + lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, size); lm_ggml_tallocr_t alloc = lm_ggml_tallocr_new_from_buffer(buffer); alloc->buffer_owned = true; return alloc; } +lm_ggml_tallocr_t lm_ggml_tallocr_new_from_backend(struct lm_ggml_backend * backend, size_t size) { + return lm_ggml_tallocr_new_from_buft(lm_ggml_backend_get_default_buffer_type(backend), size); +} + lm_ggml_tallocr_t lm_ggml_tallocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer) { lm_ggml_tallocr_t alloc = (lm_ggml_tallocr_t)malloc(sizeof(struct lm_ggml_tallocr)); @@ -449,11 +460,10 @@ static void init_view(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * view, bo if (update_backend) { view->backend = view->view_src->backend; } - view->buffer = view->view_src->buffer; + // views are initialized in the alloc buffer rather than the view_src buffer + view->buffer = alloc->buffer; view->data = (char *)view->view_src->data + view->view_offs; - // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend - // due to the lm_ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras assert(lm_ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft); if (!alloc->measure) { @@ -736,6 +746,10 @@ void lm_ggml_allocr_set_parse_seq(lm_ggml_allocr_t alloc, const int * list, int } void lm_ggml_allocr_free(lm_ggml_allocr_t alloc) { + if (alloc == NULL) { + return; + } + lm_ggml_gallocr_free(alloc->galloc); lm_ggml_tallocr_free(alloc->talloc); free(alloc); @@ -775,11 +789,22 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g } if (nbytes == 0) { - fprintf(stderr, "%s: no tensors to allocate\n", __func__); + // all the tensors in the context are already allocated +#ifndef NDEBUG + fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__); +#endif return NULL; } lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, nbytes); + if (buffer == NULL) { + // failed to allocate buffer +#ifndef NDEBUG + fprintf(stderr, "%s: failed to allocate buffer\n", __func__); +#endif + return NULL; + } + lm_ggml_tallocr_t tallocr = lm_ggml_tallocr_new_from_buffer(buffer); for (struct lm_ggml_tensor * t = lm_ggml_get_first_tensor(ctx); t != NULL; t = lm_ggml_get_next_tensor(ctx, t)) { @@ -789,6 +814,11 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g } else { lm_ggml_backend_view_init(buffer, t); } + } else { + if (t->view_src != NULL) { + // view of a pre-allocated tensor + lm_ggml_backend_view_init(buffer, t); + } } } diff --git a/cpp/ggml-alloc.h b/cpp/ggml-alloc.h index fa3cd61d..0d5416b9 100644 --- a/cpp/ggml-alloc.h +++ b/cpp/ggml-alloc.h @@ -52,8 +52,10 @@ typedef struct lm_ggml_tallocr * lm_ggml_tallocr_t; LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new(void * data, size_t size, size_t alignment); LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_measure(size_t alignment); -LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer); +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_from_buft(struct lm_ggml_backend_buffer_type * buft, size_t size); LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_from_backend(struct lm_ggml_backend * backend, size_t size); // allocates an owned buffer +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_from_buffer(struct lm_ggml_backend_buffer * buffer); +LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_buft(struct lm_ggml_backend_buffer_type * buft); LM_GGML_API lm_ggml_tallocr_t lm_ggml_tallocr_new_measure_from_backend(struct lm_ggml_backend * backend); LM_GGML_API struct lm_ggml_backend_buffer * lm_ggml_tallocr_get_buffer(lm_ggml_tallocr_t talloc); diff --git a/cpp/ggml-backend-impl.h b/cpp/ggml-backend-impl.h index fd83f84f..47622007 100644 --- a/cpp/ggml-backend-impl.h +++ b/cpp/ggml-backend-impl.h @@ -16,10 +16,14 @@ extern "C" { typedef void * lm_ggml_backend_buffer_type_context_t; struct lm_ggml_backend_buffer_type_i { + const char * (*get_name) (lm_ggml_backend_buffer_type_t buft); lm_ggml_backend_buffer_t (*alloc_buffer) (lm_ggml_backend_buffer_type_t buft, size_t size); size_t (*get_alignment) (lm_ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*get_alloc_size) (lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor); // data size needed to allocate the tensor, including padding + size_t (*get_alloc_size) (lm_ggml_backend_buffer_type_t buft, const struct lm_ggml_tensor * tensor); // data size needed to allocate the tensor, including padding bool (*supports_backend)(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend); // check if the buffer type is usable by the backend + // check if tensor data is in host memory + // should be equivalent to supports_backend(buft, lm_ggml_backend_cpu_init()) + bool (*is_host) (lm_ggml_backend_buffer_type_t buft); }; struct lm_ggml_backend_buffer_type { @@ -31,15 +35,15 @@ extern "C" { typedef void * lm_ggml_backend_buffer_context_t; struct lm_ggml_backend_buffer_i { - void (*free_buffer)(lm_ggml_backend_buffer_t buffer); - //void (*reset) (lm_ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras - void * (*get_base) (lm_ggml_backend_buffer_t buffer); - void (*init_tensor)(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); - void (*set_tensor) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor) (lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); - // (optional) copy tensor between different buffer-type, allow for single-copy tranfers - void (*cpy_tensor_from)(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); - void (*cpy_tensor_to) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); + const char * (*get_name) (lm_ggml_backend_buffer_t buffer); + void (*free_buffer)(lm_ggml_backend_buffer_t buffer); + void * (*get_base) (lm_ggml_backend_buffer_t buffer); + void (*init_tensor)(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); + void (*set_tensor) (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor) (lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*cpy_tensor) (lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); // dst is in the buffer, src may be in any buffer + void (*clear) (lm_ggml_backend_buffer_t buffer, uint8_t value); + void (*reset) (lm_ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras }; struct lm_ggml_backend_buffer { @@ -47,6 +51,7 @@ extern "C" { lm_ggml_backend_buffer_type_t buft; lm_ggml_backend_buffer_context_t context; size_t size; + enum lm_ggml_backend_buffer_usage usage; }; lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( @@ -55,6 +60,8 @@ extern "C" { lm_ggml_backend_buffer_context_t context, size_t size); + // do not use directly, use lm_ggml_backend_tensor_copy instead + bool lm_ggml_backend_buffer_copy_tensor(const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); // // Backend @@ -70,23 +77,21 @@ extern "C" { // buffer allocation lm_ggml_backend_buffer_type_t (*get_default_buffer_type)(lm_ggml_backend_t backend); - // (optional) asynchroneous tensor data access + // (optional) asynchronous tensor data access void (*set_tensor_async)(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size); void (*get_tensor_async)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*cpy_tensor_async)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); - // (optional) asynchroneous tensor copy - void (*cpy_tensor_from_async)(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); - void (*cpy_tensor_to_async) (lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst); - - void (*synchronize) (lm_ggml_backend_t backend); + // (optional) complete all pending operations + void (*synchronize)(lm_ggml_backend_t backend); // compute graph with a plan - lm_ggml_backend_graph_plan_t (*graph_plan_create) (lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph); + lm_ggml_backend_graph_plan_t (*graph_plan_create) (lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph); void (*graph_plan_free) (lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan); void (*graph_plan_compute)(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan); - // compute graph without a plan - void (*graph_compute)(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph); + // compute graph without a plan (async) + bool (*graph_compute)(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph); // check if the backend supports an operation bool (*supports_op)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op); @@ -98,7 +103,6 @@ extern "C" { lm_ggml_backend_context_t context; }; - // // Backend registry // diff --git a/cpp/ggml-backend.c b/cpp/ggml-backend.c index dea815d8..b9654e10 100644 --- a/cpp/ggml-backend.c +++ b/cpp/ggml-backend.c @@ -15,6 +15,10 @@ // backend buffer type +const char * lm_ggml_backend_buft_name(lm_ggml_backend_buffer_type_t buft) { + return buft->iface.get_name(buft); +} + lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) { return buft->iface.alloc_buffer(buft, size); } @@ -35,6 +39,13 @@ bool lm_ggml_backend_buft_supports_backend(lm_ggml_backend_buffer_type_t buft, l return buft->iface.supports_backend(buft, backend); } +bool lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_type_t buft) { + if (buft->iface.is_host) { + return buft->iface.is_host(buft); + } + return false; +} + // backend buffer lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( @@ -51,11 +62,16 @@ lm_ggml_backend_buffer_t lm_ggml_backend_buffer_init( /* .buft = */ buft, /* .context = */ context, /* .size = */ size, + /* .usage = */ LM_GGML_BACKEND_BUFFER_USAGE_ANY }; return buffer; } +const char * lm_ggml_backend_buffer_name(lm_ggml_backend_buffer_t buffer) { + return buffer->iface.get_name(buffer); +} + void lm_ggml_backend_buffer_free(lm_ggml_backend_buffer_t buffer) { if (buffer == NULL) { return; @@ -87,17 +103,43 @@ void lm_ggml_backend_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct } size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer) { - return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_type(buffer)); + return lm_ggml_backend_buft_get_alignment(lm_ggml_backend_buffer_get_type(buffer)); } size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) { - return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type(buffer), tensor); + return lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_get_type(buffer), tensor); +} + +void lm_ggml_backend_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) { + buffer->iface.clear(buffer, value); +} + +bool lm_ggml_backend_buffer_is_host(lm_ggml_backend_buffer_t buffer) { + return lm_ggml_backend_buft_is_host(lm_ggml_backend_buffer_get_type(buffer)); +} + +void lm_ggml_backend_buffer_set_usage(lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage) { + buffer->usage = usage; } -lm_ggml_backend_buffer_type_t lm_ggml_backend_buffer_type(lm_ggml_backend_buffer_t buffer) { +lm_ggml_backend_buffer_type_t lm_ggml_backend_buffer_get_type(lm_ggml_backend_buffer_t buffer) { return buffer->buft; } +void lm_ggml_backend_buffer_reset(lm_ggml_backend_buffer_t buffer) { + if (buffer->iface.reset) { + buffer->iface.reset(buffer); + } +} + +bool lm_ggml_backend_buffer_copy_tensor(const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { + lm_ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer; + if (dst_buf->iface.cpy_tensor) { + return src->buffer->iface.cpy_tensor(dst_buf, src, dst); + } + return false; +} + // backend const char * lm_ggml_backend_name(lm_ggml_backend_t backend) { @@ -131,30 +173,42 @@ void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_ LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); - backend->iface.set_tensor_async(backend, tensor, data, offset, size); + if (backend->iface.set_tensor_async == NULL) { + lm_ggml_backend_tensor_set(tensor, data, offset, size); + } else { + backend->iface.set_tensor_async(backend, tensor, data, offset, size); + } } void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); - backend->iface.get_tensor_async(backend, tensor, data, offset, size); + if (backend->iface.get_tensor_async == NULL) { + lm_ggml_backend_tensor_get(tensor, data, offset, size); + } else { + backend->iface.get_tensor_async(backend, tensor, data, offset, size); + } } void lm_ggml_backend_tensor_set(struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - LM_GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); + LM_GGML_ASSERT(buf != NULL && "tensor buffer not set"); LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); - tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size); + tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size); } void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { + lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); LM_GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); - tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size); + tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size); } void lm_ggml_backend_synchronize(lm_ggml_backend_t backend) { @@ -175,16 +229,10 @@ void lm_ggml_backend_graph_plan_free(lm_ggml_backend_t backend, lm_ggml_backend_ void lm_ggml_backend_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan) { backend->iface.graph_plan_compute(backend, plan); - - // TODO: optional sync - lm_ggml_backend_synchronize(backend); } -void lm_ggml_backend_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) { - backend->iface.graph_compute(backend, cgraph); - - // TODO: optional sync - lm_ggml_backend_synchronize(backend); +bool lm_ggml_backend_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) { + return backend->iface.graph_compute(backend, cgraph); } bool lm_ggml_backend_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) { @@ -209,28 +257,20 @@ static bool lm_ggml_are_same_layout(const struct lm_ggml_tensor * a, const struc } void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { - //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]); - //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]); LM_GGML_ASSERT(lm_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); - // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, lm_ggml_backend_name(src->backend), lm_ggml_backend_name(dst->backend), lm_ggml_nbytes(src)); - if (src == dst) { return; } - // TODO: allow backends to support copy to/from same backend - - if (dst->buffer->iface.cpy_tensor_from != NULL) { - dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst); - } else if (src->buffer->iface.cpy_tensor_to != NULL) { - src->buffer->iface.cpy_tensor_to(src->buffer, src, dst); - } else { - // shouldn't be hit when copying from/to CPU - #ifndef NDEBUG - fprintf(stderr, "lm_ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to " - "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name); - #endif + if (lm_ggml_backend_buffer_is_host(src->buffer)) { + lm_ggml_backend_tensor_set(dst, src->data, 0, lm_ggml_nbytes(src)); + } else if (lm_ggml_backend_buffer_is_host(dst->buffer)) { + lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src)); + } else if (!lm_ggml_backend_buffer_copy_tensor(src, dst)) { +#ifndef NDEBUG + fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, lm_ggml_backend_buffer_name(src->buffer), lm_ggml_backend_buffer_name(dst->buffer)); +#endif size_t nbytes = lm_ggml_nbytes(src); void * data = malloc(nbytes); lm_ggml_backend_tensor_get(src, data, 0, nbytes); @@ -239,6 +279,31 @@ void lm_ggml_backend_tensor_copy(struct lm_ggml_tensor * src, struct lm_ggml_ten } } +void lm_ggml_backend_tensor_copy_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { + LM_GGML_ASSERT(lm_ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); + + if (src == dst) { + return; + } + + if (lm_ggml_backend_buft_supports_backend(src->buffer->buft, backend) && lm_ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) { + if (backend->iface.cpy_tensor_async != NULL) { + if (backend->iface.cpy_tensor_async(backend, src, dst)) { + return; + } + } + } + + size_t nbytes = lm_ggml_nbytes(src); + if (lm_ggml_backend_buffer_is_host(src->buffer)) { + lm_ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes); + } + else { + lm_ggml_backend_tensor_copy(src, dst); + } +} + + // backend registry #define LM_GGML_MAX_BACKENDS_REG 16 @@ -282,7 +347,7 @@ static void lm_ggml_backend_registry_init(void) { void lm_ggml_backend_register(const char * name, lm_ggml_backend_init_fn init_fn, lm_ggml_backend_buffer_type_t default_buffer_type, void * user_data) { LM_GGML_ASSERT(lm_ggml_backend_registry_count < LM_GGML_MAX_BACKENDS_REG); - int id = lm_ggml_backend_registry_count; + size_t id = lm_ggml_backend_registry_count; lm_ggml_backend_registry[id] = (struct lm_ggml_backend_reg) { /* .name = */ {0}, @@ -315,6 +380,8 @@ size_t lm_ggml_backend_reg_find_by_name(const char * name) { return i; } } + + // not found return SIZE_MAX; } @@ -325,15 +392,15 @@ lm_ggml_backend_t lm_ggml_backend_reg_init_backend_from_str(const char * backend const char * params = strchr(backend_str, ':'); char backend_name[128]; if (params == NULL) { - strcpy(backend_name, backend_str); + snprintf(backend_name, sizeof(backend_name), "%s", backend_str); params = ""; } else { - strncpy(backend_name, backend_str, params - backend_str); - backend_name[params - backend_str] = '\0'; + snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str); params++; } size_t backend_i = lm_ggml_backend_reg_find_by_name(backend_name); + if (backend_i == SIZE_MAX) { fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name); return NULL; @@ -372,68 +439,79 @@ lm_ggml_backend_buffer_t lm_ggml_backend_reg_alloc_buffer(size_t i, size_t size) // backend CPU +static const char * lm_ggml_backend_cpu_buffer_name(lm_ggml_backend_buffer_t buffer) { + return "CPU"; + + LM_GGML_UNUSED(buffer); +} + static void * lm_ggml_backend_cpu_buffer_get_base(lm_ggml_backend_buffer_t buffer) { return (void *)buffer->context; } static void lm_ggml_backend_cpu_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) { free(buffer->context); - LM_GGML_UNUSED(buffer); } static void lm_ggml_backend_cpu_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); - LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - memcpy((char *)tensor->data + offset, data, size); LM_GGML_UNUSED(buffer); } static void lm_ggml_backend_cpu_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { - LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); - LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - memcpy(data, (const char *)tensor->data + offset, size); LM_GGML_UNUSED(buffer); } -static void lm_ggml_backend_cpu_buffer_cpy_tensor_from(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { - lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src)); +static bool lm_ggml_backend_cpu_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { + if (lm_ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, lm_ggml_nbytes(src)); + return true; + } + return false; LM_GGML_UNUSED(buffer); } -static void lm_ggml_backend_cpu_buffer_cpy_tensor_to(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { - lm_ggml_backend_tensor_set(dst, src->data, 0, lm_ggml_nbytes(src)); - - LM_GGML_UNUSED(buffer); +static void lm_ggml_backend_cpu_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) { + memset(buffer->context, value, buffer->size); } static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i = { + /* .get_name = */ lm_ggml_backend_cpu_buffer_name, /* .free_buffer = */ lm_ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor, /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor, - /* .cpy_tensor_from = */ lm_ggml_backend_cpu_buffer_cpy_tensor_from, - /* .cpy_tensor_to = */ lm_ggml_backend_cpu_buffer_cpy_tensor_to, + /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor, + /* .clear = */ lm_ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, }; // for buffers from ptr, free is not called static struct lm_ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { + /* .get_name = */ lm_ggml_backend_cpu_buffer_name, /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ lm_ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required /* .set_tensor = */ lm_ggml_backend_cpu_buffer_set_tensor, /* .get_tensor = */ lm_ggml_backend_cpu_buffer_get_tensor, - /* .cpy_tensor_from = */ lm_ggml_backend_cpu_buffer_cpy_tensor_from, - /* .cpy_tensor_to = */ lm_ggml_backend_cpu_buffer_cpy_tensor_to, + /* .cpy_tensor = */ lm_ggml_backend_cpu_buffer_cpy_tensor, + /* .clear = */ lm_ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, }; static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 +static const char * lm_ggml_backend_cpu_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) { + return "CPU"; + + LM_GGML_UNUSED(buft); +} + static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned void * data = malloc(size); // TODO: maybe use LM_GGML_ALIGNED_MALLOC? @@ -455,19 +533,83 @@ static bool lm_ggml_backend_cpu_buffer_type_supports_backend(lm_ggml_backend_buf LM_GGML_UNUSED(buft); } +static bool lm_ggml_backend_cpu_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) { + return true; + + LM_GGML_UNUSED(buft); +} + lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void) { - static struct lm_ggml_backend_buffer_type lm_ggml_backend_buffer_type_cpu = { + static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type = { /* .iface = */ { + /* .get_name = */ lm_ggml_backend_cpu_buffer_type_get_name, /* .alloc_buffer = */ lm_ggml_backend_cpu_buffer_type_alloc_buffer, /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes /* .supports_backend = */ lm_ggml_backend_cpu_buffer_type_supports_backend, + /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host, }, /* .context = */ NULL, }; - return &lm_ggml_backend_buffer_type_cpu; + return &lm_ggml_backend_cpu_buffer_type; +} + +#ifdef LM_GGML_USE_CPU_HBM + +// buffer type HBM + +#include + +static const char * lm_ggml_backend_cpu_hbm_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) { + return "CPU_HBM"; + + LM_GGML_UNUSED(buft); +} + +static const char * lm_ggml_backend_cpu_hbm_buffer_get_name(lm_ggml_backend_buffer_t buf) { + return "CPU_HBM"; + + LM_GGML_UNUSED(buf); +} + +static void lm_ggml_backend_cpu_hbm_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) { + hbw_free(buffer->context); +} + +static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) { + //void * ptr = hbw_malloc(size); + void * ptr; + int result = hbw_posix_memalign(&ptr, lm_ggml_backend_cpu_buffer_type_get_alignment(buft), size); + if (result != 0) { + fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size); + return NULL; + } + + lm_ggml_backend_buffer_t buffer = lm_ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.get_name = lm_ggml_backend_cpu_hbm_buffer_get_name; + buffer->iface.free_buffer = lm_ggml_backend_cpu_hbm_buffer_free_buffer; + + return buffer; +} + +lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) { + static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_hbm = { + /* .iface = */ { + /* .get_name = */ lm_ggml_backend_cpu_hbm_buffer_type_get_name, + /* .alloc_buffer = */ lm_ggml_backend_cpu_hbm_buffer_type_alloc_buffer, + /* .get_alignment = */ lm_ggml_backend_cpu_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes + /* .supports_backend = */ lm_ggml_backend_cpu_buffer_type_supports_backend, + /* .is_host = */ lm_ggml_backend_cpu_buffer_type_is_host, + }, + /* .context = */ NULL, + }; + + return &lm_ggml_backend_cpu_buffer_type_hbm; } +#endif struct lm_ggml_backend_cpu_context { int n_threads; @@ -499,13 +641,13 @@ struct lm_ggml_backend_plan_cpu { struct lm_ggml_cgraph cgraph; }; -static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) { +static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_create(lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph) { struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context; struct lm_ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct lm_ggml_backend_plan_cpu)); cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads); - cpu_plan->cgraph = *cgraph; + cpu_plan->cgraph = *cgraph; // FIXME: deep copy if (cpu_plan->cplan.work_size > 0) { cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); @@ -531,7 +673,7 @@ static void lm_ggml_backend_cpu_graph_plan_compute(lm_ggml_backend_t backend, lm LM_GGML_UNUSED(backend); } -static void lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) { +static bool lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) { struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context; struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads); @@ -545,13 +687,18 @@ static void lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct cplan.work_data = cpu_ctx->work_data; lm_ggml_graph_compute(cgraph, &cplan); + return true; } static bool lm_ggml_backend_cpu_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) { - return true; + switch (op->op) { + case LM_GGML_OP_MUL_MAT: + return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; + default: + return true; + } LM_GGML_UNUSED(backend); - LM_GGML_UNUSED(op); } static struct lm_ggml_backend_i cpu_backend_i = { @@ -560,8 +707,7 @@ static struct lm_ggml_backend_i cpu_backend_i = { /* .get_default_buffer_type = */ lm_ggml_backend_cpu_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, - /* .cpy_tensor_from_async = */ NULL, - /* .cpy_tensor_to_async = */ NULL, + /* .cpy_tensor_async = */ NULL, /* .synchronize = */ NULL, /* .graph_plan_create = */ lm_ggml_backend_cpu_graph_plan_create, /* .graph_plan_free = */ lm_ggml_backend_cpu_graph_plan_free, @@ -587,7 +733,7 @@ lm_ggml_backend_t lm_ggml_backend_cpu_init(void) { } bool lm_ggml_backend_is_cpu(lm_ggml_backend_t backend) { - return backend->iface.get_name == lm_ggml_backend_cpu_name; + return backend && backend->iface.get_name == lm_ggml_backend_cpu_name; } void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_threads) { @@ -611,7 +757,7 @@ static lm_ggml_backend_t lm_ggml_backend_reg_cpu_init(const char * params, void // scheduler -#define LM_GGML_MAX_BACKENDS 4 +#define LM_GGML_MAX_BACKENDS 16 #define LM_GGML_MAX_SPLITS 256 #define LM_GGML_MAX_SPLIT_INPUTS 16 @@ -621,21 +767,29 @@ struct lm_ggml_backend_sched_split { int i_end; struct lm_ggml_tensor * inputs[LM_GGML_MAX_SPLIT_INPUTS]; int n_inputs; + // graph view of this split struct lm_ggml_cgraph graph; }; struct lm_ggml_backend_sched { + bool is_reset; // true if the scheduler has been reset since the last graph split + int n_backends; lm_ggml_backend_t backends[LM_GGML_MAX_BACKENDS]; + lm_ggml_backend_buffer_type_t bufts[LM_GGML_MAX_BACKENDS]; lm_ggml_tallocr_t tallocs[LM_GGML_MAX_BACKENDS]; lm_ggml_gallocr_t galloc; + // hash keys of the nodes in the graph struct lm_ggml_hash_set hash_set; - lm_ggml_tallocr_t * node_talloc; // [hash_set.size] - struct lm_ggml_tensor * (* node_copies)[LM_GGML_MAX_BACKENDS]; // [hash_set.size][LM_GGML_MAX_BACKENDS] + // hash values (arrays of [hash_set.size]) + lm_ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend) + struct lm_ggml_tensor * (* node_copies)[LM_GGML_MAX_BACKENDS]; // copies of each node for each destination backend + // copy of the graph with modified inputs struct lm_ggml_cgraph * graph; + struct lm_ggml_backend_sched_split splits[LM_GGML_MAX_SPLITS]; int n_splits; @@ -676,14 +830,22 @@ static int sched_allocr_prio(lm_ggml_backend_sched_t sched, lm_ggml_tallocr_t al return INT_MAX; } -static lm_ggml_backend_t get_buffer_backend(lm_ggml_backend_sched_t sched, lm_ggml_backend_buffer_t buffer) { +static lm_ggml_tallocr_t sched_allocr_from_buffer(lm_ggml_backend_sched_t sched, lm_ggml_backend_buffer_t buffer) { if (buffer == NULL) { return NULL; } + + // check if this is already allocate in a allocr buffer (from user manual allocations) + for (int i = 0; i < sched->n_backends; i++) { + if (lm_ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) { + return sched->tallocs[i]; + } + } + // find highest prio backend that supports the buffer type for (int i = 0; i < sched->n_backends; i++) { if (lm_ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) { - return sched->backends[i]; + return sched->tallocs[i]; } } LM_GGML_ASSERT(false && "tensor buffer type not supported by any backend"); @@ -693,7 +855,6 @@ static lm_ggml_backend_t get_allocr_backend(lm_ggml_backend_sched_t sched, lm_gg if (allocr == NULL) { return NULL; } - // find highest prio backend that supports the buffer type for (int i = 0; i < sched->n_backends; i++) { if (sched->tallocs[i] == allocr) { return sched->backends[i]; @@ -703,7 +864,7 @@ static lm_ggml_backend_t get_allocr_backend(lm_ggml_backend_sched_t sched, lm_gg } #if 0 -static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*8 + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS][128]; // debug, remove +static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS][128]; // debug only #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) #define GET_CAUSE(node) causes[hash_id(node)] #else @@ -712,45 +873,37 @@ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*8 + LM_GGML_MAX_SPLITS*LM_GGML_MAX #endif // returns the backend that should be used for the node based on the current locations -static lm_ggml_backend_t sched_backend_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) { - // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there - // ie. kv cache updates - // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend. +static lm_ggml_tallocr_t sched_allocr_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) { + // assign pre-allocated nodes to their backend // dst - lm_ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer); - if (cur_backend != NULL) { + lm_ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer); + if (cur_allocr != NULL) { SET_CAUSE(node, "1.dst"); - return cur_backend; + return cur_allocr; } - // view_src - if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) { - SET_CAUSE(node, "1.vsrc"); - return get_buffer_backend(sched, node->view_src->buffer); + if (node->view_src != NULL) { + cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer); + if (cur_allocr != NULL) { + SET_CAUSE(node, "1.vsrc"); + return cur_allocr; + } } - - // src - int cur_prio = INT_MAX; - size_t cur_size = 0; - + // assign nodes that use weights to the backend of the weights for (int i = 0; i < LM_GGML_MAX_SRC; i++) { const struct lm_ggml_tensor * src = node->src[i]; if (src == NULL) { break; } - lm_ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer); - if (src_backend != NULL) { - int src_prio = sched_backend_prio(sched, src_backend); - size_t src_size = lm_ggml_nbytes(src); - if (src_prio < cur_prio && src_size >= cur_size) { - cur_prio = src_prio; - cur_size = src_size; - cur_backend = src_backend; - SET_CAUSE(node, "1.src%d", i); - } + if (src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + lm_ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer); + // operations with weights are always run on the same backend as the weights + SET_CAUSE(node, "1.wgt%d", i); + return src_allocr; } } - return cur_backend; + + return NULL; } static char * fmt_size(size_t size) { @@ -783,7 +936,7 @@ static void sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggm } lm_ggml_tallocr_t node_allocr = node_allocr(node); lm_ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME: - fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, lm_ggml_op_name(node->op), node->name, + fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, lm_ggml_op_name(node->op), node->name, fmt_size(lm_ggml_nbytes(node)), node_allocr ? lm_ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node)); for (int j = 0; j < LM_GGML_MAX_SRC; j++) { struct lm_ggml_tensor * src = node->src[j]; @@ -792,7 +945,7 @@ static void sched_print_assignments(lm_ggml_backend_sched_t sched, struct lm_ggm } lm_ggml_tallocr_t src_allocr = node_allocr(src); lm_ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL; - fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, + fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fmt_size(lm_ggml_nbytes(src)), src_backend ? lm_ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } fprintf(stderr, "\n"); @@ -808,15 +961,17 @@ static struct lm_ggml_tensor * lm_ggml_dup_tensor_layout(struct lm_ggml_context return dup; } + +//#define DEBUG_PASS1 +//#define DEBUG_PASS2 +//#define DEBUG_PASS3 +//#define DEBUG_PASS4 + // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend -// TODO: merge passes static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { - // reset state - size_t hash_size = sched->hash_set.size; - memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); - memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); - memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); + // reset splits sched->n_splits = 0; + sched->is_reset = false; struct lm_ggml_init_params params = { /* .mem_size = */ sizeof(sched->context_buffer), @@ -824,26 +979,22 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra /* .no_alloc = */ true }; - if (sched->ctx != NULL) { - lm_ggml_free(sched->ctx); - } + lm_ggml_free(sched->ctx); sched->ctx = lm_ggml_init(params); + if (sched->ctx == NULL) { + fprintf(stderr, "%s: failed to initialize context\n", __func__); + LM_GGML_ASSERT(false); + } - // pass 1: assign backends to ops with allocated inputs + // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct lm_ggml_tensor * leaf = graph->leafs[i]; if (node_allocr(leaf) != NULL) { // do not overwrite user assignments continue; } - lm_ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer); - if (leaf_backend == NULL && leaf->view_src != NULL) { - leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer); - } - if (leaf_backend != NULL) { - node_allocr(leaf) = lm_ggml_backend_sched_get_tallocr(sched, leaf_backend); - } + node_allocr(leaf) = sched_allocr_from_cur(sched, leaf); } for (int i = 0; i < graph->n_nodes; i++) { @@ -852,50 +1003,120 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra // do not overwrite user assignments continue; } - lm_ggml_backend_t node_backend = sched_backend_from_cur(sched, node); - if (node_backend != NULL) { - node_allocr(node) = lm_ggml_backend_sched_get_tallocr(sched, node_backend); + node_allocr(node) = sched_allocr_from_cur(sched, node); + // src + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + if (node_allocr(src) == NULL) { + node_allocr(src) = sched_allocr_from_cur(sched, src); + } } } - //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#ifdef DEBUG_PASS1 + fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif - // pass 2: assign backends to ops from current assignments - // TODO: - // - reuse sched_backend_from_cur - for (int i = 0; i < graph->n_nodes; i++) { - struct lm_ggml_tensor * node = graph->nodes[i]; - lm_ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr == NULL) { - int cur_prio = INT_MAX; - size_t cur_size = 0; - for (int j = 0; j < LM_GGML_MAX_SRC; j++) { - struct lm_ggml_tensor * src = node->src[j]; - if (src == NULL) { - break; + // pass 2: expand current backend assignments + // assign the same backend to adjacent nodes + // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend) + // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops + + // pass 2.1 expand gpu up + { + lm_ggml_tallocr_t cur_allocr = NULL; + for (int i = graph->n_nodes - 1; i >= 0; i--) { + struct lm_ggml_tensor * node = graph->nodes[i]; + if (lm_ggml_is_view_op(node->op)) { + continue; + } + lm_ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr != NULL) { + if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + // skip cpu (lowest prio backend) + cur_allocr = NULL; + } else { + cur_allocr = node_allocr; } - lm_ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr != NULL) { - int src_prio = sched_allocr_prio(sched, src_allocr); - size_t src_size = lm_ggml_nbytes(src); - if (src_prio < cur_prio && src_size >= cur_size) { - cur_prio = src_prio; - cur_size = src_size; - node_allocr = src_allocr; - SET_CAUSE(node, "2.src%d", j); - } + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.1"); + } + } + } + + // pass 2.2 expand gpu down + { + lm_ggml_tallocr_t cur_allocr = NULL; + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + if (lm_ggml_is_view_op(node->op)) { + continue; + } + lm_ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr != NULL) { + if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + // skip cpu (lowest prio backend) + cur_allocr = NULL; + } else { + cur_allocr = node_allocr; } + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.2"); + } + } + } + + // pass 2.3 expand rest up + { + lm_ggml_tallocr_t cur_allocr = NULL; + for (int i = graph->n_nodes - 1; i >= 0; i--) { + struct lm_ggml_tensor * node = graph->nodes[i]; + if (lm_ggml_is_view_op(node->op)) { + continue; + } + lm_ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr != NULL) { + cur_allocr = node_allocr; + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.3"); + } + } + } + + // pass 2.4 expand rest down + { + lm_ggml_tallocr_t cur_allocr = NULL; + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + if (lm_ggml_is_view_op(node->op)) { + continue; } + lm_ggml_tallocr_t node_allocr = node_allocr(node); if (node_allocr != NULL) { - node_allocr(node) = node_allocr; + cur_allocr = node_allocr; + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.4"); } } } - //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#ifdef DEBUG_PASS2 + fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif - // pass 3: assign backends to remaining src from dst (should only be leafs) + // pass 3: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct lm_ggml_tensor * node = graph->nodes[i]; - lm_ggml_tallocr_t node_allocr = node_allocr(node); + lm_ggml_tallocr_t cur_allocr = node_allocr(node); + if (node->view_src != NULL && cur_allocr == NULL) { + cur_allocr = node_allocr(node) = node_allocr(node->view_src); + SET_CAUSE(node, "3.vsrc"); + } for (int j = 0; j < LM_GGML_MAX_SRC; j++) { struct lm_ggml_tensor * src = node->src[j]; if (src == NULL) { @@ -903,81 +1124,107 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra } lm_ggml_tallocr_t src_allocr = node_allocr(src); if (src_allocr == NULL) { - node_allocr(src) = node_allocr; + if (src->view_src != NULL) { + // views are always on the same backend as the source + node_allocr(src) = node_allocr(src->view_src); + SET_CAUSE(src, "3.vsrc"); + } else { + node_allocr(src) = cur_allocr; + SET_CAUSE(src, "3.cur"); + } } } } - //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#ifdef DEBUG_PASS3 + fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif // pass 4: split graph, find tensors that need to be copied - // TODO: - // - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost - // find first backend - int cur_split = 0; - for (int i = 0; i < graph->n_nodes; i++) { - struct lm_ggml_tensor * node = graph->nodes[i]; - if (node->view_src == NULL) { - sched->splits[0].tallocr = node_allocr(node); - break; + { + int cur_split = 0; + // find the backend of the first split, skipping view ops + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + if (!lm_ggml_is_view_op(node->op)) { + sched->splits[0].tallocr = node_allocr(node); + break; + } } - } - sched->splits[0].i_start = 0; - sched->splits[0].n_inputs = 0; - memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK - lm_ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; - size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); - for (int i = 0; i < graph->n_nodes; i++) { - struct lm_ggml_tensor * node = graph->nodes[i]; + sched->splits[0].i_start = 0; + sched->splits[0].n_inputs = 0; + memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK + lm_ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; + size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); + for (int i = 0; i < graph->n_nodes; i++) { + struct lm_ggml_tensor * node = graph->nodes[i]; + + if (lm_ggml_is_view_op(node->op)) { + continue; + } - if (lm_ggml_is_view_op(node->op)) { - continue; - } + lm_ggml_tallocr_t node_allocr = node_allocr(node); - lm_ggml_tallocr_t node_allocr = node_allocr(node); + LM_GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now - if (node_allocr != cur_allocr) { - sched->splits[cur_split].i_end = i; - cur_split++; - LM_GGML_ASSERT(cur_split < LM_GGML_MAX_SPLITS); - sched->splits[cur_split].tallocr = node_allocr; - sched->splits[cur_split].i_start = i; - sched->splits[cur_split].n_inputs = 0; - memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK - cur_allocr = node_allocr; - cur_backend_id = sched_allocr_prio(sched, cur_allocr); - } - - // find inputs that are not on the same backend - for (int j = 0; j < LM_GGML_MAX_SRC; j++) { - struct lm_ggml_tensor * src = node->src[j]; - if (src == NULL) { - break; + if (node_allocr != cur_allocr) { + sched->splits[cur_split].i_end = i; + cur_split++; + LM_GGML_ASSERT(cur_split < LM_GGML_MAX_SPLITS); + sched->splits[cur_split].tallocr = node_allocr; + sched->splits[cur_split].i_start = i; + sched->splits[cur_split].n_inputs = 0; + cur_allocr = node_allocr; + cur_backend_id = sched_allocr_prio(sched, cur_allocr); } - lm_ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr != node_allocr) { - int n_inputs = sched->splits[cur_split].n_inputs++; - LM_GGML_ASSERT(n_inputs < LM_GGML_MAX_SPLIT_INPUTS); - sched->splits[cur_split].inputs[n_inputs] = (struct lm_ggml_tensor *)src; - - // create copies - size_t id = hash_id(src); - if (sched->node_copies[id][cur_backend_id] == NULL) { - struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src); - sched->node_copies[id][cur_backend_id] = tensor_copy; - node_allocr(tensor_copy) = cur_allocr; - lm_ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); - lm_ggml_format_name(tensor_copy, "%s#%s", lm_ggml_backend_name(backend), src->name); + + // find inputs that are not on the same backend + for (int j = 0; j < LM_GGML_MAX_SRC; j++) { + struct lm_ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + lm_ggml_tallocr_t src_allocr = node_allocr(src); + LM_GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now + if (src_allocr != node_allocr) { + // check if the input is already in the split + bool found = false; + for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) { + if (sched->splits[cur_split].inputs[k] == src) { + found = true; + break; + } + } + + if (!found) { + int n_inputs = sched->splits[cur_split].n_inputs++; + //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, lm_ggml_backend_name(get_allocr_backend(sched, src_allocr))); + LM_GGML_ASSERT(n_inputs < LM_GGML_MAX_SPLIT_INPUTS); + sched->splits[cur_split].inputs[n_inputs] = src; + } + + // create a copy of the input in the split's backend + size_t id = hash_id(src); + if (sched->node_copies[id][cur_backend_id] == NULL) { + lm_ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); + struct lm_ggml_tensor * tensor_copy = lm_ggml_dup_tensor_layout(sched->ctx, src); + lm_ggml_format_name(tensor_copy, "%s#%s", lm_ggml_backend_name(backend), src->name); + + sched->node_copies[id][cur_backend_id] = tensor_copy; + node_allocr(tensor_copy) = cur_allocr; + SET_CAUSE(tensor_copy, "4.cpy"); + } + node->src[j] = sched->node_copies[id][cur_backend_id]; } - node->src[j] = sched->node_copies[id][cur_backend_id]; } } + sched->splits[cur_split].i_end = graph->n_nodes; + sched->n_splits = cur_split + 1; } - sched->splits[cur_split].i_end = graph->n_nodes; - sched->n_splits = cur_split + 1; - - //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout); +#ifdef DEBUG_PASS4 + fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif -#if 1 +#ifndef NDEBUG // sanity check: all sources should have the same backend as the node for (int i = 0; i < graph->n_nodes; i++) { struct lm_ggml_tensor * node = graph->nodes[i]; @@ -985,6 +1232,11 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra if (node_allocr == NULL) { fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); } + if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) { + fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n", + node->name, node_allocr ? lm_ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", + node->view_src->name, node_allocr(node->view_src) ? lm_ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL"); + } for (int j = 0; j < LM_GGML_MAX_SRC; j++) { struct lm_ggml_tensor * src = node->src[j]; if (src == NULL) { @@ -996,8 +1248,14 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra node->name, node_allocr ? lm_ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", j, src->name, src_allocr ? lm_ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL"); } + if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) { + fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n", + src->name, src_allocr ? lm_ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL", + src->view_src->name, node_allocr(src->view_src) ? lm_ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL"); + } } } + fflush(stderr); #endif // create copies of the graph for each split @@ -1011,6 +1269,8 @@ static void sched_split_graph(lm_ggml_backend_sched_t sched, struct lm_ggml_cgra for (int j = 0; j < split->n_inputs; j++) { struct lm_ggml_tensor * input = split->inputs[j]; struct lm_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)]; + // add a dependency to the input source so that it is not freed before the copy is done + LM_GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input); input_cpy->src[0] = input; graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; } @@ -1045,24 +1305,16 @@ static void sched_compute_splits(lm_ggml_backend_sched_t sched) { uint64_t copy_start_us = lm_ggml_time_us(); for (int j = 0; j < split->n_inputs; j++) { struct lm_ggml_tensor * input = split->inputs[j]; - struct lm_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)]; - if (input->buffer == NULL) { - if (input->view_src == NULL) { - fprintf(stderr, "input %s has no buffer and no view_src\n", input->name); - exit(1); - } - // FIXME: may need to use the sched buffer instead - lm_ggml_backend_view_init(input->view_src->buffer, input); - } - if (input_cpy->buffer == NULL) { - fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name); - exit(1); - } - //LM_GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend); - //LM_GGML_ASSERT(input_cpy->buffer->backend == split_backend); - lm_ggml_backend_tensor_copy(input, input_cpy); + struct lm_ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id]; + + LM_GGML_ASSERT(input->buffer != NULL); + LM_GGML_ASSERT(input_cpy->buffer != NULL); + + // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change + // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times + lm_ggml_backend_tensor_copy_async(split_backend, input, input_cpy); } - // lm_ggml_backend_synchronize(split_backend); + //lm_ggml_backend_synchronize(split_backend); // necessary to measure copy time int64_t copy_end_us = lm_ggml_time_us(); copy_us[split_backend_id] += copy_end_us - copy_start_us; @@ -1074,7 +1326,7 @@ static void sched_compute_splits(lm_ggml_backend_sched_t sched) { uint64_t compute_start_us = lm_ggml_time_us(); lm_ggml_backend_graph_compute(split_backend, &split->graph); - // lm_ggml_backend_synchronize(split_backend); + //lm_ggml_backend_synchronize(split_backend); // necessary to measure compute time uint64_t compute_end_us = lm_ggml_time_us(); compute_us[split_backend_id] += compute_end_us - compute_start_us; } @@ -1094,26 +1346,41 @@ static void sched_reset(lm_ggml_backend_sched_t sched) { for (int i = 0; i < sched->n_backends; i++) { lm_ggml_tallocr_reset(sched->tallocs[i]); } + // reset state for the next run + size_t hash_size = sched->hash_set.size; + memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); + memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); + memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); + + sched->is_reset = true; } -lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, int n_backends) { +lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, lm_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) { + LM_GGML_ASSERT(n_backends > 0); LM_GGML_ASSERT(n_backends <= LM_GGML_MAX_BACKENDS); - struct lm_ggml_backend_sched * sched = malloc(sizeof(struct lm_ggml_backend_sched)); - memset(sched, 0, sizeof(struct lm_ggml_backend_sched)); + struct lm_ggml_backend_sched * sched = calloc(sizeof(struct lm_ggml_backend_sched), 1); + + // initialize hash table + sched->hash_set = lm_ggml_hash_set_new(graph_size + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS); + sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1); + sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1); sched->n_backends = n_backends; for (int i = 0; i < n_backends; i++) { sched->backends[i] = backends[i]; + sched->bufts[i] = bufts ? bufts[i] : lm_ggml_backend_get_default_buffer_type(backends[i]); } sched->galloc = lm_ggml_gallocr_new(); // init measure allocs for each backend for (int i = 0; i < n_backends; i++) { - sched->tallocs[i] = lm_ggml_tallocr_new_measure_from_backend(backends[i]); + sched->tallocs[i] = lm_ggml_tallocr_new_measure_from_buft(sched->bufts[i]); } + sched_reset(sched); + return sched; } @@ -1125,6 +1392,7 @@ void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) { lm_ggml_tallocr_free(sched->tallocs[i]); } lm_ggml_gallocr_free(sched->galloc); + lm_ggml_free(sched->ctx); free(sched->hash_set.keys); free(sched->node_talloc); free(sched->node_copies); @@ -1132,12 +1400,7 @@ void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched) { } void lm_ggml_backend_sched_init_measure(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph) { - // initialize hash tables - size_t hash_size = measure_graph->visited_hash_table.size + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS; - sched->hash_set.size = hash_size; - sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size); - sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size); - sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size); + LM_GGML_ASSERT(lm_ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once sched_split_graph(sched, measure_graph); sched_alloc_splits(sched); @@ -1146,28 +1409,41 @@ void lm_ggml_backend_sched_init_measure(lm_ggml_backend_sched_t sched, struct lm for (int i = 0; i < sched->n_backends; i++) { size_t size = lm_ggml_tallocr_max_size(sched->tallocs[i]); lm_ggml_tallocr_free(sched->tallocs[i]); - sched->tallocs[i] = lm_ggml_tallocr_new_from_backend(sched->backends[i], size); + sched->tallocs[i] = lm_ggml_tallocr_new_from_buft(sched->bufts[i], size); } sched_reset(sched); } void lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph) { - LM_GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS); + LM_GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + LM_GGML_MAX_SPLITS*LM_GGML_MAX_SPLIT_INPUTS); + + if (!sched->is_reset) { + sched_reset(sched); + } sched_split_graph(sched, graph); sched_alloc_splits(sched); sched_compute_splits(sched); +} + +void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched) { sched_reset(sched); } +int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched) { + return sched->n_splits; +} + lm_ggml_tallocr_t lm_ggml_backend_sched_get_tallocr(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) { int backend_index = sched_backend_prio(sched, backend); + LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); return sched->tallocs[backend_index]; } lm_ggml_backend_buffer_t lm_ggml_backend_sched_get_buffer(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend) { int backend_index = sched_backend_prio(sched, backend); + LM_GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); return lm_ggml_tallocr_get_buffer(sched->tallocs[backend_index]); } @@ -1177,10 +1453,19 @@ void lm_ggml_backend_sched_set_node_backend(lm_ggml_backend_sched_t sched, struc node_allocr(node) = sched->tallocs[backend_index]; } +lm_ggml_backend_t lm_ggml_backend_sched_get_node_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node) { + lm_ggml_tallocr_t allocr = node_allocr(node); + if (allocr == NULL) { + return NULL; + } + return get_allocr_backend(sched, allocr); +} + // utils + void lm_ggml_backend_view_init(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) { LM_GGML_ASSERT(tensor->buffer == NULL); - LM_GGML_ASSERT(tensor->data == NULL); + //LM_GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in lm_ggml_new_tensor, but still need to be initialized by the backend LM_GGML_ASSERT(tensor->view_src != NULL); LM_GGML_ASSERT(tensor->view_src->buffer != NULL); LM_GGML_ASSERT(tensor->view_src->data != NULL); @@ -1246,6 +1531,7 @@ static void graph_init_tensor(struct lm_ggml_hash_set hash_set, struct lm_ggml_t struct lm_ggml_tensor * dst = node_copies[id]; if (dst->view_src != NULL) { + graph_init_tensor(hash_set, node_copies, node_init, src->view_src); lm_ggml_backend_view_init(dst->view_src->buffer, dst); } else { @@ -1279,6 +1565,21 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b struct lm_ggml_context * ctx_allocated = lm_ggml_init(params); struct lm_ggml_context * ctx_unallocated = lm_ggml_init(params); + if (ctx_allocated == NULL || ctx_unallocated == NULL) { + fprintf(stderr, "failed to allocate context for graph copy\n"); + free(hash_set.keys); + free(node_copies); + free(node_init); + lm_ggml_free(ctx_allocated); + lm_ggml_free(ctx_unallocated); + return (struct lm_ggml_backend_graph_copy) { + /* .buffer = */ NULL, + /* .ctx_allocated = */ NULL, + /* .ctx_unallocated = */ NULL, + /* .graph = */ NULL, + }; + } + // dup nodes for (int i = 0; i < graph->n_nodes; i++) { struct lm_ggml_tensor * node = graph->nodes[i]; @@ -1287,6 +1588,20 @@ struct lm_ggml_backend_graph_copy lm_ggml_backend_graph_copy(lm_ggml_backend_t b // allocate nodes lm_ggml_backend_buffer_t buffer = lm_ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); + if (buffer == NULL) { + fprintf(stderr, "failed to allocate buffer for graph copy\n"); + free(hash_set.keys); + free(node_copies); + free(node_init); + lm_ggml_free(ctx_allocated); + lm_ggml_free(ctx_unallocated); + return (struct lm_ggml_backend_graph_copy) { + /* .buffer = */ NULL, + /* .ctx_allocated = */ NULL, + /* .ctx_unallocated = */ NULL, + /* .graph = */ NULL, + }; + } //printf("copy buffer size: %zu MB\n", lm_ggml_backend_buffer_get_size(buffer) / 1024 / 1024); @@ -1323,8 +1638,12 @@ void lm_ggml_backend_graph_copy_free(struct lm_ggml_backend_graph_copy copy) { lm_ggml_free(copy.ctx_unallocated); } -void lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) { +bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data) { struct lm_ggml_backend_graph_copy copy = lm_ggml_backend_graph_copy(backend2, graph); + if (copy.buffer == NULL) { + return false; + } + struct lm_ggml_cgraph * g1 = graph; struct lm_ggml_cgraph * g2 = copy.graph; @@ -1354,4 +1673,6 @@ void lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_b } lm_ggml_backend_graph_copy_free(copy); + + return true; } diff --git a/cpp/ggml-backend.h b/cpp/ggml-backend.h index 63fa013e..d6910fb9 100644 --- a/cpp/ggml-backend.h +++ b/cpp/ggml-backend.h @@ -17,19 +17,31 @@ extern "C" { // // buffer type - LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size); - LM_GGML_API size_t lm_ggml_backend_buft_get_alignment (lm_ggml_backend_buffer_type_t buft); - LM_GGML_API size_t lm_ggml_backend_buft_get_alloc_size(lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor); - LM_GGML_API bool lm_ggml_backend_buft_supports_backend(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend); + LM_GGML_API const char * lm_ggml_backend_buft_name (lm_ggml_backend_buffer_type_t buft); + LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_buft_alloc_buffer (lm_ggml_backend_buffer_type_t buft, size_t size); + LM_GGML_API size_t lm_ggml_backend_buft_get_alignment (lm_ggml_backend_buffer_type_t buft); + LM_GGML_API size_t lm_ggml_backend_buft_get_alloc_size (lm_ggml_backend_buffer_type_t buft, struct lm_ggml_tensor * tensor); + LM_GGML_API bool lm_ggml_backend_buft_supports_backend(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend); + LM_GGML_API bool lm_ggml_backend_buft_is_host (lm_ggml_backend_buffer_type_t buft); // buffer - LM_GGML_API void lm_ggml_backend_buffer_free (lm_ggml_backend_buffer_t buffer); - LM_GGML_API void * lm_ggml_backend_buffer_get_base (lm_ggml_backend_buffer_t buffer); - LM_GGML_API size_t lm_ggml_backend_buffer_get_size (lm_ggml_backend_buffer_t buffer); - LM_GGML_API void lm_ggml_backend_buffer_init_tensor (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); - LM_GGML_API size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer); - LM_GGML_API size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); - LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_buffer_type(lm_ggml_backend_buffer_t buffer); + enum lm_ggml_backend_buffer_usage { + LM_GGML_BACKEND_BUFFER_USAGE_ANY = 0, + LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, + }; + + LM_GGML_API const char * lm_ggml_backend_buffer_name (lm_ggml_backend_buffer_t buffer); + LM_GGML_API void lm_ggml_backend_buffer_free (lm_ggml_backend_buffer_t buffer); + LM_GGML_API void * lm_ggml_backend_buffer_get_base (lm_ggml_backend_buffer_t buffer); + LM_GGML_API size_t lm_ggml_backend_buffer_get_size (lm_ggml_backend_buffer_t buffer); + LM_GGML_API void lm_ggml_backend_buffer_init_tensor (lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); + LM_GGML_API size_t lm_ggml_backend_buffer_get_alignment (lm_ggml_backend_buffer_t buffer); + LM_GGML_API size_t lm_ggml_backend_buffer_get_alloc_size(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor); + LM_GGML_API void lm_ggml_backend_buffer_clear (lm_ggml_backend_buffer_t buffer, uint8_t value); + LM_GGML_API bool lm_ggml_backend_buffer_is_host (lm_ggml_backend_buffer_t buffer); + LM_GGML_API void lm_ggml_backend_buffer_set_usage (lm_ggml_backend_buffer_t buffer, enum lm_ggml_backend_buffer_usage usage); + LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_buffer_get_type (lm_ggml_backend_buffer_t buffer); + LM_GGML_API void lm_ggml_backend_buffer_reset (lm_ggml_backend_buffer_t buffer); // // Backend @@ -55,7 +67,7 @@ extern "C" { LM_GGML_API void lm_ggml_backend_graph_plan_free (lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan); LM_GGML_API void lm_ggml_backend_graph_plan_compute(lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan); - LM_GGML_API void lm_ggml_backend_graph_compute (lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph); + LM_GGML_API bool lm_ggml_backend_graph_compute (lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph); LM_GGML_API bool lm_ggml_backend_supports_op (lm_ggml_backend_t backend, const struct lm_ggml_tensor * op); // tensor copy between different backends @@ -76,6 +88,10 @@ extern "C" { LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_buffer_type(void); +#ifdef LM_GGML_USE_CPU_HBM + LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void); +#endif + // // Backend registry // @@ -133,23 +149,24 @@ extern "C" { typedef struct lm_ggml_backend_sched * lm_ggml_backend_sched_t; // Initialize a backend scheduler - LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, int n_backends); - - LM_GGML_API void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched); - + LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, lm_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); + LM_GGML_API void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph - LM_GGML_API void lm_ggml_backend_sched_init_measure(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph); + LM_GGML_API void lm_ggml_backend_sched_init_measure(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * measure_graph); + // Get the number of splits of the last graph + LM_GGML_API int lm_ggml_backend_sched_get_n_splits(lm_ggml_backend_sched_t sched); LM_GGML_API lm_ggml_tallocr_t lm_ggml_backend_sched_get_tallocr(lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend); LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_sched_get_buffer (lm_ggml_backend_sched_t sched, lm_ggml_backend_t backend); - LM_GGML_API void lm_ggml_backend_sched_set_node_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend); + LM_GGML_API void lm_ggml_backend_sched_set_node_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node, lm_ggml_backend_t backend); + LM_GGML_API lm_ggml_backend_t lm_ggml_backend_sched_get_node_backend(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * node); - // Allocate a graph on the backend scheduler - LM_GGML_API void lm_ggml_backend_sched_graph_compute( - lm_ggml_backend_sched_t sched, - struct lm_ggml_cgraph * graph); + // Allocate and compute graph on the backend scheduler + LM_GGML_API void lm_ggml_backend_sched_graph_compute(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph); + // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs + LM_GGML_API void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched); // // Utils @@ -169,7 +186,7 @@ extern "C" { typedef bool (*lm_ggml_backend_eval_callback)(int node_index, struct lm_ggml_tensor * t1, struct lm_ggml_tensor * t2, void * user_data); // Compare the output of two backends - LM_GGML_API void lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data); + LM_GGML_API bool lm_ggml_backend_compare_graph_backend(lm_ggml_backend_t backend1, lm_ggml_backend_t backend2, struct lm_ggml_cgraph * graph, lm_ggml_backend_eval_callback callback, void * user_data); // Tensor initialization LM_GGML_API void lm_ggml_backend_tensor_alloc(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, void * addr); diff --git a/cpp/ggml-impl.h b/cpp/ggml-impl.h index 998c0ba6..38380f5c 100644 --- a/cpp/ggml-impl.h +++ b/cpp/ggml-impl.h @@ -5,6 +5,7 @@ // GGML internal header #include +#include // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ #include #include #include // memcpy @@ -227,6 +228,8 @@ inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) { #define LM_GGML_HASHTABLE_FULL ((size_t)-1) #define LM_GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2) +struct lm_ggml_hash_set lm_ggml_hash_set_new(size_t size); + bool lm_ggml_hash_contains (const struct lm_ggml_hash_set hash_set, struct lm_ggml_tensor * key); // returns LM_GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted diff --git a/cpp/ggml-metal-llama.metal b/cpp/ggml-metal-llama.metal index d5b54e11..029578dc 100644 --- a/cpp/ggml-metal-llama.metal +++ b/cpp/ggml-metal-llama.metal @@ -59,26 +59,26 @@ kernel void kernel_add( constant int64_t & ne01, constant int64_t & ne02, constant int64_t & ne03, - constant int64_t & nb00, - constant int64_t & nb01, - constant int64_t & nb02, - constant int64_t & nb03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant int64_t & ne13, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, - constant int64_t & nb13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant int64_t & ne2, constant int64_t & ne3, - constant int64_t & nb0, - constant int64_t & nb1, - constant int64_t & nb2, - constant int64_t & nb3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, constant int64_t & offs, uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], @@ -109,26 +109,26 @@ kernel void kernel_mul( constant int64_t & ne01, constant int64_t & ne02, constant int64_t & ne03, - constant int64_t & nb00, - constant int64_t & nb01, - constant int64_t & nb02, - constant int64_t & nb03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant int64_t & ne13, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, - constant int64_t & nb13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant int64_t & ne2, constant int64_t & ne3, - constant int64_t & nb0, - constant int64_t & nb1, - constant int64_t & nb2, - constant int64_t & nb3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { @@ -158,26 +158,26 @@ kernel void kernel_div( constant int64_t & ne01, constant int64_t & ne02, constant int64_t & ne03, - constant int64_t & nb00, - constant int64_t & nb01, - constant int64_t & nb02, - constant int64_t & nb03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant int64_t & ne13, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, - constant int64_t & nb13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant int64_t & ne2, constant int64_t & ne3, - constant int64_t & nb0, - constant int64_t & nb1, - constant int64_t & nb2, - constant int64_t & nb3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { @@ -205,7 +205,7 @@ kernel void kernel_add_row( device const float4 * src0, device const float4 * src1, device float4 * dst, - constant int64_t & nb [[buffer(28)]], + constant uint64_t & nb [[buffer(28)]], uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] + src1[tpig % nb]; } @@ -214,7 +214,7 @@ kernel void kernel_mul_row( device const float4 * src0, device const float4 * src1, device float4 * dst, - constant int64_t & nb [[buffer(28)]], + constant uint64_t & nb [[buffer(28)]], uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] * src1[tpig % nb]; } @@ -223,7 +223,7 @@ kernel void kernel_div_row( device const float4 * src0, device const float4 * src1, device float4 * dst, - constant int64_t & nb [[buffer(28)]], + constant uint64_t & nb [[buffer(28)]], uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] / src1[tpig % nb]; } @@ -307,26 +307,26 @@ kernel void kernel_sum_rows( constant int64_t & ne01, constant int64_t & ne02, constant int64_t & ne03, - constant int64_t & nb00, - constant int64_t & nb01, - constant int64_t & nb02, - constant int64_t & nb03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, constant int64_t & ne10, constant int64_t & ne11, constant int64_t & ne12, constant int64_t & ne13, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, - constant int64_t & nb13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, constant int64_t & ne0, constant int64_t & ne1, constant int64_t & ne2, constant int64_t & ne3, - constant int64_t & nb0, - constant int64_t & nb1, - constant int64_t & nb2, - constant int64_t & nb3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, uint3 tpig[[thread_position_in_grid]]) { int64_t i3 = tpig.z; int64_t i2 = tpig.y; @@ -846,7 +846,7 @@ inline float block_q_n_dot_y(device const block_q5_1 * qb_curr, float sumy, thre #define N_SIMDGROUP 2 // number of SIMD groups in a thread group //Note: This is a template, but strictly speaking it only applies to // quantizations where the block size is 32. It also does not -// giard against the number of rows not being divisible by +// guard against the number of rows not being divisible by // N_DST, so this is another explicit assumption of the implementation. template void mul_vec_q_n_f32_impl( @@ -920,14 +920,21 @@ kernel void kernel_mul_mv_q4_0_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -939,14 +946,21 @@ kernel void kernel_mul_mv_q4_1_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -958,14 +972,21 @@ kernel void kernel_mul_mv_q5_0_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -977,14 +998,21 @@ kernel void kernel_mul_mv_q5_1_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1071,12 +1099,19 @@ kernel void kernel_mul_mv_q8_0_f32( constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, constant int64_t & ne10, + constant int64_t & ne11, constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1182,8 +1217,8 @@ kernel void kernel_mul_mv_f32_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { kernel_mul_mv_f32_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg); @@ -1209,8 +1244,8 @@ kernel void kernel_mul_mv_f16_f16( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -1346,8 +1381,8 @@ kernel void kernel_mul_mv_f16_f32_1row( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { kernel_mul_mv_f16_f32_1row_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg); @@ -1452,8 +1487,8 @@ kernel void kernel_mul_mv_f16_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { kernel_mul_mv_f16_f32_impl(src0, src1, dst, ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1, r2, r3, tgpig, tiisg); @@ -1478,8 +1513,8 @@ kernel void kernel_mul_mv_f16_f32_l4( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]]) { @@ -1543,7 +1578,8 @@ kernel void kernel_alibi_f32( const int64_t i3 = n / (ne2*ne1*ne0); const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; - const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + //const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + const int64_t k = i3*ne3 + i2; float m_k; @@ -2410,21 +2446,18 @@ typedef struct { } block_q6_K; // 210 bytes / block -static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { - uchar4 r; - if (j < 4) { - r[0] = q[j+0] & 63; - r[2] = q[j+1] & 63; - r[1] = q[j+4] & 63; - r[3] = q[j+5] & 63; - } else { - r[0] = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); - r[2] = (q[j+5] & 0xF) | ((q[j-3] >> 6) << 4); - r[1] = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); - r[3] = (q[j+5] >> 4) | ((q[j+1] >> 6) << 4); - } - return r; -} +typedef struct { + half d; + uint16_t qs[QK_K/8]; +} block_iq2_xxs; +// 66 bytes / block for QK_K = 256, so 2.0625 bpw + +typedef struct { + half d; + uint16_t qs[QK_K/8]; + uint8_t scales[QK_K/32]; +} block_iq2_xs; +// 74 bytes / block for QK_K = 256, so 2.3125 bpw //====================================== dot products ========================= @@ -2584,14 +2617,21 @@ kernel void kernel_mul_mv_q2_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2841,14 +2881,21 @@ kernel void kernel_mul_mv_q3_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -2984,8 +3031,8 @@ void kernel_mul_mv_q4_K_f32_impl( constant uint & r2, constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int ix = tiisg/4; // 0...7 const int it = tiisg%4; // 0...3 @@ -2994,7 +3041,7 @@ void kernel_mul_mv_q4_K_f32_impl( const int r0 = tgpig.x; const int r1 = tgpig.y; const int im = tgpig.z; - const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int first_row = r0 * N_DST; const int ib_row = first_row * nb; const uint i12 = im%ne12; @@ -3060,7 +3107,7 @@ void kernel_mul_mv_q4_K_f32_impl( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0+ im*ne0*ne1 + first_row + row] = all_sum; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; } } } @@ -3072,14 +3119,21 @@ kernel void kernel_mul_mv_q4_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -3271,14 +3325,21 @@ kernel void kernel_mul_mv_q5_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -3398,14 +3459,21 @@ kernel void kernel_mul_mv_q6_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne01[[buffer(4)]], - constant int64_t & ne02[[buffer(5)]], - constant int64_t & ne10[[buffer(9)]], - constant int64_t & ne12[[buffer(11)]], - constant int64_t & ne0 [[buffer(15)]], - constant int64_t & ne1 [[buffer(16)]], - constant uint & r2 [[buffer(17)]], - constant uint & r3 [[buffer(18)]], + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -3413,53 +3481,542 @@ kernel void kernel_mul_mv_q6_K_f32( kernel_mul_mv_q6_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); } -//============================= templates and their specializations ============================= +// ======================= "True" 2-bit + +constexpr constant static uint64_t iq2xxs_grid[256] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, + 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, + 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819, + 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b, + 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808, + 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08, + 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b, + 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819, + 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08, + 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, + 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08, + 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808, + 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808, + 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919, + 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08, + 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908, + 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819, + 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808, + 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808, + 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908, + 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808, + 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08, + 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819, + 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819, + 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908, + 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19, + 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819, + 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b, + 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808, + 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908, + 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08, + 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08, + 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908, + 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819, + 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808, + 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808, + 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19, + 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819, + 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, + 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b, + 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08, + 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808, + 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908, + 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b, + 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819, + 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08, + 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08, + 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808, + 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b, + 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b, + 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908, + 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819, + 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808, + 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908, + 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b, + 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808, + 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b, + 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b, + 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808, + 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19, + 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, +}; -// NOTE: this is not dequantizing - we are simply fitting the template -template -void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { - float4x4 temp = *(((device float4x4 *)src)); - for (int i = 0; i < 16; i++){ - reg[i/4][i%4] = temp[i/4][i%4]; - } -} +constexpr constant static uint64_t iq2xs_grid[512] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, + 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, + 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, + 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, + 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, + 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, + 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, + 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, + 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, + 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, + 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, + 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, + 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, + 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, + 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, + 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, + 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, + 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, + 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, + 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, + 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, + 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, + 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, + 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, + 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, + 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, + 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, + 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, + 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, + 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, + 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, + 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, + 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, + 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, + 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, + 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, + 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, + 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, + 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, + 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, + 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, + 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, + 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, + 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, + 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, + 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, + 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, + 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, + 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, + 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, + 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, + 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, + 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, + 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, + 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, + 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, + 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, + 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, + 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, + 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, + 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, + 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, + 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, + 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, + 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, + 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, + 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, + 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, + 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, + 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, + 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, + 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, + 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, + 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, + 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, + 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, + 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, + 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, + 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, + 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, + 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, + 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, + 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, + 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, + 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, + 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, + 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, + 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, + 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, + 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, + 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, + 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, + 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, + 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, + 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, + 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, + 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, + 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, + 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, + 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, + 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, + 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, + 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, + 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, + 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, + 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, + 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, + 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, + 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, + 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, + 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, + 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, + 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, + 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, + 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, + 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, + 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, + 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, + 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, + 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, + 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, + 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, + 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, + 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, + 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, +}; -template -void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { - half4x4 temp = *(((device half4x4 *)src)); - for (int i = 0; i < 16; i++){ - reg[i/4][i%4] = temp[i/4][i%4]; - } -} +constexpr constant static uint8_t ksigns_iq2xs[128] = { + 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, + 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, + 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175, + 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, + 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207, + 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95, + 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111, + 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, +}; -template -void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 1); - const float d1 = il ? (xb->d / 16.h) : xb->d; - const float d2 = d1 / 256.f; - const float md = -8.h * xb->d; - const ushort mask0 = il ? 0x00F0 : 0x000F; - const ushort mask1 = mask0 << 8; +constexpr constant static uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128}; - for (int i=0;i<8;i++) { - reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md; - reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md; - } -} +void kernel_mul_mv_iq2_xxs_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { -template -void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { - device const uint16_t * qs = ((device const uint16_t *)xb + 2); - const float d1 = il ? (xb->d / 16.h) : xb->d; - const float d2 = d1 / 256.f; - const float m = xb->m; - const ushort mask0 = il ? 0x00F0 : 0x000F; - const ushort mask1 = mask0 << 8; + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; - for (int i=0;i<8;i++) { - reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m; - reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m; - } + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + + device const block_iq2_xxs * x = (device const block_iq2_xxs *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float yl[32]; + float sumf[N_DST]={0.f}, all_sum; + + const int nb32 = nb * (QK_K / 32); + + threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values; + threadgroup uint8_t * shared_signs = (threadgroup uint8_t *)(values + 256); + { + int nval = 4; + int pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) values[pos + i] = iq2xxs_grid[pos + i]; + nval = 2; + pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i]; + threadgroup_barrier(mem_flags::mem_threadgroup); + } + +#if QK_K == 256 + const int ix = tiisg; + + device const float * y4 = y + 32 * ix; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + + for (int i = 0; i < 32; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq2_xxs * xr = x + ibl; + device const uint16_t * q2 = xr->qs + 4 * ib; + device const half * dh = &xr->d; + + for (int row = 0; row < N_DST; row++) { + + const float db = dh[0]; + device const uint8_t * aux8 = (device const uint8_t *)q2; + const uint32_t aux32 = q2[2] | (q2[3] << 16); + const float d = db * (0.5f + (aux32 >> 28)); + + float sum = 0; + for (int l = 0; l < 4; ++l) { + const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + aux8[l]); + const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 8; ++j) { + sum += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + } + } + sumf[row] += d * sum; + + dh += nb*sizeof(block_iq2_xxs)/2; + q2 += nb*sizeof(block_iq2_xxs)/2; + } + + y4 += 32 * 32; + } +#else + // TODO +#endif + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f; + } + } +} + +[[host_name("kernel_mul_mv_iq2_xxs_f32")]] +kernel void kernel_mul_mv_iq2_xxs_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq2_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +} + +void kernel_mul_mv_iq2_xs_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + + device const block_iq2_xs * x = (device const block_iq2_xs *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float yl[32]; + float sumf[N_DST]={0.f}, all_sum; + + const int nb32 = nb * (QK_K / 32); + + threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values; + threadgroup uint8_t * shared_signs = (threadgroup uint8_t *)(values + 512); + { + int nval = 8; + int pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) values[pos + i] = iq2xs_grid[pos + i]; + nval = 2; + pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i]; + threadgroup_barrier(mem_flags::mem_threadgroup); + } + +#if QK_K == 256 + const int ix = tiisg; + + device const float * y4 = y + 32 * ix; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + + for (int i = 0; i < 32; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq2_xs * xr = x + ibl; + device const uint16_t * q2 = xr->qs + 4 * ib; + device const uint8_t * sc = xr->scales + ib; + device const half * dh = &xr->d; + + for (int row = 0; row < N_DST; row++) { + + const float db = dh[0]; + const uint8_t ls1 = sc[0] & 0xf; + const uint8_t ls2 = sc[0] >> 4; + const float d1 = db * (0.5f + ls1); + const float d2 = db * (0.5f + ls2); + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < 2; ++l) { + const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511)); + const uint8_t signs = shared_signs[(q2[l] >> 9)]; + for (int j = 0; j < 8; ++j) { + sum1 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + } + } + for (int l = 2; l < 4; ++l) { + const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511)); + const uint8_t signs = shared_signs[(q2[l] >> 9)]; + for (int j = 0; j < 8; ++j) { + sum2 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + } + } + sumf[row] += d1 * sum1 + d2 * sum2; + + dh += nb*sizeof(block_iq2_xs)/2; + q2 += nb*sizeof(block_iq2_xs)/2; + sc += nb*sizeof(block_iq2_xs); + } + + y4 += 32 * 32; + } +#else + // TODO +#endif + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f; + } + } +} + +[[host_name("kernel_mul_mv_iq2_xs_f32")]] +kernel void kernel_mul_mv_iq2_xs_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +} + +//============================= templates and their specializations ============================= + +// NOTE: this is not dequantizing - we are simply fitting the template +template +void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { + float4x4 temp = *(((device float4x4 *)src)); + for (int i = 0; i < 16; i++){ + reg[i/4][i%4] = temp[i/4][i%4]; + } +} + +template +void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { + half4x4 temp = *(((device half4x4 *)src)); + for (int i = 0; i < 16; i++){ + reg[i/4][i%4] = temp[i/4][i%4]; + } +} + +template +void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 1); + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float md = -8.h * xb->d; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md; + reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md; + } +} + +template +void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 2); + const float d1 = il ? (xb->d / 16.h) : xb->d; + const float d2 = d1 / 256.f; + const float m = xb->m; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = mask0 << 8; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)+0] = ((qs[i] & mask0) * d1) + m; + reg[i/2][2*(i%2)+1] = ((qs[i] & mask1) * d2) + m; + } } template @@ -3523,7 +4080,7 @@ void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg device const int8_t * qs = ((device const int8_t *)xb->qs); const half d = xb->d; - for (int i=0;i<16;i++) { + for (int i = 0; i < 16; i++) { reg[i/4][i%4] = (qs[i + 16*il] * d); } } @@ -3565,8 +4122,8 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) : (scale_2&kmask2) | ((scale_1&kmask1) << 4); - half dl = il<8 ? d_all * (dl_int - 32.h) : d_all * (dl_int / 16.h - 32.h); - const half ml = 4.h * dl; + float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); + const float ml = 4.f * dl; il = (il/2) & 3; const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); @@ -3633,7 +4190,7 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg uint8_t ul = 1 << (il/2); il = il & 3; const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales); - const float d = il < 2 ? xb->d : xb->d / 16.h; + const float d = il < 2 ? xb->d : xb->d / 16.f; const float min = xb->dmin; const float dl = d * sc[0]; const float ml = min * sc[1]; @@ -3666,17 +4223,17 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg #if QK_K == 256 ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); qh = qh + 32*(il/8) + 16*(il&1); - half sc = scales[(il%2) + 2 * ((il/2))]; + float sc = scales[(il%2) + 2 * ((il/2))]; il = (il/2) & 3; #else ql = ql + 16 * (il&1); - half sc = scales[il]; + float sc = scales[il]; #endif const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; - const half coef = il>1 ? 1.f/16.h : 1.h; - const half ml = d_all * sc * 32.h; - const half dl = d_all * sc * coef; + const float coef = il>1 ? 1.f/16.f : 1.f; + const float ml = d_all * sc * 32.f; + const float dl = d_all * sc * coef; for (int i = 0; i < 16; ++i) { const half q = il&1 ? ((ql[i] & kmask2) | ((qh[i] & kmask1) << 2)) : ((ql[i] & kmask2) | ((qh[i] & kmask1) << 4)); @@ -3684,6 +4241,52 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg } } +template +void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + // each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's. + device const uint16_t * q2 = xb->qs + 4*ib32; + const uint32_t aux32_g = q2[0] | (q2[1] << 16); + const uint32_t aux32_s = q2[2] | (q2[3] << 16); + thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g; + const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f; + constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]); + uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127]; + for (int i = 0; i < 8; ++i) { + reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } + grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]); + signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127]; + for (int i = 0; i < 8; ++i) { + reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } +} + +template +void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint16_t * q2 = xb->qs + 4*ib32; + const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; + constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511)); + uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9]; + for (int i = 0; i < 8; ++i) { + reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } + grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511)); + signs = ksigns_iq2xs[q2[2*il+1] >> 9]; + for (int i = 0; i < 8; ++i) { + reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } +} + template kernel void kernel_get_rows( device const void * src0, @@ -3764,48 +4367,212 @@ kernel void kernel_get_rows_f16( const int64_t i10 = tgpig.x; const int64_t i11 = tgpig.y; - const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0]; + const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0]; + + const int64_t i02 = i11; + + for (int ind = tiitg; ind < ne00; ind += tptg.x) { + ((device float *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] = + ((device half *) ((device char *) src0 + r*nb01 + i02*nb02))[ind]; + } +} + +kernel void kernel_get_rows_i32( + device const void * src0, + device const char * src1, + device int32_t * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb1, + constant uint64_t & nb2, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg [[threads_per_threadgroup]]) { + const int64_t i10 = tgpig.x; + const int64_t i11 = tgpig.y; + + const int64_t r = ((device int32_t *) ((device char *) src1 + i11*nb11 + i10*nb10))[0]; + + const int64_t i02 = i11; + + for (int ind = tiitg; ind < ne00; ind += tptg.x) { + ((device int32_t *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] = + ((device int32_t *) ((device char *) src0 + r*nb01 + i02*nb02))[ind]; + } +} + + +#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A +#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B +#define BLOCK_SIZE_K 32 +#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A +#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B +#define THREAD_PER_BLOCK 128 +#define THREAD_PER_ROW 2 // 2 thread for each row in matrix A to load numbers +#define THREAD_PER_COL 4 // 4 thread for each row in matrix B to load numbers +#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8 +#define SG_MAT_ROW 8 + +// each block_q contains 16*nl weights +template +void kernel_mul_mm_impl(device const uchar * src0, + device const uchar * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + threadgroup half * sa = (threadgroup half *)(shared_memory); + threadgroup float * sb = (threadgroup float *)(shared_memory + 4096); + + const uint r0 = tgpig.y; + const uint r1 = tgpig.x; + const uint im = tgpig.z; + + // if this block is of 64x32 shape or smaller + short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; + short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; + + // a thread shouldn't load data outside of the matrix + short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; + short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; + + simdgroup_half8x8 ma[4]; + simdgroup_float8x8 mb[2]; + simdgroup_float8x8 c_res[8]; + for (int i = 0; i < 8; i++){ + c_res[i] = make_filled_simdgroup_matrix(0.f); + } + + short il = (tiitg % THREAD_PER_ROW); + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + uint offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02); + ushort offset1 = il/nl; + + device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; + device const float * y = (device const float *)(src1 + + nb12 * im + + nb11 * (r1 * BLOCK_SIZE_N + thread_col) + + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + + for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { + // load data and store to threadgroup memory + half4x4 temp_a; + dequantize_func(x, il, temp_a); + threadgroup_barrier(mem_flags::mem_threadgroup); + + #pragma unroll(16) + for (int i = 0; i < 16; i++) { + *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ + + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \ + + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; + } + + *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y); + + il = (il + 2 < nl) ? il + 2 : il % 2; + x = (il < 2) ? x + (2+nl-1)/nl : x; + y += BLOCK_SIZE_K; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // load matrices from threadgroup memory and conduct outer products + threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); + threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); + + #pragma unroll(4) + for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { + #pragma unroll(4) + for (int i = 0; i < 4; i++) { + simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i); + } + simdgroup_barrier(mem_flags::mem_none); + #pragma unroll(2) + for (int i = 0; i < 2; i++) { + simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i); + } + + lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; + lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; - const int64_t i02 = i11; + #pragma unroll(8) + for (int i = 0; i < 8; i++){ + simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]); + } + } + } - for (int ind = tiitg; ind < ne00; ind += tptg.x) { - ((device float *) ((device char *) dst + i11*nb2 + i10*nb1))[ind] = - ((device half *) ((device char *) src0 + r*nb01 + i02*nb02))[ind]; + if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) { + device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) \ + + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0; + for (int i = 0; i < 8; i++) { + simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0); + } + } else { + // block is smaller than 64x32, we should avoid writing data outside of the matrix + threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup float * temp_str = ((threadgroup float *)shared_memory) \ + + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; + for (int i = 0; i < 8; i++) { + simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; + if (sgitg == 0) { + for (int i = 0; i < n_rows; i++) { + for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { + *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M); + } + } + } } } -#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A -#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B -#define BLOCK_SIZE_K 32 -#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A -#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B -#define THREAD_PER_BLOCK 128 -#define THREAD_PER_ROW 2 // 2 thread for each row in matrix A to load numbers -#define THREAD_PER_COL 4 // 4 thread for each row in matrix B to load numbers -#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8 -#define SG_MAT_ROW 8 - -// each block_q contains 16*nl weights +// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in src1ids template -void kernel_mul_mm_impl(device const uchar * src0, - device const uchar * src1, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne02, - constant int64_t & nb01, - constant int64_t & nb02, - constant int64_t & ne12, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - threadgroup uchar * shared_memory [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { +void kernel_mul_mm_id_impl( + device const uchar * src0, + device const uchar * src1, + thread short * src1ids, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + int64_t ne1, + constant uint & r2, + constant uint & r3, + threadgroup uchar * shared_memory, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { threadgroup half * sa = (threadgroup half *)(shared_memory); threadgroup float * sb = (threadgroup float *)(shared_memory + 4096); @@ -3814,6 +4581,8 @@ void kernel_mul_mm_impl(device const uchar * src0, const uint r1 = tgpig.x; const uint im = tgpig.z; + if (r1 * BLOCK_SIZE_N >= ne1) return; + // if this block is of 64x32 shape or smaller short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; @@ -3840,7 +4609,7 @@ void kernel_mul_mm_impl(device const uchar * src0, device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; device const float * y = (device const float *)(src1 + nb12 * im - + nb11 * (r1 * BLOCK_SIZE_N + thread_col) + + nb11 * src1ids[r1 * BLOCK_SIZE_N + thread_col] + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { @@ -3849,7 +4618,6 @@ void kernel_mul_mm_impl(device const uchar * src0, dequantize_func(x, il, temp_a); threadgroup_barrier(mem_flags::mem_threadgroup); - #pragma unroll(16) for (int i = 0; i < 16; i++) { *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \ @@ -3868,14 +4636,11 @@ void kernel_mul_mm_impl(device const uchar * src0, threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); - #pragma unroll(4) for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { - #pragma unroll(4) for (int i = 0; i < 4; i++) { simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i); } simdgroup_barrier(mem_flags::mem_none); - #pragma unroll(2) for (int i = 0; i < 2; i++) { simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i); } @@ -3883,21 +4648,13 @@ void kernel_mul_mm_impl(device const uchar * src0, lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; - #pragma unroll(8) for (int i = 0; i < 8; i++){ simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]); } } } - if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) { - device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) \ - + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0; - for (int i = 0; i < 8; i++) { - simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0); - } - } else { - // block is smaller than 64x32, we should avoid writing data outside of the matrix + { threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup float * temp_str = ((threadgroup float *)shared_memory) \ + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; @@ -3907,11 +4664,11 @@ void kernel_mul_mm_impl(device const uchar * src0, threadgroup_barrier(mem_flags::mem_threadgroup); - device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; + device float * C = dst + (BLOCK_SIZE_M * r0) + im*ne1*ne0; if (sgitg == 0) { for (int i = 0; i < n_rows; i++) { for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { - *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M); + *(C + i + src1ids[j + r1*BLOCK_SIZE_N] * ne0) = *(temp_str + i + j * BLOCK_SIZE_M); } } } @@ -3924,12 +4681,12 @@ kernel void kernel_mul_mm(device const uchar * src0, device float * dst, constant int64_t & ne00, constant int64_t & ne02, - constant int64_t & nb01, - constant int64_t & nb02, + constant uint64_t & nb01, + constant uint64_t & nb02, constant int64_t & ne12, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -3964,20 +4721,20 @@ template( - src0[id], - src1 + bid*nb11, - (device float *) (dst + bid*nb1), + for (int64_t i1 = 0; i1 < ne1; i1++) { + if (((device int32_t *) (ids + i1*nbi1))[idx] == id) { + src1ids[_ne1++] = i1; + } + } + + kernel_mul_mm_id_impl( + src0s[id], + src1, + src1ids, + dst, ne00, ne02, nb01, @@ -4014,7 +4781,7 @@ kernel void kernel_mul_mm_id( nb11, nb12, ne0, - ne1, + _ne1, r2, r3, shared_memory, @@ -4059,6 +4826,8 @@ template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows; // // matrix-matrix multiplication @@ -4070,12 +4839,12 @@ typedef void (mat_mm_t)( device float * dst, constant int64_t & ne00, constant int64_t & ne02, - constant int64_t & nb01, - constant int64_t & nb02, + constant uint64_t & nb01, + constant uint64_t & nb02, constant int64_t & ne12, - constant int64_t & nb10, - constant int64_t & nb11, - constant int64_t & nb12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, constant uint & r2, @@ -4095,6 +4864,8 @@ template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; // // indirect matrix-matrix multiplication @@ -4103,20 +4874,20 @@ template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; // // matrix-vector multiplication @@ -4152,8 +4925,8 @@ template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mu kernel void kernel_mul_mv_id_f32_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4169,7 +4942,7 @@ kernel void kernel_mul_mv_id_f32_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4196,7 +4969,7 @@ kernel void kernel_mul_mv_id_f32_f32( kernel_mul_mv_f32_f32_impl( src0[id], src1 + bid*nb11, - (device float *) (dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4221,8 +4994,8 @@ kernel void kernel_mul_mv_id_f32_f32( kernel void kernel_mul_mv_id_f16_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4238,7 +5011,7 @@ kernel void kernel_mul_mv_id_f16_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4265,7 +5038,7 @@ kernel void kernel_mul_mv_id_f16_f32( kernel_mul_mv_f16_f32_impl( src0[id], src1 + bid*nb11, - (device float *) (dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4290,8 +5063,8 @@ kernel void kernel_mul_mv_id_f16_f32( kernel void kernel_mul_mv_id_q8_0_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4307,7 +5080,7 @@ kernel void kernel_mul_mv_id_q8_0_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4334,7 +5107,7 @@ kernel void kernel_mul_mv_id_q8_0_f32( kernel_mul_mv_q8_0_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4353,8 +5126,8 @@ kernel void kernel_mul_mv_id_q8_0_f32( kernel void kernel_mul_mv_id_q4_0_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4370,7 +5143,7 @@ kernel void kernel_mul_mv_id_q4_0_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4397,7 +5170,7 @@ kernel void kernel_mul_mv_id_q4_0_f32( mul_vec_q_n_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4416,8 +5189,8 @@ kernel void kernel_mul_mv_id_q4_0_f32( kernel void kernel_mul_mv_id_q4_1_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4433,7 +5206,7 @@ kernel void kernel_mul_mv_id_q4_1_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4460,7 +5233,7 @@ kernel void kernel_mul_mv_id_q4_1_f32( mul_vec_q_n_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4479,8 +5252,8 @@ kernel void kernel_mul_mv_id_q4_1_f32( kernel void kernel_mul_mv_id_q5_0_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4496,7 +5269,7 @@ kernel void kernel_mul_mv_id_q5_0_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4523,7 +5296,7 @@ kernel void kernel_mul_mv_id_q5_0_f32( mul_vec_q_n_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4542,8 +5315,8 @@ kernel void kernel_mul_mv_id_q5_0_f32( kernel void kernel_mul_mv_id_q5_1_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4559,7 +5332,7 @@ kernel void kernel_mul_mv_id_q5_1_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4586,7 +5359,7 @@ kernel void kernel_mul_mv_id_q5_1_f32( mul_vec_q_n_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4605,8 +5378,8 @@ kernel void kernel_mul_mv_id_q5_1_f32( kernel void kernel_mul_mv_id_q2_K_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4622,7 +5395,7 @@ kernel void kernel_mul_mv_id_q2_K_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4649,7 +5422,7 @@ kernel void kernel_mul_mv_id_q2_K_f32( kernel_mul_mv_q2_K_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4668,8 +5441,8 @@ kernel void kernel_mul_mv_id_q2_K_f32( kernel void kernel_mul_mv_id_q3_K_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4685,7 +5458,7 @@ kernel void kernel_mul_mv_id_q3_K_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4712,7 +5485,7 @@ kernel void kernel_mul_mv_id_q3_K_f32( kernel_mul_mv_q3_K_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4731,8 +5504,8 @@ kernel void kernel_mul_mv_id_q3_K_f32( kernel void kernel_mul_mv_id_q4_K_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4748,7 +5521,7 @@ kernel void kernel_mul_mv_id_q4_K_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4775,7 +5548,7 @@ kernel void kernel_mul_mv_id_q4_K_f32( kernel_mul_mv_q4_K_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4794,8 +5567,8 @@ kernel void kernel_mul_mv_id_q4_K_f32( kernel void kernel_mul_mv_id_q5_K_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4811,7 +5584,7 @@ kernel void kernel_mul_mv_id_q5_K_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4838,7 +5611,7 @@ kernel void kernel_mul_mv_id_q5_K_f32( kernel_mul_mv_q5_K_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, ne00, ne01, ne02, @@ -4857,8 +5630,8 @@ kernel void kernel_mul_mv_id_q5_K_f32( kernel void kernel_mul_mv_id_q6_K_f32( device const char * ids, device const char * src1, - device uchar * dst, - constant int64_t & nbi1, + device float * dst, + constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -4874,7 +5647,7 @@ kernel void kernel_mul_mv_id_q6_K_f32( constant uint64_t & nb12, constant int64_t & ne0, constant int64_t & ne1, - constant int64_t & nb1, + constant uint64_t & nb1, constant uint & r2, constant uint & r3, constant int & idx, @@ -4901,7 +5674,136 @@ kernel void kernel_mul_mv_id_q6_K_f32( kernel_mul_mv_q6_K_f32_impl( src0[id], (device const float *) (src1 + bid*nb11), - (device float *) ( dst + bid*nb1), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} + +[[host_name("kernel_mul_mv_id_iq2_xxs_f32")]] +kernel void kernel_mul_mv_id_iq2_xxs_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq2_xxs_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + shared_values, + tgpig, + tiisg, + sgitg); +} + +[[host_name("kernel_mul_mv_id_iq2_xs_f32")]] +kernel void kernel_mul_mv_id_iq2_xs_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq2_xs_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, ne00, ne01, ne02, @@ -4911,6 +5813,7 @@ kernel void kernel_mul_mv_id_q6_K_f32( ne1, r2, r3, + shared_values, tgpig, tiisg, sgitg); diff --git a/cpp/ggml-metal.h b/cpp/ggml-metal.h index 6e5291d9..6f3495c7 100644 --- a/cpp/ggml-metal.h +++ b/cpp/ggml-metal.h @@ -36,69 +36,21 @@ struct lm_ggml_cgraph; extern "C" { #endif -// -// internal API -// temporary exposed to user-code -// - -struct lm_ggml_metal_context; - -void lm_ggml_metal_log_set_callback(lm_ggml_log_callback log_callback, void * user_data); - -// number of command buffers to use -struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb); -void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx); - -void * lm_ggml_metal_host_malloc(size_t n); -void lm_ggml_metal_host_free (void * data); - -// set the number of command buffers to use -void lm_ggml_metal_set_n_cb(struct lm_ggml_metal_context * ctx, int n_cb); - -// creates a mapping between a host memory buffer and a device memory buffer -// - make sure to map all buffers used in the graph before calling lm_ggml_metal_graph_compute -// - the mapping is used during computation to determine the arguments of the compute kernels -// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal -// - max_size specifies the maximum size of a tensor and is used to create shared views such -// that it is guaranteed that the tensor will fit in at least one of the views -// -bool lm_ggml_metal_add_buffer( - struct lm_ggml_metal_context * ctx, - const char * name, - void * data, - size_t size, - size_t max_size); - -// set data from host memory into the device -void lm_ggml_metal_set_tensor(struct lm_ggml_metal_context * ctx, struct lm_ggml_tensor * t); - -// get data from the device into host memory -void lm_ggml_metal_get_tensor(struct lm_ggml_metal_context * ctx, struct lm_ggml_tensor * t); - -// try to find operations that can be run concurrently in the graph -// you should run it again if the topology of your graph changes -void lm_ggml_metal_graph_find_concurrency(struct lm_ggml_metal_context * ctx, struct lm_ggml_cgraph * gf, bool check_mem); - -// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized -int lm_ggml_metal_if_optimized(struct lm_ggml_metal_context * ctx); - -// output the concur_list for lm_ggml_alloc -int * lm_ggml_metal_get_concur_list(struct lm_ggml_metal_context * ctx); - -// same as lm_ggml_graph_compute but uses Metal -// creates gf->n_threads command buffers in parallel -void lm_ggml_metal_graph_compute(struct lm_ggml_metal_context * ctx, struct lm_ggml_cgraph * gf); - // // backend API // user-code should use only these functions // +LM_GGML_API void lm_ggml_backend_metal_log_set_callback(lm_ggml_log_callback log_callback, void * user_data); + LM_GGML_API lm_ggml_backend_t lm_ggml_backend_metal_init(void); LM_GGML_API bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend); +LM_GGML_API lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); + LM_GGML_API void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb); + LM_GGML_API lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void); // helper to check if the device supports a specific family diff --git a/cpp/ggml-metal.m b/cpp/ggml-metal.m index d4c6e0d1..d2a30636 100644 --- a/cpp/ggml-metal.m +++ b/cpp/ggml-metal.m @@ -24,7 +24,7 @@ #define UNUSED(x) (void)(x) -#define LM_GGML_MAX_CONCUR (2*LM_GGML_DEFAULT_GRAPH_SIZE) +#define LM_GGML_METAL_MAX_KERNELS 256 struct lm_ggml_metal_buffer { const char * name; @@ -35,6 +35,134 @@ id metal; }; +struct lm_ggml_metal_kernel { + id function; + id pipeline; +}; + +enum lm_ggml_metal_kernel_type { + LM_GGML_METAL_KERNEL_TYPE_ADD, + LM_GGML_METAL_KERNEL_TYPE_ADD_ROW, + LM_GGML_METAL_KERNEL_TYPE_MUL, + LM_GGML_METAL_KERNEL_TYPE_MUL_ROW, + LM_GGML_METAL_KERNEL_TYPE_DIV, + LM_GGML_METAL_KERNEL_TYPE_DIV_ROW, + LM_GGML_METAL_KERNEL_TYPE_SCALE, + LM_GGML_METAL_KERNEL_TYPE_SCALE_4, + LM_GGML_METAL_KERNEL_TYPE_TANH, + LM_GGML_METAL_KERNEL_TYPE_RELU, + LM_GGML_METAL_KERNEL_TYPE_GELU, + LM_GGML_METAL_KERNEL_TYPE_GELU_QUICK, + LM_GGML_METAL_KERNEL_TYPE_SILU, + LM_GGML_METAL_KERNEL_TYPE_SOFT_MAX, + LM_GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, + LM_GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, + LM_GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_F16, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, + LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, + LM_GGML_METAL_KERNEL_TYPE_RMS_NORM, + LM_GGML_METAL_KERNEL_TYPE_GROUP_NORM, + LM_GGML_METAL_KERNEL_TYPE_NORM, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, + //LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, + //LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, + //LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, + LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, + LM_GGML_METAL_KERNEL_TYPE_ROPE_F32, + LM_GGML_METAL_KERNEL_TYPE_ROPE_F16, + LM_GGML_METAL_KERNEL_TYPE_ALIBI_F32, + LM_GGML_METAL_KERNEL_TYPE_IM2COL_F16, + LM_GGML_METAL_KERNEL_TYPE_UPSCALE_F32, + LM_GGML_METAL_KERNEL_TYPE_PAD_F32, + LM_GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, + LM_GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, + LM_GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, + LM_GGML_METAL_KERNEL_TYPE_CPY_F32_F16, + LM_GGML_METAL_KERNEL_TYPE_CPY_F32_F32, + LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, + LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, + LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, + //LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0, + //LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1, + LM_GGML_METAL_KERNEL_TYPE_CPY_F16_F16, + LM_GGML_METAL_KERNEL_TYPE_CPY_F16_F32, + LM_GGML_METAL_KERNEL_TYPE_CONCAT, + LM_GGML_METAL_KERNEL_TYPE_SQR, + LM_GGML_METAL_KERNEL_TYPE_SUM_ROWS, + + LM_GGML_METAL_KERNEL_TYPE_COUNT +}; + struct lm_ggml_metal_context { int n_cb; @@ -50,123 +178,10 @@ int n_buffers; struct lm_ggml_metal_buffer buffers[LM_GGML_METAL_MAX_BUFFERS]; - int concur_list[LM_GGML_MAX_CONCUR]; - int concur_list_len; - - // custom kernels -#define LM_GGML_METAL_DECL_KERNEL(name) \ - id function_##name; \ - id pipeline_##name - - LM_GGML_METAL_DECL_KERNEL(add); - LM_GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast - LM_GGML_METAL_DECL_KERNEL(mul); - LM_GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast - LM_GGML_METAL_DECL_KERNEL(div); - LM_GGML_METAL_DECL_KERNEL(div_row); - LM_GGML_METAL_DECL_KERNEL(scale); - LM_GGML_METAL_DECL_KERNEL(scale_4); - LM_GGML_METAL_DECL_KERNEL(tanh); - LM_GGML_METAL_DECL_KERNEL(relu); - LM_GGML_METAL_DECL_KERNEL(gelu); - LM_GGML_METAL_DECL_KERNEL(gelu_quick); - LM_GGML_METAL_DECL_KERNEL(silu); - LM_GGML_METAL_DECL_KERNEL(soft_max); - LM_GGML_METAL_DECL_KERNEL(soft_max_4); - LM_GGML_METAL_DECL_KERNEL(diag_mask_inf); - LM_GGML_METAL_DECL_KERNEL(diag_mask_inf_8); - LM_GGML_METAL_DECL_KERNEL(get_rows_f32); - LM_GGML_METAL_DECL_KERNEL(get_rows_f16); - LM_GGML_METAL_DECL_KERNEL(get_rows_q4_0); - LM_GGML_METAL_DECL_KERNEL(get_rows_q4_1); - LM_GGML_METAL_DECL_KERNEL(get_rows_q5_0); - LM_GGML_METAL_DECL_KERNEL(get_rows_q5_1); - LM_GGML_METAL_DECL_KERNEL(get_rows_q8_0); - LM_GGML_METAL_DECL_KERNEL(get_rows_q2_K); - LM_GGML_METAL_DECL_KERNEL(get_rows_q3_K); - LM_GGML_METAL_DECL_KERNEL(get_rows_q4_K); - LM_GGML_METAL_DECL_KERNEL(get_rows_q5_K); - LM_GGML_METAL_DECL_KERNEL(get_rows_q6_K); - LM_GGML_METAL_DECL_KERNEL(rms_norm); - LM_GGML_METAL_DECL_KERNEL(group_norm); - LM_GGML_METAL_DECL_KERNEL(norm); - LM_GGML_METAL_DECL_KERNEL(mul_mv_f32_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_f16_f16); - LM_GGML_METAL_DECL_KERNEL(mul_mv_f16_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row); - LM_GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4); - LM_GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_q5_0_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_q5_1_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32); - //LM_GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16); - LM_GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32); - //LM_GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_1row); - //LM_GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_l4); - LM_GGML_METAL_DECL_KERNEL(mul_mv_id_q4_0_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_id_q4_1_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_id_q5_0_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_id_q5_1_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_id_q8_0_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_id_q2_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_id_q3_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_id_q4_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_f32_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_f16_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_q5_0_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_q5_1_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_id_f32_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_id_f16_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q4_0_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q4_1_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q5_0_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q5_1_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q8_0_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q2_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q3_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q4_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q5_K_f32); - LM_GGML_METAL_DECL_KERNEL(mul_mm_id_q6_K_f32); - LM_GGML_METAL_DECL_KERNEL(rope_f32); - LM_GGML_METAL_DECL_KERNEL(rope_f16); - LM_GGML_METAL_DECL_KERNEL(alibi_f32); - LM_GGML_METAL_DECL_KERNEL(im2col_f16); - LM_GGML_METAL_DECL_KERNEL(upscale_f32); - LM_GGML_METAL_DECL_KERNEL(pad_f32); - LM_GGML_METAL_DECL_KERNEL(argsort_f32_i32_asc); - LM_GGML_METAL_DECL_KERNEL(argsort_f32_i32_desc); - LM_GGML_METAL_DECL_KERNEL(leaky_relu_f32); - LM_GGML_METAL_DECL_KERNEL(cpy_f32_f16); - LM_GGML_METAL_DECL_KERNEL(cpy_f32_f32); - LM_GGML_METAL_DECL_KERNEL(cpy_f32_q8_0); - LM_GGML_METAL_DECL_KERNEL(cpy_f32_q4_0); - LM_GGML_METAL_DECL_KERNEL(cpy_f32_q4_1); - //LM_GGML_METAL_DECL_KERNEL(cpy_f32_q5_0); - //LM_GGML_METAL_DECL_KERNEL(cpy_f32_q5_1); - LM_GGML_METAL_DECL_KERNEL(cpy_f16_f16); - LM_GGML_METAL_DECL_KERNEL(cpy_f16_f32); - LM_GGML_METAL_DECL_KERNEL(concat); - LM_GGML_METAL_DECL_KERNEL(sqr); - LM_GGML_METAL_DECL_KERNEL(sum_rows); - -#undef LM_GGML_METAL_DECL_KERNEL + struct lm_ggml_metal_kernel kernels[LM_GGML_METAL_MAX_KERNELS]; + + bool support_simdgroup_reduction; + bool support_simdgroup_mm; }; // MSL code @@ -180,14 +195,16 @@ @interface LMGGMLMetalClass : NSObject @implementation LMGGMLMetalClass @end -lm_ggml_log_callback lm_ggml_metal_log_callback = NULL; -void * lm_ggml_metal_log_user_data = NULL; +static void lm_ggml_metal_default_log_callback(enum lm_ggml_log_level level, const char * msg, void * user_data) { + fprintf(stderr, "%s", msg); -void lm_ggml_metal_log_set_callback(lm_ggml_log_callback log_callback, void * user_data) { - lm_ggml_metal_log_callback = log_callback; - lm_ggml_metal_log_user_data = user_data; + UNUSED(level); + UNUSED(user_data); } +lm_ggml_log_callback lm_ggml_metal_log_callback = lm_ggml_metal_default_log_callback; +void * lm_ggml_metal_log_user_data = NULL; + LM_GGML_ATTRIBUTE_FORMAT(2, 3) static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, ...){ if (lm_ggml_metal_log_callback != NULL) { @@ -210,7 +227,18 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, } } -struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) { +static void * lm_ggml_metal_host_malloc(size_t n) { + void * data = NULL; + const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + if (result != 0) { + LM_GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); + return NULL; + } + + return data; +} + +static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) { LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__); id device; @@ -236,7 +264,6 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; - ctx->concur_list_len = 0; ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); @@ -251,6 +278,7 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, NSError * error = nil; NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"]; if (libPath != nil) { + // pre-compiled library found NSURL * libURL = [NSURL fileURLWithPath:libPath]; LM_GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]); ctx->library = [ctx->device newLibraryWithURL:libURL error:&error]; @@ -278,12 +306,22 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, return NULL; } - MTLCompileOptions* options = nil; + // dictionary of preprocessor macros + NSMutableDictionary * prep = [NSMutableDictionary dictionary]; + #ifdef LM_GGML_QKK_64 - options = [MTLCompileOptions new]; - options.preprocessorMacros = @{ @"QK_K" : @(64) }; + prep[@"QK_K"] = @(64); #endif + + MTLCompileOptions* options = [MTLCompileOptions new]; + options.preprocessorMacros = prep; + + //[options setFastMathEnabled:false]; + ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; + + [options release]; + [prep release]; } if (error) { @@ -292,21 +330,46 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, } } -#if TARGET_OS_OSX // print MTL GPU family: LM_GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); + const NSInteger MTLGPUFamilyMetal3 = 5001; + // determine max supported GPU family // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf - for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { - if ([ctx->device supportsFamily:i]) { - LM_GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i); - break; + { + for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { + if ([ctx->device supportsFamily:i]) { + LM_GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - (int) MTLGPUFamilyApple1 + 1, i); + break; + } + } + + for (int i = MTLGPUFamilyCommon1 + 5; i >= MTLGPUFamilyCommon1; --i) { + if ([ctx->device supportsFamily:i]) { + LM_GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyCommon%d (%d)\n", __func__, i - (int) MTLGPUFamilyCommon1 + 1, i); + break; + } + } + + for (int i = MTLGPUFamilyMetal3 + 5; i >= MTLGPUFamilyMetal3; --i) { + if ([ctx->device supportsFamily:i]) { + LM_GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyMetal%d (%d)\n", __func__, i - (int) MTLGPUFamilyMetal3 + 3, i); + break; + } } } + ctx->support_simdgroup_reduction = [ctx->device supportsFamily:MTLGPUFamilyApple7]; + ctx->support_simdgroup_reduction |= [ctx->device supportsFamily:MTLGPUFamilyMetal3]; + + ctx->support_simdgroup_mm = [ctx->device supportsFamily:MTLGPUFamilyApple7]; + + LM_GGML_METAL_LOG_INFO("%s: simdgroup reduction support = %s\n", __func__, ctx->support_simdgroup_reduction ? "true" : "false"); + LM_GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false"); LM_GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); +#if TARGET_OS_OSX LM_GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); if (ctx->device.maxTransferRate != 0) { LM_GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6); @@ -319,257 +382,171 @@ static void lm_ggml_metal_log(enum lm_ggml_log_level level, const char * format, { NSError * error = nil; + for (int i = 0; i < LM_GGML_METAL_MAX_KERNELS; ++i) { + ctx->kernels[i].function = nil; + ctx->kernels[i].pipeline = nil; + } + /* - LM_GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ - (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ - (int) ctx->pipeline_##name.threadExecutionWidth); \ + LM_GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ + (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \ + (int) kernel->pipeline.threadExecutionWidth); \ */ -#define LM_GGML_METAL_ADD_KERNEL(name) \ - ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ - ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ - if (error) { \ - LM_GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ - return NULL; \ +#define LM_GGML_METAL_ADD_KERNEL(e, name, supported) \ + if (supported) { \ + struct lm_ggml_metal_kernel * kernel = &ctx->kernels[e]; \ + kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \ + kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \ + if (error) { \ + LM_GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + return NULL; \ + } \ + } else { \ + LM_GGML_METAL_LOG_WARN("%s: skipping %-32s (not supported)\n", __func__, "kernel_"#name); \ } - LM_GGML_METAL_ADD_KERNEL(add); - LM_GGML_METAL_ADD_KERNEL(add_row); - LM_GGML_METAL_ADD_KERNEL(mul); - LM_GGML_METAL_ADD_KERNEL(mul_row); - LM_GGML_METAL_ADD_KERNEL(div); - LM_GGML_METAL_ADD_KERNEL(div_row); - LM_GGML_METAL_ADD_KERNEL(scale); - LM_GGML_METAL_ADD_KERNEL(scale_4); - LM_GGML_METAL_ADD_KERNEL(tanh); - LM_GGML_METAL_ADD_KERNEL(relu); - LM_GGML_METAL_ADD_KERNEL(gelu); - LM_GGML_METAL_ADD_KERNEL(gelu_quick); - LM_GGML_METAL_ADD_KERNEL(silu); - LM_GGML_METAL_ADD_KERNEL(soft_max); - LM_GGML_METAL_ADD_KERNEL(soft_max_4); - LM_GGML_METAL_ADD_KERNEL(diag_mask_inf); - LM_GGML_METAL_ADD_KERNEL(diag_mask_inf_8); - LM_GGML_METAL_ADD_KERNEL(get_rows_f32); - LM_GGML_METAL_ADD_KERNEL(get_rows_f16); - LM_GGML_METAL_ADD_KERNEL(get_rows_q4_0); - LM_GGML_METAL_ADD_KERNEL(get_rows_q4_1); - LM_GGML_METAL_ADD_KERNEL(get_rows_q5_0); - LM_GGML_METAL_ADD_KERNEL(get_rows_q5_1); - LM_GGML_METAL_ADD_KERNEL(get_rows_q8_0); - LM_GGML_METAL_ADD_KERNEL(get_rows_q2_K); - LM_GGML_METAL_ADD_KERNEL(get_rows_q3_K); - LM_GGML_METAL_ADD_KERNEL(get_rows_q4_K); - LM_GGML_METAL_ADD_KERNEL(get_rows_q5_K); - LM_GGML_METAL_ADD_KERNEL(get_rows_q6_K); - LM_GGML_METAL_ADD_KERNEL(rms_norm); - LM_GGML_METAL_ADD_KERNEL(group_norm); - LM_GGML_METAL_ADD_KERNEL(norm); - LM_GGML_METAL_ADD_KERNEL(mul_mv_f32_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_f16_f16); - LM_GGML_METAL_ADD_KERNEL(mul_mv_f16_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row); - LM_GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4); - LM_GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_q5_0_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_q5_1_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32); - //LM_GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16); - LM_GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32); - //LM_GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_1row); - //LM_GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_l4); - LM_GGML_METAL_ADD_KERNEL(mul_mv_id_q4_0_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_id_q4_1_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_id_q5_0_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_id_q5_1_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_id_q8_0_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_id_q2_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_id_q3_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_id_q4_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32); - if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { - LM_GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_q5_0_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_q5_1_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_id_f32_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_id_f16_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q4_0_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q4_1_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q5_0_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q5_1_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q8_0_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q2_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q3_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q4_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q5_K_f32); - LM_GGML_METAL_ADD_KERNEL(mul_mm_id_q6_K_f32); - } - LM_GGML_METAL_ADD_KERNEL(rope_f32); - LM_GGML_METAL_ADD_KERNEL(rope_f16); - LM_GGML_METAL_ADD_KERNEL(alibi_f32); - LM_GGML_METAL_ADD_KERNEL(im2col_f16); - LM_GGML_METAL_ADD_KERNEL(upscale_f32); - LM_GGML_METAL_ADD_KERNEL(pad_f32); - LM_GGML_METAL_ADD_KERNEL(argsort_f32_i32_asc); - LM_GGML_METAL_ADD_KERNEL(argsort_f32_i32_desc); - LM_GGML_METAL_ADD_KERNEL(leaky_relu_f32); - LM_GGML_METAL_ADD_KERNEL(cpy_f32_f16); - LM_GGML_METAL_ADD_KERNEL(cpy_f32_f32); - LM_GGML_METAL_ADD_KERNEL(cpy_f32_q8_0); - LM_GGML_METAL_ADD_KERNEL(cpy_f32_q4_0); - LM_GGML_METAL_ADD_KERNEL(cpy_f32_q4_1); - //LM_GGML_METAL_ADD_KERNEL(cpy_f32_q5_0); - //LM_GGML_METAL_ADD_KERNEL(cpy_f32_q5_1); - LM_GGML_METAL_ADD_KERNEL(cpy_f16_f16); - LM_GGML_METAL_ADD_KERNEL(cpy_f16_f32); - LM_GGML_METAL_ADD_KERNEL(concat); - LM_GGML_METAL_ADD_KERNEL(sqr); - LM_GGML_METAL_ADD_KERNEL(sum_rows); - -#undef LM_GGML_METAL_ADD_KERNEL + // simd_sum and simd_max requires MTLGPUFamilyApple7 + + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ADD, add, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL, mul, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_DIV, div, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SCALE, scale, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_TANH, tanh, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_RELU, relu, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GELU, gelu, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SILU, silu, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SOFT_MAX, soft_max, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, soft_max_4, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, diag_mask_inf_8, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, get_rows_f32, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_F16, get_rows_f16, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0, get_rows_q4_0, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1, get_rows_q4_1, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0, get_rows_q5_0, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1, get_rows_q5_1, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0, get_rows_q8_0, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K, get_rows_q2_K, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K, get_rows_q3_K, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K, get_rows_q4_K, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K, get_rows_q5_K, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, get_rows_q6_K, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_NORM, norm, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, mul_mv_q4_0_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, mul_mv_q4_1_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, mul_mv_q5_K_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); + //LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction); + //LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, mul_mv_id_f16_f32_1row, ctx->support_simdgroup_reduction); + //LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, mul_mv_id_f16_f32_l4, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, mul_mv_id_q4_0_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, mul_mv_id_q4_1_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, mul_mv_id_q5_0_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, mul_mv_id_q5_1_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, mul_mv_id_q8_0_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, mul_mv_id_q2_K_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, mul_mv_id_q3_K_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, mul_mv_id_q4_K_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, mul_mv_id_q5_K_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, mul_mm_q4_1_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, mul_mm_q5_0_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, mul_mm_q5_1_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, mul_mm_q8_0_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, mul_mm_q2_K_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, mul_mm_q3_K_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, mul_mm_q4_K_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, mul_mm_q5_K_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, cpy_f32_q4_0, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, cpy_f32_q4_1, true); + //LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0, cpy_f32_q5_0, true); + //LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1, cpy_f32_q5_1, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F16_F32, cpy_f16_f32, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CONCAT, concat, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SQR, sqr, true); + LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); } return ctx; } -void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { +static void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { LM_GGML_METAL_LOG_INFO("%s: deallocating\n", __func__); -#define LM_GGML_METAL_DEL_KERNEL(name) \ - [ctx->function_##name release]; \ - [ctx->pipeline_##name release]; - - LM_GGML_METAL_DEL_KERNEL(add); - LM_GGML_METAL_DEL_KERNEL(add_row); - LM_GGML_METAL_DEL_KERNEL(mul); - LM_GGML_METAL_DEL_KERNEL(mul_row); - LM_GGML_METAL_DEL_KERNEL(div); - LM_GGML_METAL_DEL_KERNEL(div_row); - LM_GGML_METAL_DEL_KERNEL(scale); - LM_GGML_METAL_DEL_KERNEL(scale_4); - LM_GGML_METAL_DEL_KERNEL(tanh); - LM_GGML_METAL_DEL_KERNEL(relu); - LM_GGML_METAL_DEL_KERNEL(gelu); - LM_GGML_METAL_DEL_KERNEL(gelu_quick); - LM_GGML_METAL_DEL_KERNEL(silu); - LM_GGML_METAL_DEL_KERNEL(soft_max); - LM_GGML_METAL_DEL_KERNEL(soft_max_4); - LM_GGML_METAL_DEL_KERNEL(diag_mask_inf); - LM_GGML_METAL_DEL_KERNEL(diag_mask_inf_8); - LM_GGML_METAL_DEL_KERNEL(get_rows_f32); - LM_GGML_METAL_DEL_KERNEL(get_rows_f16); - LM_GGML_METAL_DEL_KERNEL(get_rows_q4_0); - LM_GGML_METAL_DEL_KERNEL(get_rows_q4_1); - LM_GGML_METAL_DEL_KERNEL(get_rows_q5_0); - LM_GGML_METAL_DEL_KERNEL(get_rows_q5_1); - LM_GGML_METAL_DEL_KERNEL(get_rows_q8_0); - LM_GGML_METAL_DEL_KERNEL(get_rows_q2_K); - LM_GGML_METAL_DEL_KERNEL(get_rows_q3_K); - LM_GGML_METAL_DEL_KERNEL(get_rows_q4_K); - LM_GGML_METAL_DEL_KERNEL(get_rows_q5_K); - LM_GGML_METAL_DEL_KERNEL(get_rows_q6_K); - LM_GGML_METAL_DEL_KERNEL(rms_norm); - LM_GGML_METAL_DEL_KERNEL(group_norm); - LM_GGML_METAL_DEL_KERNEL(norm); - LM_GGML_METAL_DEL_KERNEL(mul_mv_f32_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_f16_f16); - LM_GGML_METAL_DEL_KERNEL(mul_mv_f16_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row); - LM_GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4); - LM_GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_q5_0_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_q5_1_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32); - //LM_GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16); - LM_GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32); - //LM_GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_1row); - //LM_GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_l4); - LM_GGML_METAL_DEL_KERNEL(mul_mv_id_q4_0_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_id_q4_1_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_id_q5_0_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_id_q5_1_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_id_q8_0_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_id_q2_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_id_q3_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_id_q4_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32); - if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { - LM_GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_q5_0_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_q5_1_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_id_f32_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_id_f16_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q4_0_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q4_1_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q5_0_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q5_1_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q8_0_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q2_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q3_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q4_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q5_K_f32); - LM_GGML_METAL_DEL_KERNEL(mul_mm_id_q6_K_f32); - } - LM_GGML_METAL_DEL_KERNEL(rope_f32); - LM_GGML_METAL_DEL_KERNEL(rope_f16); - LM_GGML_METAL_DEL_KERNEL(alibi_f32); - LM_GGML_METAL_DEL_KERNEL(im2col_f16); - LM_GGML_METAL_DEL_KERNEL(upscale_f32); - LM_GGML_METAL_DEL_KERNEL(pad_f32); - LM_GGML_METAL_DEL_KERNEL(argsort_f32_i32_asc); - LM_GGML_METAL_DEL_KERNEL(argsort_f32_i32_desc); - LM_GGML_METAL_DEL_KERNEL(leaky_relu_f32); - LM_GGML_METAL_DEL_KERNEL(cpy_f32_f16); - LM_GGML_METAL_DEL_KERNEL(cpy_f32_f32); - LM_GGML_METAL_DEL_KERNEL(cpy_f32_q8_0); - LM_GGML_METAL_DEL_KERNEL(cpy_f32_q4_0); - LM_GGML_METAL_DEL_KERNEL(cpy_f32_q4_1); - //LM_GGML_METAL_DEL_KERNEL(cpy_f32_q5_0); - //LM_GGML_METAL_DEL_KERNEL(cpy_f32_q5_1); - LM_GGML_METAL_DEL_KERNEL(cpy_f16_f16); - LM_GGML_METAL_DEL_KERNEL(cpy_f16_f32); - LM_GGML_METAL_DEL_KERNEL(concat); - LM_GGML_METAL_DEL_KERNEL(sqr); - LM_GGML_METAL_DEL_KERNEL(sum_rows); - -#undef LM_GGML_METAL_DEL_KERNEL for (int i = 0; i < ctx->n_buffers; ++i) { [ctx->buffers[i].metal release]; } + for (int i = 0; i < LM_GGML_METAL_MAX_KERNELS; ++i) { + if (ctx->kernels[i].pipeline) { + [ctx->kernels[i].pipeline release]; + } + + if (ctx->kernels[i].function) { + [ctx->kernels[i].function release]; + } + } + [ctx->library release]; [ctx->queue release]; [ctx->device release]; @@ -579,38 +556,23 @@ void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) { free(ctx); } -void * lm_ggml_metal_host_malloc(size_t n) { - void * data = NULL; - const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); - if (result != 0) { - LM_GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); - return NULL; - } - - return data; -} - -void lm_ggml_metal_host_free(void * data) { - free(data); -} - -void lm_ggml_metal_set_n_cb(struct lm_ggml_metal_context * ctx, int n_cb) { - ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS); -} +// temporarily defined here for compatibility between ggml-backend and the old API -int lm_ggml_metal_if_optimized(struct lm_ggml_metal_context * ctx) { - return ctx->concur_list_len; -} +struct lm_ggml_backend_metal_buffer { + void * data; + size_t size; -int * lm_ggml_metal_get_concur_list(struct lm_ggml_metal_context * ctx) { - return ctx->concur_list; -} + id metal; +}; -// temporarily defined here for compatibility between ggml-backend and the old API struct lm_ggml_backend_metal_buffer_context { - void * data; + void * all_data; + size_t all_size; + bool owned; - id metal; + // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap + int n_buffers; + struct lm_ggml_backend_metal_buffer buffers[LM_GGML_METAL_MAX_BUFFERS]; }; // finds the Metal buffer that contains the tensor data on the GPU device @@ -622,17 +584,29 @@ int lm_ggml_metal_if_optimized(struct lm_ggml_metal_context * ctx) { const int64_t tsize = lm_ggml_nbytes(t); + lm_ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; + // compatibility with ggml-backend - if (t->buffer && t->buffer->buft == lm_ggml_backend_metal_buffer_type()) { - struct lm_ggml_backend_metal_buffer_context * buf_ctx = (struct lm_ggml_backend_metal_buffer_context *) t->buffer->context; + if (buffer && buffer->buft == lm_ggml_backend_metal_buffer_type()) { + struct lm_ggml_backend_metal_buffer_context * buf_ctx = (struct lm_ggml_backend_metal_buffer_context *) buffer->context; + + // find the view that contains the tensor fully + for (int i = 0; i < buf_ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; - const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data; + //LM_GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); + if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { + *offs = (size_t) ioffs; - LM_GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size); + //LM_GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); + + return buf_ctx->buffers[i].metal; + } + } - *offs = (size_t) ioffs; + LM_GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name); - return buf_ctx->metal; + return nil; } // find the view that contains the tensor fully @@ -654,210 +628,7 @@ int lm_ggml_metal_if_optimized(struct lm_ggml_metal_context * ctx) { return nil; } -bool lm_ggml_metal_add_buffer( - struct lm_ggml_metal_context * ctx, - const char * name, - void * data, - size_t size, - size_t max_size) { - if (ctx->n_buffers >= LM_GGML_METAL_MAX_BUFFERS) { - LM_GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__); - return false; - } - - if (data) { - // verify that the buffer does not overlap with any of the existing buffers - for (int i = 0; i < ctx->n_buffers; ++i) { - const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; - - if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - LM_GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); - return false; - } - } - - const size_t size_page = sysconf(_SC_PAGESIZE); - - size_t size_aligned = size; - if ((size_aligned % size_page) != 0) { - size_aligned += (size_page - (size_aligned % size_page)); - } - - // the buffer fits into the max buffer size allowed by the device - if (size_aligned <= ctx->device.maxBufferLength) { - ctx->buffers[ctx->n_buffers].name = name; - ctx->buffers[ctx->n_buffers].data = data; - ctx->buffers[ctx->n_buffers].size = size; - - ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - - if (ctx->buffers[ctx->n_buffers].metal == nil) { - LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0); - return false; - } - - LM_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0); - - ++ctx->n_buffers; - } else { - // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into - // one of the views - const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case - const size_t size_step = ctx->device.maxBufferLength - size_ovlp; - const size_t size_view = ctx->device.maxBufferLength; - - for (size_t i = 0; i < size; i += size_step) { - const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); - - ctx->buffers[ctx->n_buffers].name = name; - ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); - ctx->buffers[ctx->n_buffers].size = size_step_aligned; - - ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; - - if (ctx->buffers[ctx->n_buffers].metal == nil) { - LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); - return false; - } - - LM_GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); - if (i + size_step < size) { - LM_GGML_METAL_LOG_INFO("\n"); - } - - ++ctx->n_buffers; - } - } - -#if TARGET_OS_OSX - LM_GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", - ctx->device.currentAllocatedSize / 1024.0 / 1024.0, - ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); - - if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - LM_GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); - } else { - LM_GGML_METAL_LOG_INFO("\n"); - } -#else - LM_GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); -#endif - } - - return true; -} - -void lm_ggml_metal_set_tensor( - struct lm_ggml_metal_context * ctx, - struct lm_ggml_tensor * t) { - size_t offs; - id id_dst = lm_ggml_metal_get_buffer(ctx, t, &offs); - - memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, lm_ggml_nbytes(t)); -} - -void lm_ggml_metal_get_tensor( - struct lm_ggml_metal_context * ctx, - struct lm_ggml_tensor * t) { - size_t offs; - id id_src = lm_ggml_metal_get_buffer(ctx, t, &offs); - - memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), lm_ggml_nbytes(t)); -} - -void lm_ggml_metal_graph_find_concurrency( - struct lm_ggml_metal_context * ctx, - struct lm_ggml_cgraph * gf, bool check_mem) { - int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time - int nodes_unused[LM_GGML_MAX_CONCUR]; - - for (int i = 0; i < LM_GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; } - for (int i = 0; i < gf->n_nodes; i++) { nodes_unused[i] = 1; } - ctx->concur_list_len = 0; - - int n_left = gf->n_nodes; - int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list - int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos - - while (n_left > 0) { - // number of nodes at a layer (that can be issued concurrently) - int concurrency = 0; - for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) { - if (nodes_unused[i]) { - // if the requirements for gf->nodes[i] are satisfied - int exe_flag = 1; - - // scan all srcs - for (int src_ind = 0; src_ind < LM_GGML_MAX_SRC; src_ind++) { - struct lm_ggml_tensor * src_cur = gf->nodes[i]->src[src_ind]; - if (src_cur) { - // if is leaf nodes it's satisfied. - // TODO: lm_ggml_is_leaf() - if (src_cur->op == LM_GGML_OP_NONE && src_cur->grad == NULL) { - continue; - } - - // otherwise this src should be the output from previous nodes. - int is_found = 0; - - // scan 2*search_depth back because we inserted barrier. - //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) { - for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) { - if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) { - is_found = 1; - break; - } - } - if (is_found == 0) { - exe_flag = 0; - break; - } - } - } - if (exe_flag && check_mem) { - // check if nodes[i]'s data will be overwritten by a node before nodes[i]. - // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3] - int64_t data_start = (int64_t) gf->nodes[i]->data; - int64_t length = (int64_t) lm_ggml_nbytes(gf->nodes[i]); - for (int j = n_start; j < i; j++) { - if (nodes_unused[j] && gf->nodes[j]->op != LM_GGML_OP_RESHAPE \ - && gf->nodes[j]->op != LM_GGML_OP_VIEW \ - && gf->nodes[j]->op != LM_GGML_OP_TRANSPOSE \ - && gf->nodes[j]->op != LM_GGML_OP_PERMUTE) { - if (((int64_t)gf->nodes[j]->data) >= data_start + length || \ - ((int64_t)gf->nodes[j]->data) + (int64_t) lm_ggml_nbytes(gf->nodes[j]) <= data_start) { - continue; - } - - exe_flag = 0; - } - } - } - if (exe_flag) { - ctx->concur_list[level_pos + concurrency] = i; - nodes_unused[i] = 0; - concurrency++; - ctx->concur_list_len++; - } - } - } - n_left -= concurrency; - // adding a barrier different layer - ctx->concur_list[level_pos + concurrency] = -1; - ctx->concur_list_len++; - // jump all sorted nodes at nodes_bak - while (!nodes_unused[n_start]) { - n_start++; - } - level_pos += concurrency + 1; - } - - if (ctx->concur_list_len > LM_GGML_MAX_CONCUR) { - LM_GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__); - } -} - -static bool lm_ggml_metal_supports_op(const struct lm_ggml_tensor * op) { +static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, const struct lm_ggml_tensor * op) { switch (op->op) { case LM_GGML_OP_UNARY: switch (lm_ggml_get_unary_op(op)) { @@ -883,9 +654,11 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_tensor * op) { case LM_GGML_OP_SCALE: case LM_GGML_OP_SQR: case LM_GGML_OP_SUM_ROWS: + return true; case LM_GGML_OP_SOFT_MAX: case LM_GGML_OP_RMS_NORM: case LM_GGML_OP_GROUP_NORM: + return ctx->support_simdgroup_reduction; case LM_GGML_OP_NORM: case LM_GGML_OP_ALIBI: case LM_GGML_OP_ROPE: @@ -894,9 +667,10 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_tensor * op) { case LM_GGML_OP_PAD: case LM_GGML_OP_ARGSORT: case LM_GGML_OP_LEAKY_RELU: + return true; case LM_GGML_OP_MUL_MAT: case LM_GGML_OP_MUL_MAT_ID: - return true; + return ctx->support_simdgroup_reduction; case LM_GGML_OP_CPY: case LM_GGML_OP_DUP: case LM_GGML_OP_CONT: @@ -934,19 +708,16 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_tensor * op) { return false; } } -void lm_ggml_metal_graph_compute( + +static bool lm_ggml_metal_graph_compute( struct lm_ggml_metal_context * ctx, struct lm_ggml_cgraph * gf) { @autoreleasepool { - // if there is ctx->concur_list, dispatch concurrently - // else fallback to serial dispatch MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor; - const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= LM_GGML_MAX_CONCUR; - - const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes; - edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial; + const int n_nodes = gf->n_nodes; + edesc.dispatchType = MTLDispatchTypeSerial; // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel @@ -977,7 +748,7 @@ void lm_ggml_metal_graph_compute( const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes); for (int ind = node_start; ind < node_end; ++ind) { - const int i = has_concur ? ctx->concur_list[ind] : ind; + const int i = ind; if (i == -1) { [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; @@ -1004,11 +775,15 @@ void lm_ggml_metal_graph_compute( } break; } - if (!lm_ggml_metal_supports_op(dst)) { + if (!lm_ggml_metal_supports_op(ctx, dst)) { LM_GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, lm_ggml_op_desc(dst)); LM_GGML_ASSERT(!"unsupported op"); } +#ifndef LM_GGML_METAL_NDEBUG + [encoder pushDebugGroup:[NSString stringWithCString:lm_ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; +#endif + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? src0->ne[2] : 0; @@ -1066,7 +841,9 @@ void lm_ggml_metal_graph_compute( { const int64_t nb = ne00; - [encoder setComputePipelineState:ctx->pipeline_concat]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CONCAT].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1120,18 +897,18 @@ void lm_ggml_metal_graph_compute( nb = ne00 / 4; switch (dst->op) { - case LM_GGML_OP_ADD: pipeline = ctx->pipeline_add_row; break; - case LM_GGML_OP_MUL: pipeline = ctx->pipeline_mul_row; break; - case LM_GGML_OP_DIV: pipeline = ctx->pipeline_div_row; break; + case LM_GGML_OP_ADD: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; + case LM_GGML_OP_MUL: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; + case LM_GGML_OP_DIV: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; default: LM_GGML_ASSERT(false); } bcast_row = true; } else { switch (dst->op) { - case LM_GGML_OP_ADD: pipeline = ctx->pipeline_add; break; - case LM_GGML_OP_MUL: pipeline = ctx->pipeline_mul; break; - case LM_GGML_OP_DIV: pipeline = ctx->pipeline_div; break; + case LM_GGML_OP_ADD: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; + case LM_GGML_OP_MUL: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; + case LM_GGML_OP_DIV: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; default: LM_GGML_ASSERT(false); } } @@ -1198,9 +975,9 @@ void lm_ggml_metal_graph_compute( // not sure how to avoid this // TODO: make a simpler cpy_bytes kernel - const int nth = MIN(1024, ne00); + const id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; - [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -1220,10 +997,14 @@ void lm_ggml_metal_graph_compute( [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } - [encoder setComputePipelineState:ctx->pipeline_add]; + const id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_ADD].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1253,7 +1034,7 @@ void lm_ggml_metal_graph_compute( [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26]; [encoder setBytes:&offs length:sizeof(offs) atIndex:27]; - const int nth = MIN(1024, ne0); + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00); [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -1261,19 +1042,22 @@ void lm_ggml_metal_graph_compute( { LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); - const float scale = *(const float *) src1->data; + const float scale = *(const float *) dst->op_params; int64_t n = lm_ggml_nelements(dst); + id pipeline = nil; + if (n % 4 == 0) { n /= 4; - [encoder setComputePipelineState:ctx->pipeline_scale_4]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline; } else { - [encoder setComputePipelineState:ctx->pipeline_scale]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SCALE].pipeline; } - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; @@ -1282,7 +1066,9 @@ void lm_ggml_metal_graph_compute( switch (lm_ggml_get_unary_op(gf->nodes[i])) { case LM_GGML_UNARY_OP_TANH: { - [encoder setComputePipelineState:ctx->pipeline_tanh]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_TANH].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -1292,7 +1078,9 @@ void lm_ggml_metal_graph_compute( } break; case LM_GGML_UNARY_OP_RELU: { - [encoder setComputePipelineState:ctx->pipeline_relu]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_RELU].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -1302,7 +1090,9 @@ void lm_ggml_metal_graph_compute( } break; case LM_GGML_UNARY_OP_GELU: { - [encoder setComputePipelineState:ctx->pipeline_gelu]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GELU].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -1313,7 +1103,9 @@ void lm_ggml_metal_graph_compute( } break; case LM_GGML_UNARY_OP_GELU_QUICK: { - [encoder setComputePipelineState:ctx->pipeline_gelu_quick]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -1324,7 +1116,9 @@ void lm_ggml_metal_graph_compute( } break; case LM_GGML_UNARY_OP_SILU: { - [encoder setComputePipelineState:ctx->pipeline_silu]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SILU].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -1343,18 +1137,23 @@ void lm_ggml_metal_graph_compute( { LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); - [encoder setComputePipelineState:ctx->pipeline_sqr]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SQR].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; const int64_t n = lm_ggml_nelements(dst); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case LM_GGML_OP_SUM_ROWS: { LM_GGML_ASSERT(src0->nb[0] == lm_ggml_type_size(src0->type)); - [encoder setComputePipelineState:ctx->pipeline_sum_rows]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; @@ -1388,20 +1187,23 @@ void lm_ggml_metal_graph_compute( { int nth = 32; // SIMD width + id pipeline = nil; + if (ne00%4 == 0) { while (nth < ne00/4 && nth < 256) { nth *= 2; } - [encoder setComputePipelineState:ctx->pipeline_soft_max_4]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SOFT_MAX_4].pipeline; } else { while (nth < ne00 && nth < 1024) { nth *= 2; } - [encoder setComputePipelineState:ctx->pipeline_soft_max]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline; } const float scale = ((float *) dst->op_params)[0]; + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; if (id_src1) { [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; @@ -1421,11 +1223,15 @@ void lm_ggml_metal_graph_compute( { const int n_past = ((int32_t *)(dst->op_params))[0]; + id pipeline = nil; + if (ne00%8 == 0) { - [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline; } else { - [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline; } + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; @@ -1485,21 +1291,28 @@ void lm_ggml_metal_graph_compute( ne00 % 32 == 0 && ne00 >= 64 && (ne11 > ne11_mm_min || (lm_ggml_is_quantized(src0t) && ne12 > 1))) { //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + + id pipeline = nil; + switch (src0->type) { - case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; - case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; - case LM_GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break; - case LM_GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break; - case LM_GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_0_f32]; break; - case LM_GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_1_f32]; break; - case LM_GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break; - case LM_GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break; - case LM_GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break; - case LM_GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break; - case LM_GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break; - case LM_GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break; + case LM_GGML_TYPE_F32: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32 ].pipeline; break; + case LM_GGML_TYPE_F16: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32 ].pipeline; break; + case LM_GGML_TYPE_Q4_0: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32 ].pipeline; break; + case LM_GGML_TYPE_Q4_1: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32 ].pipeline; break; + case LM_GGML_TYPE_Q5_0: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32 ].pipeline; break; + case LM_GGML_TYPE_Q5_1: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32 ].pipeline; break; + case LM_GGML_TYPE_Q8_0: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32 ].pipeline; break; + case LM_GGML_TYPE_Q2_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32 ].pipeline; break; + case LM_GGML_TYPE_Q3_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32 ].pipeline; break; + case LM_GGML_TYPE_Q4_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32 ].pipeline; break; + case LM_GGML_TYPE_Q5_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32 ].pipeline; break; + case LM_GGML_TYPE_Q6_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break; + case LM_GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break; + case LM_GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; default: LM_GGML_ASSERT(false && "MUL MAT-MAT not implemented"); } + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1523,12 +1336,14 @@ void lm_ggml_metal_graph_compute( int nrows = 1; //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + id pipeline = nil; + // use custom matrix x vector kernel switch (src0t) { case LM_GGML_TYPE_F32: { LM_GGML_ASSERT(src1t == LM_GGML_TYPE_F32); - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; nrows = 4; } break; case LM_GGML_TYPE_F16: @@ -1537,16 +1352,16 @@ void lm_ggml_metal_graph_compute( nth1 = 1; if (src1t == LM_GGML_TYPE_F32) { if (ne11 * ne12 < 4) { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline; nrows = ne11; } else { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline; nrows = 4; } } else { - [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f16]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline; nrows = 4; } } break; @@ -1554,61 +1369,73 @@ void lm_ggml_metal_graph_compute( { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline; } break; case LM_GGML_TYPE_Q4_1: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline; } break; case LM_GGML_TYPE_Q5_0: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_0_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline; } break; case LM_GGML_TYPE_Q5_1: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_1_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline; } break; case LM_GGML_TYPE_Q8_0: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline; } break; case LM_GGML_TYPE_Q2_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline; } break; case LM_GGML_TYPE_Q3_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline; } break; case LM_GGML_TYPE_Q4_K: { nth0 = 4; //1; nth1 = 8; //32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline; } break; case LM_GGML_TYPE_Q5_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline; } break; case LM_GGML_TYPE_Q6_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline; + } break; + case LM_GGML_TYPE_IQ2_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline; + } break; + case LM_GGML_TYPE_IQ2_XS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline; } break; default: { @@ -1617,6 +1444,11 @@ void lm_ggml_metal_graph_compute( } }; + if (lm_ggml_is_quantized(src0t)) { + LM_GGML_ASSERT(ne00 >= nth0*nth1); + } + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1642,6 +1474,11 @@ void lm_ggml_metal_graph_compute( src0t == LM_GGML_TYPE_Q2_K) { // || src0t == LM_GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } + else if (src0t == LM_GGML_TYPE_IQ2_XXS || src0t == LM_GGML_TYPE_IQ2_XS) { + const int mem_size = src0t == LM_GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else if (src0t == LM_GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1675,6 +1512,9 @@ void lm_ggml_metal_graph_compute( // TODO: make this more general LM_GGML_ASSERT(n_as <= 8); + // max size of the src1ids array in the kernel stack + LM_GGML_ASSERT(ne11 <= 512); + struct lm_ggml_tensor * src2 = gf->nodes[i]->src[2]; const int64_t ne20 = src2 ? src2->ne[0] : 0; @@ -1692,9 +1532,6 @@ void lm_ggml_metal_graph_compute( LM_GGML_ASSERT(!lm_ggml_is_transposed(src2)); LM_GGML_ASSERT(!lm_ggml_is_transposed(src1)); - LM_GGML_ASSERT(ne20 % 32 == 0); - // !!!!!!!!! TODO: this assert is probably required but not sure! - //LM_GGML_ASSERT(ne20 >= 64); LM_GGML_ASSERT(src1t == LM_GGML_TYPE_F32); const uint r2 = ne12/ne22; @@ -1702,37 +1539,44 @@ void lm_ggml_metal_graph_compute( // find the break-even point where the matrix-matrix kernel becomes more efficient compared // to the matrix-vector kernel - int ne11_mm_min = 1; + int ne11_mm_min = n_as; const int idx = ((int32_t *) dst->op_params)[0]; // batch size LM_GGML_ASSERT(ne01 == ne11); - const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory - // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel // !!! // TODO: for now, always use mat-vec kernels until we figure out how to improve the // indirect matrix multiplication // !!! - if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && _ne1 > ne11_mm_min) { + if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && + ne20 % 32 == 0 && ne20 >= 64 && + ne11 > ne11_mm_min) { + + id pipeline = nil; + switch (src2->type) { - case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break; - case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break; - case LM_GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_0_f32]; break; - case LM_GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_1_f32]; break; - case LM_GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_0_f32]; break; - case LM_GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_1_f32]; break; - case LM_GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q8_0_f32]; break; - case LM_GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q2_K_f32]; break; - case LM_GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q3_K_f32]; break; - case LM_GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_K_f32]; break; - case LM_GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_K_f32]; break; - case LM_GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break; + case LM_GGML_TYPE_F32: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; + case LM_GGML_TYPE_F16: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; + case LM_GGML_TYPE_Q4_0: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; + case LM_GGML_TYPE_Q4_1: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32 ].pipeline; break; + case LM_GGML_TYPE_Q5_0: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32 ].pipeline; break; + case LM_GGML_TYPE_Q5_1: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32 ].pipeline; break; + case LM_GGML_TYPE_Q8_0: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32 ].pipeline; break; + case LM_GGML_TYPE_Q2_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32 ].pipeline; break; + case LM_GGML_TYPE_Q3_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32 ].pipeline; break; + case LM_GGML_TYPE_Q4_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32 ].pipeline; break; + case LM_GGML_TYPE_Q5_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32 ].pipeline; break; + case LM_GGML_TYPE_Q6_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break; + case LM_GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; + case LM_GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; default: LM_GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1747,14 +1591,15 @@ void lm_ggml_metal_graph_compute( [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11]; [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; - [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:14]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; [encoder setBytes:&r2 length:sizeof(r2) atIndex:16]; [encoder setBytes:&r3 length:sizeof(r3) atIndex:17]; [encoder setBytes:&idx length:sizeof(idx) atIndex:18]; // TODO: how to make this an array? read Metal docs - for (int j = 0; j < n_as; ++j) { - struct lm_ggml_tensor * src_cur = dst->src[2 + j]; + for (int j = 0; j < 8; ++j) { + // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8 + struct lm_ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; size_t offs_src_cur = 0; id id_src_cur = lm_ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); @@ -1764,95 +1609,115 @@ void lm_ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - // TODO: processing one row at a time (ne11 -> 1) is not efficient - [encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } else { int nth0 = 32; int nth1 = 1; int nrows = 1; //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + id pipeline = nil; + // use custom matrix x vector kernel switch (src2t) { case LM_GGML_TYPE_F32: { LM_GGML_ASSERT(src1t == LM_GGML_TYPE_F32); - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f32_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline; } break; case LM_GGML_TYPE_F16: { LM_GGML_ASSERT(src1t == LM_GGML_TYPE_F32); nth0 = 32; nth1 = 1; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f16_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline; } break; case LM_GGML_TYPE_Q4_0: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_0_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline; } break; case LM_GGML_TYPE_Q4_1: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_1_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline; } break; case LM_GGML_TYPE_Q5_0: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_0_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline; } break; case LM_GGML_TYPE_Q5_1: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_1_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline; } break; case LM_GGML_TYPE_Q8_0: { nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q8_0_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline; } break; case LM_GGML_TYPE_Q2_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q2_K_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline; } break; case LM_GGML_TYPE_Q3_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q3_K_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline; } break; case LM_GGML_TYPE_Q4_K: { nth0 = 4; //1; nth1 = 8; //32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_K_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline; } break; case LM_GGML_TYPE_Q5_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_K_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline; } break; case LM_GGML_TYPE_Q6_K: { nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q6_K_f32]; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline; + } break; + case LM_GGML_TYPE_IQ2_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline; + } break; + case LM_GGML_TYPE_IQ2_XS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline; } break; default: { - LM_GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); + LM_GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); LM_GGML_ASSERT(false && "not implemented"); } }; + if (lm_ggml_is_quantized(src2t)) { + LM_GGML_ASSERT(ne20 >= nth0*nth1); + } + + const int64_t _ne1 = 1; // kernels needs a reference in constant memory + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1877,8 +1742,9 @@ void lm_ggml_metal_graph_compute( [encoder setBytes:&r3 length:sizeof(r3) atIndex:21]; [encoder setBytes:&idx length:sizeof(idx) atIndex:22]; // TODO: how to make this an array? read Metal docs - for (int j = 0; j < n_as; ++j) { - struct lm_ggml_tensor * src_cur = dst->src[2 + j]; + for (int j = 0; j < 8; ++j) { + // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8 + struct lm_ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; size_t offs_src_cur = 0; id id_src_cur = lm_ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur); @@ -1891,6 +1757,11 @@ void lm_ggml_metal_graph_compute( src2t == LM_GGML_TYPE_Q2_K) { // || src2t == LM_GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } + else if (src2t == LM_GGML_TYPE_IQ2_XXS || src2t == LM_GGML_TYPE_IQ2_XS) { + const int mem_size = src2t == LM_GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else if (src2t == LM_GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1914,22 +1785,28 @@ void lm_ggml_metal_graph_compute( } break; case LM_GGML_OP_GET_ROWS: { + id pipeline = nil; + switch (src0->type) { - case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_get_rows_f32]; break; - case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; - case LM_GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; - case LM_GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break; - case LM_GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_0]; break; - case LM_GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_1]; break; - case LM_GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break; - case LM_GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break; - case LM_GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break; - case LM_GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break; - case LM_GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break; - case LM_GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break; + case LM_GGML_TYPE_F32: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_F32 ].pipeline; break; + case LM_GGML_TYPE_F16: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_F16 ].pipeline; break; + case LM_GGML_TYPE_Q4_0: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0 ].pipeline; break; + case LM_GGML_TYPE_Q4_1: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1 ].pipeline; break; + case LM_GGML_TYPE_Q5_0: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0 ].pipeline; break; + case LM_GGML_TYPE_Q5_1: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1 ].pipeline; break; + case LM_GGML_TYPE_Q8_0: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0 ].pipeline; break; + case LM_GGML_TYPE_Q2_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K ].pipeline; break; + case LM_GGML_TYPE_Q3_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K ].pipeline; break; + case LM_GGML_TYPE_Q4_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K ].pipeline; break; + case LM_GGML_TYPE_Q5_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K ].pipeline; break; + case LM_GGML_TYPE_Q6_K: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K ].pipeline; break; + case LM_GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break; + case LM_GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; + case LM_GGML_TYPE_I32: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; default: LM_GGML_ASSERT(false && "not implemented"); } + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1957,7 +1834,9 @@ void lm_ggml_metal_graph_compute( nth *= 2; } - [encoder setComputePipelineState:ctx->pipeline_rms_norm]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -1986,7 +1865,9 @@ void lm_ggml_metal_graph_compute( // nth *= 2; //} - [encoder setComputePipelineState:ctx->pipeline_group_norm]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -2008,7 +1889,9 @@ void lm_ggml_metal_graph_compute( const int nth = MIN(256, ne00); - [encoder setComputePipelineState:ctx->pipeline_norm]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_NORM].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -2035,7 +1918,9 @@ void lm_ggml_metal_graph_compute( const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); - [encoder setComputePipelineState:ctx->pipeline_alibi_f32]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -2080,12 +1965,15 @@ void lm_ggml_metal_graph_compute( memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + id pipeline = nil; + switch (src0->type) { - case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break; - case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_rope_f16]; break; + case LM_GGML_TYPE_F32: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_ROPE_F32].pipeline; break; + case LM_GGML_TYPE_F16: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_ROPE_F16].pipeline; break; default: LM_GGML_ASSERT(false); }; + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -2148,12 +2036,15 @@ void lm_ggml_metal_graph_compute( const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; + id pipeline = nil; + switch (src0->type) { case LM_GGML_TYPE_F32: LM_GGML_ASSERT(false && "not implemented"); break; - case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_im2col_f16]; break; + case LM_GGML_TYPE_F16: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; default: LM_GGML_ASSERT(false); }; + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2]; @@ -2176,7 +2067,9 @@ void lm_ggml_metal_graph_compute( const int sf = dst->op_params[0]; - [encoder setComputePipelineState:ctx->pipeline_upscale_f32]; + const id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; @@ -2197,7 +2090,7 @@ void lm_ggml_metal_graph_compute( [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; [encoder setBytes:&sf length:sizeof(sf) atIndex:18]; - const int nth = MIN(1024, ne0); + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; @@ -2205,7 +2098,9 @@ void lm_ggml_metal_graph_compute( { LM_GGML_ASSERT(src0->type == LM_GGML_TYPE_F32); - [encoder setComputePipelineState:ctx->pipeline_pad_f32]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; @@ -2238,12 +2133,15 @@ void lm_ggml_metal_graph_compute( enum lm_ggml_sort_order order = (enum lm_ggml_sort_order) dst->op_params[0]; + id pipeline = nil; + switch (order) { - case LM_GGML_SORT_ASC: [encoder setComputePipelineState:ctx->pipeline_argsort_f32_i32_asc]; break; - case LM_GGML_SORT_DESC: [encoder setComputePipelineState:ctx->pipeline_argsort_f32_i32_desc]; break; + case LM_GGML_SORT_ASC: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; + case LM_GGML_SORT_DESC: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; default: LM_GGML_ASSERT(false); }; + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -2257,7 +2155,9 @@ void lm_ggml_metal_graph_compute( float slope; memcpy(&slope, dst->op_params, sizeof(float)); - [encoder setComputePipelineState:ctx->pipeline_leaky_relu_f32]; + id pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&slope length:sizeof(slope) atIndex:2]; @@ -2274,33 +2174,36 @@ void lm_ggml_metal_graph_compute( int nth = MIN(1024, ne00/lm_ggml_blck_size(src0->type)); + id pipeline = nil; + switch (src0t) { case LM_GGML_TYPE_F32: { LM_GGML_ASSERT(ne0 % lm_ggml_blck_size(dst->type) == 0); switch (dstt) { - case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break; - case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break; - case LM_GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q8_0]; break; - case LM_GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_0]; break; - case LM_GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_1]; break; - //case LM_GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_0]; break; - //case LM_GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_1]; break; + case LM_GGML_TYPE_F16: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline; break; + case LM_GGML_TYPE_F32: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline; break; + case LM_GGML_TYPE_Q8_0: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break; + case LM_GGML_TYPE_Q4_0: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break; + case LM_GGML_TYPE_Q4_1: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break; + //case LM_GGML_TYPE_Q5_0: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break; + //case LM_GGML_TYPE_Q5_1: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break; default: LM_GGML_ASSERT(false && "not implemented"); }; } break; case LM_GGML_TYPE_F16: { switch (dstt) { - case LM_GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break; - case LM_GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f32]; break; + case LM_GGML_TYPE_F16: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break; + case LM_GGML_TYPE_F32: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break; default: LM_GGML_ASSERT(false && "not implemented"); }; } break; default: LM_GGML_ASSERT(false && "not implemented"); } + [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; @@ -2328,6 +2231,10 @@ void lm_ggml_metal_graph_compute( LM_GGML_ASSERT(false); } } + +#ifndef LM_GGML_METAL_NDEBUG + [encoder popDebugGroup]; +#endif } if (encoder != nil) { @@ -2350,10 +2257,11 @@ void lm_ggml_metal_graph_compute( MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; if (status != MTLCommandBufferStatusCompleted) { LM_GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); - LM_GGML_ASSERT(false); + return false; } } + return true; } } @@ -2361,6 +2269,7 @@ void lm_ggml_metal_graph_compute( // backend interface +// default buffer static id g_backend_device = nil; static int g_backend_device_ref_count = 0; @@ -2385,64 +2294,81 @@ static void lm_ggml_backend_metal_free_device(void) { } } -static void * lm_ggml_backend_metal_buffer_get_base(lm_ggml_backend_buffer_t buffer) { - struct lm_ggml_backend_metal_buffer_context * ctx = (struct lm_ggml_backend_metal_buffer_context *)buffer->context; +static const char * lm_ggml_backend_metal_buffer_get_name(lm_ggml_backend_buffer_t buffer) { + return "Metal"; - return ctx->data; + UNUSED(buffer); } static void lm_ggml_backend_metal_buffer_free_buffer(lm_ggml_backend_buffer_t buffer) { struct lm_ggml_backend_metal_buffer_context * ctx = (struct lm_ggml_backend_metal_buffer_context *)buffer->context; - [ctx->metal release]; + for (int i = 0; i < ctx->n_buffers; i++) { + [ctx->buffers[i].metal release]; + } lm_ggml_backend_metal_free_device(); - free(ctx->data); + if (ctx->owned) { + free(ctx->all_data); + } + free(ctx); +} - UNUSED(buffer); +static void * lm_ggml_backend_metal_buffer_get_base(lm_ggml_backend_buffer_t buffer) { + struct lm_ggml_backend_metal_buffer_context * ctx = (struct lm_ggml_backend_metal_buffer_context *)buffer->context; + + return ctx->all_data; } static void lm_ggml_backend_metal_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor write out of bounds"); - LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - memcpy((char *)tensor->data + offset, data, size); UNUSED(buffer); } static void lm_ggml_backend_metal_buffer_get_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size) { - LM_GGML_ASSERT(offset + size <= lm_ggml_nbytes(tensor) && "tensor read out of bounds"); - LM_GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - memcpy(data, (const char *)tensor->data + offset, size); UNUSED(buffer); } -static void lm_ggml_backend_metal_buffer_cpy_tensor_from(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { - lm_ggml_backend_tensor_get(src, dst->data, 0, lm_ggml_nbytes(src)); +static bool lm_ggml_backend_metal_buffer_cpy_tensor(lm_ggml_backend_buffer_t buffer, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { + if (lm_ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, lm_ggml_nbytes(src)); + return true; + } + return false; UNUSED(buffer); } -static void lm_ggml_backend_metal_buffer_cpy_tensor_to(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst) { - lm_ggml_backend_tensor_set(dst, src->data, 0, lm_ggml_nbytes(src)); +static void lm_ggml_backend_metal_buffer_clear(lm_ggml_backend_buffer_t buffer, uint8_t value) { + struct lm_ggml_backend_metal_buffer_context * ctx = (struct lm_ggml_backend_metal_buffer_context *)buffer->context; - UNUSED(buffer); + memset(ctx->all_data, value, ctx->all_size); } -static struct lm_ggml_backend_buffer_i metal_backend_buffer_i = { +static struct lm_ggml_backend_buffer_i lm_ggml_backend_metal_buffer_i = { + /* .get_name = */ lm_ggml_backend_metal_buffer_get_name, /* .free_buffer = */ lm_ggml_backend_metal_buffer_free_buffer, /* .get_base = */ lm_ggml_backend_metal_buffer_get_base, /* .init_tensor = */ NULL, /* .set_tensor = */ lm_ggml_backend_metal_buffer_set_tensor, /* .get_tensor = */ lm_ggml_backend_metal_buffer_get_tensor, - /* .cpy_tensor_from = */ lm_ggml_backend_metal_buffer_cpy_tensor_from, - /* .cpy_tensor_to = */ lm_ggml_backend_metal_buffer_cpy_tensor_to, + /* .cpy_tensor = */ lm_ggml_backend_metal_buffer_cpy_tensor, + /* .clear = */ lm_ggml_backend_metal_buffer_clear, + /* .reset = */ NULL, }; +// default buffer type + +static const char * lm_ggml_backend_metal_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) { + return "Metal"; + + UNUSED(buft); +} + static lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) { struct lm_ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct lm_ggml_backend_metal_buffer_context)); @@ -2453,13 +2379,46 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_type_alloc_buffer(l size_aligned += (size_page - (size_aligned % size_page)); } - ctx->data = lm_ggml_metal_host_malloc(size); - ctx->metal = [lm_ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data + id device = lm_ggml_backend_metal_get_device(); + + ctx->all_data = lm_ggml_metal_host_malloc(size_aligned); + ctx->all_size = size_aligned; + ctx->owned = true; + ctx->n_buffers = 1; + + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - return lm_ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size); + if (ctx->buffers[0].metal == nil) { + LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); + free(ctx); + lm_ggml_backend_metal_free_device(); + return NULL; + } + + LM_GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); + + +#if TARGET_OS_OSX + LM_GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", + device.currentAllocatedSize / 1024.0 / 1024.0, + device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + + if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { + LM_GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + } else { + LM_GGML_METAL_LOG_INFO("\n"); + } +#else + LM_GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); +#endif + + + return lm_ggml_backend_buffer_init(buft, lm_ggml_backend_metal_buffer_i, ctx, size); } static size_t lm_ggml_backend_metal_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) { @@ -2470,16 +2429,24 @@ static size_t lm_ggml_backend_metal_buffer_type_get_alignment(lm_ggml_backend_bu static bool lm_ggml_backend_metal_buffer_type_supports_backend(lm_ggml_backend_buffer_type_t buft, lm_ggml_backend_t backend) { return lm_ggml_backend_is_metal(backend) || lm_ggml_backend_is_cpu(backend); - LM_GGML_UNUSED(buft); + UNUSED(buft); +} + +static bool lm_ggml_backend_metal_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) { + return true; + + UNUSED(buft); } lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void) { static struct lm_ggml_backend_buffer_type lm_ggml_backend_buffer_type_metal = { /* .iface = */ { + /* .get_name = */ lm_ggml_backend_metal_buffer_type_get_name, /* .alloc_buffer = */ lm_ggml_backend_metal_buffer_type_alloc_buffer, /* .get_alignment = */ lm_ggml_backend_metal_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to lm_ggml_nbytes /* .supports_backend = */ lm_ggml_backend_metal_buffer_type_supports_backend, + /* .is_host = */ lm_ggml_backend_metal_buffer_type_is_host, }, /* .context = */ NULL, }; @@ -2487,6 +2454,95 @@ lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_buffer_type(void) { return &lm_ggml_backend_buffer_type_metal; } +// buffer from ptr + +lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { + struct lm_ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct lm_ggml_backend_metal_buffer_context)); + + ctx->all_data = data; + ctx->all_size = size; + ctx->owned = false; + ctx->n_buffers = 0; + + const size_t size_page = sysconf(_SC_PAGESIZE); + + // page-align the data ptr + { + const uintptr_t offs = (uintptr_t) data % size_page; + data = (void *) ((char *) data - offs); + size += offs; + } + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + id device = lm_ggml_backend_metal_get_device(); + + // the buffer fits into the max buffer size allowed by the device + if (size_aligned <= device.maxBufferLength) { + ctx->buffers[ctx->n_buffers].data = data; + ctx->buffers[ctx->n_buffers].size = size; + + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); + return false; + } + + LM_GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); + + ++ctx->n_buffers; + } else { + // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into + // one of the views + const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case + const size_t size_step = device.maxBufferLength - size_ovlp; + const size_t size_view = device.maxBufferLength; + + for (size_t i = 0; i < size; i += size_step) { + const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); + + ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); + return false; + } + + LM_GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i); + if (i + size_step < size) { + LM_GGML_METAL_LOG_INFO("\n"); + } + + ++ctx->n_buffers; + } + } + +#if TARGET_OS_OSX + LM_GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", + device.currentAllocatedSize / 1024.0 / 1024.0, + device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + + if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { + LM_GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + } else { + LM_GGML_METAL_LOG_INFO("\n"); + } +#else + LM_GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); +#endif + + return lm_ggml_backend_buffer_init(lm_ggml_backend_metal_buffer_type(), lm_ggml_backend_metal_buffer_i, ctx, size); +} + +// backend + static const char * lm_ggml_backend_metal_name(lm_ggml_backend_t backend) { return "Metal"; @@ -2499,55 +2555,45 @@ static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) { free(backend); } -static void lm_ggml_backend_metal_synchronize(lm_ggml_backend_t backend) { - UNUSED(backend); -} - static lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_get_default_buffer_type(lm_ggml_backend_t backend) { return lm_ggml_backend_metal_buffer_type(); UNUSED(backend); } -static void lm_ggml_backend_metal_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) { +static bool lm_ggml_backend_metal_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) { struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context; - lm_ggml_metal_graph_compute(metal_ctx, cgraph); + return lm_ggml_metal_graph_compute(metal_ctx, cgraph); } static bool lm_ggml_backend_metal_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) { - return lm_ggml_metal_supports_op(op); + struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context; - UNUSED(backend); + return lm_ggml_metal_supports_op(metal_ctx, op); } -static struct lm_ggml_backend_i metal_backend_i = { +static struct lm_ggml_backend_i lm_ggml_backend_metal_i = { /* .get_name = */ lm_ggml_backend_metal_name, /* .free = */ lm_ggml_backend_metal_free, /* .get_default_buffer_type = */ lm_ggml_backend_metal_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, - /* .cpy_tensor_from_async = */ NULL, - /* .cpy_tensor_to_async = */ NULL, - /* .synchronize = */ lm_ggml_backend_metal_synchronize, - /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ lm_ggml_backend_metal_graph_compute, /* .supports_op = */ lm_ggml_backend_metal_supports_op, }; -// TODO: make a common log callback for all backends in ggml-backend -static void lm_ggml_backend_log_callback(enum lm_ggml_log_level level, const char * msg, void * user_data) { - fprintf(stderr, "%s", msg); - - UNUSED(level); - UNUSED(user_data); +void lm_ggml_backend_metal_log_set_callback(lm_ggml_log_callback log_callback, void * user_data) { + lm_ggml_metal_log_callback = log_callback; + lm_ggml_metal_log_user_data = user_data; } lm_ggml_backend_t lm_ggml_backend_metal_init(void) { - lm_ggml_metal_log_set_callback(lm_ggml_backend_log_callback, NULL); - struct lm_ggml_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS); if (ctx == NULL) { @@ -2557,7 +2603,7 @@ lm_ggml_backend_t lm_ggml_backend_metal_init(void) { lm_ggml_backend_t metal_backend = malloc(sizeof(struct lm_ggml_backend)); *metal_backend = (struct lm_ggml_backend) { - /* .interface = */ metal_backend_i, + /* .interface = */ lm_ggml_backend_metal_i, /* .context = */ ctx, }; @@ -2565,7 +2611,7 @@ lm_ggml_backend_t lm_ggml_backend_metal_init(void) { } bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend) { - return backend->iface.get_name == lm_ggml_backend_metal_name; + return backend && backend->iface.get_name == lm_ggml_backend_metal_name; } void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb) { @@ -2573,7 +2619,7 @@ void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb) { struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context; - lm_ggml_metal_set_n_cb(ctx, n_cb); + ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS); } bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family) { diff --git a/cpp/ggml-quants.c b/cpp/ggml-quants.c index 260fca09..7af031d1 100644 --- a/cpp/ggml-quants.c +++ b/cpp/ggml-quants.c @@ -5,6 +5,8 @@ #include #include #include +#include // for qsort +#include // for LM_GGML_ASSERT #ifdef __ARM_NEON @@ -272,10 +274,13 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 // vaddvq_s16 // vpaddq_s16 +// vpaddq_s32 // vaddvq_s32 // vaddvq_f32 // vmaxvq_f32 // vcvtnq_s32_f32 +// vzip1_u8 +// vzip2_u8 inline static int32_t vaddvq_s16(int16x8_t v) { return @@ -291,6 +296,12 @@ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { return vcombine_s16(a0, b0); } +inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { + int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); + int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); + return vcombine_s32(a0, b0); +} + inline static int32_t vaddvq_s32(int32x4_t v) { return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); } @@ -316,6 +327,28 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { return res; } +inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[0]; res[1] = b[0]; + res[2] = a[1]; res[3] = b[1]; + res[4] = a[2]; res[5] = b[2]; + res[6] = a[3]; res[7] = b[3]; + + return res; +} + +inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[4]; res[1] = b[4]; + res[2] = a[5]; res[3] = b[5]; + res[4] = a[6]; res[5] = b[6]; + res[6] = a[7]; res[7] = b[7]; + + return res; +} + // vld1q_s16_x2 // vld1q_u8_x2 // vld1q_u8_x4 @@ -407,6 +440,22 @@ inline static lm_ggml_int8x16x4_t lm_ggml_vld1q_s8_x4(const int8_t * ptr) { #define lm_ggml_vld1q_s8_x4 vld1q_s8_x4 #endif + +#if !defined(__ARM_FEATURE_DOTPROD) + +inline static int32x4_t lm_ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { + const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); + const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); + + return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); +} + +#else + +#define lm_ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) + +#endif + #endif #if defined(__ARM_NEON) || defined(__wasm_simd128__) @@ -1195,7 +1244,8 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { +static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type, + const float * restrict qw) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1221,14 +1271,13 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * rmse_type = -rmse_type; return_early = true; } - int weight_type = rmse_type%2; float sumlx = 0; float suml2 = 0; for (int i = 0; i < n; ++i) { int l = nearest_int(iscale * x[i]); l = MAX(-nmax, MIN(nmax-1, l)); L[i] = l + nmax; - float w = weight_type == 1 ? x[i] * x[i] : 1; + float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i])); sumlx += w*x[i]*l; suml2 += w*l*l; } @@ -1244,7 +1293,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * for (int i = 0; i < n; ++i) { int l = nearest_int(iscale * x[i]); l = MAX(-nmax, MIN(nmax-1, l)); - float w = weight_type == 1 ? x[i] * x[i] : 1; + float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i])); sumlx += w*x[i]*l; suml2 += w*l*l; } @@ -1592,6 +1641,241 @@ size_t lm_ggml_quantize_q2_K(const float * restrict src, void * restrict dst, in return (n/QK_K*sizeof(block_q2_K)); } +static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights, + uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, + float rmin, float rdelta, int nstep, bool use_mad) { + float min = x[0]; + float max = x[0]; + float sum_w = weights ? weights[0] : x[0]*x[0]; + float sum_x = sum_w * x[0]; + for (int i = 1; i < n; ++i) { + if (x[i] < min) min = x[i]; + if (x[i] > max) max = x[i]; + float w = weights ? weights[i] : x[i]*x[i]; + sum_w += w; + sum_x += w * x[i]; + } + if (min > 0) { + min = 0; + } + if (max <= min) { + for (int i = 0; i < n; ++i) L[i] = 0; + *the_min = -min; + return 0.f; + } + float iscale = nmax/(max - min); + float scale = 1/iscale; + float best_mad = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + L[i] = MAX(0, MIN(nmax, l)); + float diff = scale * L[i] + min - x[i]; + diff = use_mad ? fabsf(diff) : diff*diff; + float w = weights ? weights[i] : x[i]*x[i]; + best_mad += w * diff; + } + if (nstep < 1) { + *the_min = -min; + return scale; + } + for (int is = 0; is <= nstep; ++is) { + iscale = (rmin + rdelta*is + nmax)/(max - min); + float sum_l = 0, sum_l2 = 0, sum_xl = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + l = MAX(0, MIN(nmax, l)); + Laux[i] = l; + float w = weights ? weights[i] : x[i]*x[i]; + sum_l += w*l; + sum_l2 += w*l*l; + sum_xl += w*l*x[i]; + } + float D = sum_w * sum_l2 - sum_l * sum_l; + if (D > 0) { + float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D; + float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D; + if (this_min > 0) { + this_min = 0; + this_scale = sum_xl / sum_l2; + } + float mad = 0; + for (int i = 0; i < n; ++i) { + float diff = this_scale * Laux[i] + this_min - x[i]; + diff = use_mad ? fabsf(diff) : diff*diff; + float w = weights ? weights[i] : x[i]*x[i]; + mad += w * diff; + } + if (mad < best_mad) { + for (int i = 0; i < n; ++i) { + L[i] = Laux[i]; + } + best_mad = mad; + scale = this_scale; + min = this_min; + } + } + } + *the_min = -min; + return scale; +} + +static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) { + float max = 0; + for (int i = 0; i < n; ++i) { + max = MAX(max, x[i]); + } + if (!max) { // all zero + for (int i = 0; i < n; ++i) { L[i] = 0; } + return 0.f; + } + float iscale = nmax / max; + for (int i = 0; i < n; ++i) { + L[i] = nearest_int(iscale * x[i]); + } + float scale = 1/iscale; + float best_mse = 0; + for (int i = 0; i < n; ++i) { + float diff = x[i] - scale*L[i]; + float w = quant_weights[i]; + best_mse += w*diff*diff; + } + for (int is = -4; is <= 4; ++is) { + if (is == 0) continue; + float iscale_is = (0.1f*is + nmax)/max; + float scale_is = 1/iscale_is; + float mse = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale_is*x[i]); + l = MIN(nmax, l); + float diff = x[i] - scale_is*l; + float w = quant_weights[i]; + mse += w*diff*diff; + } + if (mse < best_mse) { + best_mse = mse; + iscale = iscale_is; + } + } + float sumlx = 0; + float suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = MIN(nmax, l); + L[i] = l; + float w = quant_weights[i]; + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + for (int itry = 0; itry < 5; ++itry) { + int n_changed = 0; + for (int i = 0; i < n; ++i) { + float w = quant_weights[i]; + float slx = sumlx - w*x[i]*L[i]; + float sl2 = suml2 - w*L[i]*L[i]; + if (slx > 0 && sl2 > 0) { + int new_l = nearest_int(x[i] * sl2 / slx); + new_l = MIN(nmax, new_l); + if (new_l != L[i]) { + slx += w*x[i]*new_l; + sl2 += w*new_l*new_l; + if (slx*slx*suml2 > sumlx*sumlx*sl2) { + L[i] = new_l; sumlx = slx; suml2 = sl2; + ++n_changed; + } + } + } + } + if (!n_changed) { + break; + } + } + return sumlx / suml2; +} + +static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) { + LM_GGML_ASSERT(quant_weights); + assert(k % QK_K == 0); + const int nb = k / QK_K; + const bool requantize = true; + + uint8_t L[QK_K]; + uint8_t Laux[16]; + float mins[QK_K/16]; + float scales[QK_K/16]; + float sw[QK_K/16]; + float weight[QK_K/16]; + uint8_t Ls[QK_K/16], Lm[QK_K/16]; + + for (int i = 0; i < nb; i++) { + memset(sw, 0, QK_K/16*sizeof(float)); + float sumx2 = 0; + for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j]; + float sigma2 = sumx2/QK_K; + for (int j = 0; j < QK_K/16; ++j) { + const float * restrict qw = quant_weights + QK_K * i + 16*j; + for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]); + for (int l = 0; l < 16; ++l) sw[j] += weight[l]; + scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); + } + + float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw); + float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw); + y[i].d = LM_GGML_FP32_TO_FP16(dm); + y[i].dmin = LM_GGML_FP32_TO_FP16(mm); + dm = LM_GGML_FP16_TO_FP32(y[i].d); + mm = LM_GGML_FP16_TO_FP32(y[i].dmin); + + for (int j = 0; j < QK_K/16; ++j) { + y[i].scales[j] = Ls[j] | (Lm[j] << 4); + } + + if (requantize) { + for (int j = 0; j < QK_K/16; ++j) { + const float d = dm * (y[i].scales[j] & 0xF); + if (!d) continue; + const float m = mm * (y[i].scales[j] >> 4); + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int((x[16*j + ii] + m)/d); + l = MAX(0, MIN(3, l)); + L[16*j + ii] = l; + } + } + } + +#if QK_K == 256 + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); + } + } +#else + for (int l = 0; l < 16; ++l) { + y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); + } +#endif + + x += QK_K; + + } +} + +size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + int row_size = lm_ggml_row_size(LM_GGML_TYPE_Q2_K, n_per_row); + if (!quant_weights) { + quantize_row_q2_K_reference(src, dst, nrow*n_per_row); + } + else { + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + } + return nrow * row_size; +} + //========================= 3-bit (de)-quantization void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { @@ -1805,6 +2089,112 @@ size_t lm_ggml_quantize_q3_K(const float * restrict src, void * restrict dst, in return (n/QK_K*sizeof(block_q3_K)); } +static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) { +#if QK_K != 256 + (void)quant_weights; + quantize_row_q3_K_reference(x, y, n_per_row); +#else + assert(n_per_row % QK_K == 0); + const int nb = n_per_row / QK_K; + + int8_t L[QK_K]; + float scales[QK_K / 16]; + float weight[16]; + float sw[QK_K / 16]; + int8_t Ls[QK_K / 16]; + + for (int i = 0; i < nb; i++) { + + float sumx2 = 0; + for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j]; + float sigma2 = 2*sumx2/QK_K; + + for (int j = 0; j < QK_K/16; ++j) { + if (quant_weights) { + const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL; + for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]); + } else { + for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l]; + } + float sumw = 0; + for (int l = 0; l < 16; ++l) sumw += weight[l]; + sw[j] = sumw; + + scales[j] = make_qx_quants(16, 4, x + 16*j, L + 16*j, 1, weight); + + } + + memset(y[i].scales, 0, 12); + + float d_block = make_qx_quants(QK_K/16, 32, scales, Ls, 1, sw); + for (int j = 0; j < QK_K/16; ++j) { + int l = Ls[j]; + if (j < 8) { + y[i].scales[j] = l & 0xF; + } else { + y[i].scales[j-8] |= ((l & 0xF) << 4); + } + l >>= 4; + y[i].scales[j%4 + 8] |= (l << (2*(j/4))); + } + y[i].d = LM_GGML_FP32_TO_FP16(d_block); + + int8_t sc; + for (int j = 0; j < QK_K/16; ++j) { + sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; + sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; + float d = LM_GGML_FP16_TO_FP32(y[i].d) * sc; + if (!d) { + continue; + } + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-4, MIN(3, l)); + L[16*j + ii] = l + 4; + } + } + + memset(y[i].hmask, 0, QK_K/8); + // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. + int m = 0; + uint8_t hm = 1; + for (int j = 0; j < QK_K; ++j) { + if (L[j] > 3) { + y[i].hmask[m] |= hm; + L[j] -= 4; + } + if (++m == QK_K/8) { + m = 0; hm <<= 1; + } + } + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); + } + } + + x += QK_K; + } +#endif +} + +size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + int row_size = lm_ggml_row_size(LM_GGML_TYPE_Q3_K, n_per_row); + if (!quant_weights) { + quantize_row_q3_K_reference(src, dst, nrow*n_per_row); + } + else { + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + } + return nrow * row_size; +} + // ====================== 4-bit (de)-quantization void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { @@ -1970,36 +2360,38 @@ size_t lm_ggml_quantize_q4_K(const float * restrict src, void * restrict dst, in return (n/QK_K*sizeof(block_q4_K)); } -// ====================== 5-bit (de)-quantization - -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; +static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) { +#if QK_K != 256 + (void)quant_weights; + quantize_row_q4_K_reference(x, y, n_per_row); +#else + assert(n_per_row % QK_K == 0); + const int nb = n_per_row / QK_K; -#if QK_K == 256 uint8_t L[QK_K]; + uint8_t Laux[32]; + float weights[32]; float mins[QK_K/32]; float scales[QK_K/32]; - float weights[32]; - uint8_t Laux[32]; -#else - int8_t L[QK_K]; - float scales[QK_K/16]; -#endif for (int i = 0; i < nb; i++) { -#if QK_K == 256 + float sum_x2 = 0; + for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l]; + float sigma2 = sum_x2/QK_K; + float av_x = sqrtf(sigma2); float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; for (int j = 0; j < QK_K/32; ++j) { - //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); - float sum_x2 = 0; - for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; - float av_x = sqrtf(sum_x2/32); - for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); - scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); + if (quant_weights) { + const float * qw = quant_weights + QK_K*i + 32*j; + for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]); + } else { + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + } + scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); + //scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false); float scale = scales[j]; if (scale > max_scale) { max_scale = scale; @@ -2037,13 +2429,113 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict const float dm = LM_GGML_FP16_TO_FP32(y[i].dmin) * m; for (int ii = 0; ii < 32; ++ii) { int l = nearest_int((x[32*j + ii] + dm)/d); - l = MAX(0, MIN(31, l)); + l = MAX(0, MIN(15, l)); L[32*j + ii] = l; } } - - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * q = y[i].qs; + for (int j = 0; j < QK_K; j += 64) { + for (int l = 0; l < 32; ++l) q[l] = L[j + l] | (L[j + l + 32] << 4); + q += 32; + } + + x += QK_K; + + } +#endif +} + +size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + int row_size = lm_ggml_row_size(LM_GGML_TYPE_Q4_K, n_per_row); + if (!quant_weights) { + quantize_row_q4_K_reference(src, dst, nrow*n_per_row); + } + else { + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + } + return nrow * row_size; +} + +// ====================== 5-bit (de)-quantization + +void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + +#if QK_K == 256 + uint8_t L[QK_K]; + float mins[QK_K/32]; + float scales[QK_K/32]; + float weights[32]; + uint8_t Laux[32]; +#else + int8_t L[QK_K]; + float scales[QK_K/16]; +#endif + + for (int i = 0; i < nb; i++) { + +#if QK_K == 256 + + float max_scale = 0; // as we are deducting the min, scales are always positive + float max_min = 0; + for (int j = 0; j < QK_K/32; ++j) { + //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + float av_x = sqrtf(sum_x2/32); + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); + float scale = scales[j]; + if (scale > max_scale) { + max_scale = scale; + } + float min = mins[j]; + if (min > max_min) { + max_min = min; + } + } + + float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; + float inv_min = max_min > 0 ? 63.f/max_min : 0.f; + for (int j = 0; j < QK_K/32; ++j) { + uint8_t ls = nearest_int(inv_scale*scales[j]); + uint8_t lm = nearest_int(inv_min*mins[j]); + ls = MIN(63, ls); + lm = MIN(63, lm); + if (j < 4) { + y[i].scales[j] = ls; + y[i].scales[j+4] = lm; + } else { + y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); + y[i].scales[j-4] |= ((ls >> 4) << 6); + y[i].scales[j-0] |= ((lm >> 4) << 6); + } + } + y[i].d = LM_GGML_FP32_TO_FP16(max_scale/63.f); + y[i].dmin = LM_GGML_FP32_TO_FP16(max_min/63.f); + + uint8_t sc, m; + for (int j = 0; j < QK_K/32; ++j) { + get_scale_min_k4(j, y[i].scales, &sc, &m); + const float d = LM_GGML_FP16_TO_FP32(y[i].d) * sc; + if (!d) continue; + const float dm = LM_GGML_FP16_TO_FP32(y[i].dmin) * m; + for (int ii = 0; ii < 32; ++ii) { + int l = nearest_int((x[32*j + ii] + dm)/d); + l = MAX(0, MIN(31, l)); + L[32*j + ii] = l; + } + } + + uint8_t * restrict qh = y[i].qh; + uint8_t * restrict ql = y[i].qs; memset(qh, 0, QK_K/8); uint8_t m1 = 1, m2 = 2; @@ -2065,7 +2557,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict #else float max_scale = 0, amax = 0; for (int j = 0; j < QK_K/16; ++j) { - scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1); + scales[j] = make_qx_quants(16, 16, x + 16*j, L + 16*j, 1, NULL); float abs_scale = fabsf(scales[j]); if (abs_scale > amax) { amax = abs_scale; @@ -2176,6 +2668,123 @@ size_t lm_ggml_quantize_q5_K(const float * restrict src, void * restrict dst, in return (n/QK_K*sizeof(block_q5_K)); } +static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) { +#if QK_K != 256 + (void)quant_weights; + quantize_row_q5_K_reference(x, y, n_per_row); +#else + assert(n_per_row % QK_K == 0); + const int nb = n_per_row / QK_K; + + uint8_t L[QK_K]; + float mins[QK_K/32]; + float scales[QK_K/32]; + float weights[32]; + uint8_t Laux[32]; + + for (int i = 0; i < nb; i++) { + + float sum_x2 = 0; + for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l]; + float sigma2 = sum_x2/QK_K; + float av_x = sqrtf(sigma2); + + float max_scale = 0; // as we are deducting the min, scales are always positive + float max_min = 0; + for (int j = 0; j < QK_K/32; ++j) { + if (quant_weights) { + const float * qw = quant_weights + QK_K*i + 32*j; + for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]); + } else { + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + } + scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); + float scale = scales[j]; + if (scale > max_scale) { + max_scale = scale; + } + float min = mins[j]; + if (min > max_min) { + max_min = min; + } + } + + float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; + float inv_min = max_min > 0 ? 63.f/max_min : 0.f; + for (int j = 0; j < QK_K/32; ++j) { + uint8_t ls = nearest_int(inv_scale*scales[j]); + uint8_t lm = nearest_int(inv_min*mins[j]); + ls = MIN(63, ls); + lm = MIN(63, lm); + if (j < 4) { + y[i].scales[j] = ls; + y[i].scales[j+4] = lm; + } else { + y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); + y[i].scales[j-4] |= ((ls >> 4) << 6); + y[i].scales[j-0] |= ((lm >> 4) << 6); + } + } + y[i].d = LM_GGML_FP32_TO_FP16(max_scale/63.f); + y[i].dmin = LM_GGML_FP32_TO_FP16(max_min/63.f); + + uint8_t sc, m; + for (int j = 0; j < QK_K/32; ++j) { + get_scale_min_k4(j, y[i].scales, &sc, &m); + const float d = LM_GGML_FP16_TO_FP32(y[i].d) * sc; + if (!d) continue; + const float dm = LM_GGML_FP16_TO_FP32(y[i].dmin) * m; + for (int ii = 0; ii < 32; ++ii) { + int l = nearest_int((x[32*j + ii] + dm)/d); + l = MAX(0, MIN(31, l)); + L[32*j + ii] = l; + } + } + + uint8_t * restrict qh = y[i].qh; + uint8_t * restrict ql = y[i].qs; + memset(qh, 0, QK_K/8); + + uint8_t m1 = 1, m2 = 2; + for (int n = 0; n < QK_K; n += 64) { + for (int j = 0; j < 32; ++j) { + int l1 = L[n + j]; + if (l1 > 15) { + l1 -= 16; qh[j] |= m1; + } + int l2 = L[n + j + 32]; + if (l2 > 15) { + l2 -= 16; qh[j] |= m2; + } + ql[j] = l1 | (l2 << 4); + } + m1 <<= 2; m2 <<= 2; + ql += 32; + } + + x += QK_K; + + } +#endif +} + +size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + int row_size = lm_ggml_row_size(LM_GGML_TYPE_Q5_K, n_per_row); + if (!quant_weights) { + quantize_row_q5_K_reference(src, dst, nrow*n_per_row); + } + else { + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + } + return nrow * row_size; +} + // ====================== 6-bit (de)-quantization void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { @@ -2192,7 +2801,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict for (int ib = 0; ib < QK_K/16; ++ib) { - const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1); + const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL); scales[ib] = scale; const float abs_scale = fabsf(scale); @@ -2324,6 +2933,378 @@ size_t lm_ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_ return (n/QK_K*sizeof(block_q6_K)); } +static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) { +#if QK_K != 256 + (void)quant_weights; + quantize_row_q6_K_reference(x, y, n_per_row); +#else + assert(n_per_row % QK_K == 0); + const int nb = n_per_row / QK_K; + + int8_t L[QK_K]; + float scales[QK_K/16]; + //float weights[16]; + + for (int i = 0; i < nb; i++) { + + //float sum_x2 = 0; + //for (int j = 0; j < QK_K; ++j) sum_x2 += x[j]*x[j]; + //float sigma2 = sum_x2/QK_K; + + float max_scale = 0; + float max_abs_scale = 0; + + for (int ib = 0; ib < QK_K/16; ++ib) { + + float scale; + if (quant_weights) { + const float * qw = quant_weights + QK_K*i + 16*ib; + //for (int j = 0; j < 16; ++j) weights[j] = qw[j] * sqrtf(sigma2 + x[16*ib + j]*x[16*ib + j]); + //scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, weights); + scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, qw); + } else { + scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL); + } + scales[ib] = scale; + + const float abs_scale = fabsf(scale); + if (abs_scale > max_abs_scale) { + max_abs_scale = abs_scale; + max_scale = scale; + } + + } + + if (!max_abs_scale) { + memset(&y[i], 0, sizeof(block_q6_K)); + y[i].d = LM_GGML_FP32_TO_FP16(0.f); + x += QK_K; + continue; + } + + float iscale = -128.f/max_scale; + y[i].d = LM_GGML_FP32_TO_FP16(1/iscale); + for (int ib = 0; ib < QK_K/16; ++ib) { + y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); + } + + for (int j = 0; j < QK_K/16; ++j) { + float d = LM_GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; + if (!d) { + continue; + } + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-32, MIN(31, l)); + L[16*j + ii] = l + 32; + } + } + + uint8_t * restrict ql = y[i].ql; + uint8_t * restrict qh = y[i].qh; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + const uint8_t q1 = L[j + l + 0] & 0xF; + const uint8_t q2 = L[j + l + 32] & 0xF; + const uint8_t q3 = L[j + l + 64] & 0xF; + const uint8_t q4 = L[j + l + 96] & 0xF; + ql[l+ 0] = q1 | (q3 << 4); + ql[l+32] = q2 | (q4 << 4); + qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6); + } + ql += 64; + qh += 32; + } + + x += QK_K; + + } +#endif +} + +size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + int row_size = lm_ggml_row_size(LM_GGML_TYPE_Q6_K, n_per_row); + if (!quant_weights) { + quantize_row_q6_K_reference(src, dst, nrow*n_per_row); + } + else { + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += row_size; + } + } + return nrow * row_size; +} + +// ====================== "True" 2-bit (de)-quantization + +static const uint64_t iq2xxs_grid[256] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, + 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, + 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819, + 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b, + 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808, + 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08, + 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b, + 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819, + 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08, + 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, + 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08, + 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808, + 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808, + 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919, + 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08, + 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908, + 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819, + 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808, + 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808, + 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908, + 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808, + 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08, + 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819, + 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819, + 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908, + 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19, + 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819, + 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b, + 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808, + 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908, + 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08, + 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08, + 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908, + 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819, + 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808, + 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808, + 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19, + 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819, + 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, + 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b, + 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08, + 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808, + 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908, + 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b, + 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819, + 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08, + 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08, + 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808, + 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b, + 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b, + 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908, + 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819, + 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808, + 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908, + 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b, + 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808, + 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b, + 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b, + 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808, + 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19, + 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, +}; + +static const uint64_t iq2xs_grid[512] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, + 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, + 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, + 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, + 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, + 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, + 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, + 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, + 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, + 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, + 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, + 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, + 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, + 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, + 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, + 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, + 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, + 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, + 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, + 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, + 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, + 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, + 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, + 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, + 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, + 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, + 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, + 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, + 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, + 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, + 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, + 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, + 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, + 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, + 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, + 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, + 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, + 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, + 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, + 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, + 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, + 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, + 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, + 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, + 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, + 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, + 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, + 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, + 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, + 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, + 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, + 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, + 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, + 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, + 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, + 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, + 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, + 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, + 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, + 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, + 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, + 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, + 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, + 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, + 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, + 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, + 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, + 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, + 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, + 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, + 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, + 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, + 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, + 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, + 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, + 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, + 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, + 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, + 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, + 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, + 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, + 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, + 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, + 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, + 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, + 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, + 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, + 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, + 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, + 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, + 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, + 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, + 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, + 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, + 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, + 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, + 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, + 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, + 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, + 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, + 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, + 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, + 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, + 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, + 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, + 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, + 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, + 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, + 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, + 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, + 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, + 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, + 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, + 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, + 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, + 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, + 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, + 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, + 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, + 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, + 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, + 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, + 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, + 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, + 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, +}; + +static const uint8_t ksigns_iq2xs[128] = { + 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, + 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, + 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175, + 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, + 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207, + 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95, + 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111, + 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, +}; + +static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128}; + +void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + for (int i = 0; i < nb; i++) { + + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, x[i].qs + 4*ib32, 2*sizeof(uint32_t)); + const float db = d * (0.5f + (aux32[1] >> 28)) * 0.25f; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int j = 0; j < 8; ++j) { + y[j] = db * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + } + y += 8; + } + } + } +} + +// ====================== 2.3125 bpw (de)-quantization + +void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + float db[2]; + + for (int i = 0; i < nb; i++) { + + const float d = LM_GGML_FP16_TO_FP32(x[i].d); + + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f; + db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (x[i].qs[4*ib32 + l] & 511)); + const uint8_t signs = ksigns_iq2xs[x[i].qs[4*ib32 + l] >> 9]; + for (int j = 0; j < 8; ++j) { + y[j] = db[l/2] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + } + y += 8; + } + } + } +} + //===================================== Q8_K ============================================== void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { @@ -2346,7 +3327,9 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict x += QK_K; continue; } - const float iscale = -128.f/max; + //const float iscale = -128.f/max; + // We need this change for IQ2_XXS, else the AVX implementation becomes very awkward + const float iscale = -127.f/max; for (int j = 0; j < QK_K; ++j) { int v = nearest_int(iscale*x[j]); y[i].qs[j] = MIN(127, v); @@ -2468,32 +3451,12 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); -#if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); + const int32x4_t p_0 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); + const int32x4_t p_1 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); -#endif } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -2776,32 +3739,12 @@ void lm_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * res const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); -#if defined(__ARM_FEATURE_DOTPROD) // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); + const int32x4_t p_0 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); + const int32x4_t p_1 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), LM_GGML_FP16_TO_FP32(x0->d)*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), LM_GGML_FP16_TO_FP32(x1->d)*y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), LM_GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), LM_GGML_FP16_TO_FP32(x1->d)*y1->d); -#endif } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; @@ -2963,32 +3906,12 @@ void lm_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * res const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); -#if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); + lm_ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + lm_ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); -#endif + lm_ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + lm_ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -3275,32 +4198,12 @@ void lm_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * res const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); -#if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), LM_GGML_FP16_TO_FP32(x0->d)*y0->d); + lm_ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + lm_ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), LM_GGML_FP16_TO_FP32(x0->d)*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), LM_GGML_FP16_TO_FP32(x1->d)*y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), LM_GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), LM_GGML_FP16_TO_FP32(x1->d)*y1->d); -#endif + lm_ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + lm_ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), LM_GGML_FP16_TO_FP32(x1->d)*y1->d); } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; @@ -3550,34 +4453,13 @@ void lm_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * res const int8x16_t y1_0 = vld1q_s8(y1->qs); const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); -#if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); + lm_ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + lm_ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( - vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); - -#else - const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); - const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0)); - const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1)); - const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1)); - - const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0)); - const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0)); - const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1)); - const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1)); - - const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1)); - const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3)); - const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); - const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), LM_GGML_FP16_TO_FP32(x0->d)*LM_GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); -#endif + lm_ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + lm_ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), LM_GGML_FP16_TO_FP32(x1->d)*LM_GGML_FP16_TO_FP32(y1->d)); } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -3650,12 +4532,10 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res const int nb = n / QK_K; #ifdef __ARM_NEON - const uint8x16_t m3 = vdupq_n_u8(0x3); const uint8x16_t m4 = vdupq_n_u8(0xF); -#if defined(__ARM_FEATURE_DOTPROD) - const int32x4_t vzero = vdupq_n_s32(0); -#endif + + const int32x4_t vzero = vdupq_n_s32(0); lm_ggml_int8x16x2_t q2bytes; uint8_t aux[16]; @@ -3663,7 +4543,6 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res float sum = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * LM_GGML_FP16_TO_FP32(x[i].dmin); @@ -3677,7 +4556,7 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4); const lm_ggml_int16x8x2_t q8sums = lm_ggml_vld1q_s16_x2(y[i].bsums); - const lm_ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}; + const lm_ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}}; const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])), vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0]))); const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])), @@ -3689,20 +4568,9 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res // We use this macro instead of a function call because for some reason // the code runs 2-3% slower, even if the function is declared inline -#if defined(__ARM_FEATURE_DOTPROD) -#define MULTIPLY_ACCUM_WITH_SCALE(index)\ - isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\ - isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)]; -#else #define MULTIPLY_ACCUM_WITH_SCALE(index)\ - {\ - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])),\ - vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));\ - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])),\ - vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));\ - isum += vaddvq_s16(p1) * aux[is+(index)] + vaddvq_s16(p2) * aux[is+1+(index)];\ - } -#endif + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\ + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)]; #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\ q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32;\ @@ -3710,26 +4578,23 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\ MULTIPLY_ACCUM_WITH_SCALE((index)); - for (int j = 0; j < QK_K/128; ++j) { - const lm_ggml_uint8x16x2_t q2bits = lm_ggml_vld1q_u8_x2(q2); q2 += 32; lm_ggml_int8x16x2_t q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3)); q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3)); + MULTIPLY_ACCUM_WITH_SCALE(0); SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2); - SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4); - SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6); is += 8; } - sum += d * isum; + sum += d * isum; } *s = sum; @@ -4043,11 +4908,9 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res const int nb = n / QK_K; #ifdef __ARM_NEON - const uint8x16_t m3 = vdupq_n_u8(0x3); -#if defined(__ARM_FEATURE_DOTPROD) - const int32x4_t vzero = vdupq_n_s32(0); -#endif + + const int32x4_t vzero = vdupq_n_s32(0); lm_ggml_int8x16x4_t q2bytes; @@ -4081,28 +4944,12 @@ void lm_ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * res q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3)); q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3)); -#if defined(__ARM_FEATURE_DOTPROD) - isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0]; - isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1]; - isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2]; - isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3]; -#else - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - isum1 += vaddvq_s16(p1) * scales[0]; - isum2 += vaddvq_s16(p2) * scales[1]; - - const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q2bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - const int16x8_t p4 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q2bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - isum1 += vaddvq_s16(p3) * scales[2]; - isum2 += vaddvq_s16(p4) * scales[3]; -#endif - sum += d * (isum1 + isum2); + isum1 += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0]; + isum2 += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1]; + isum1 += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2]; + isum2 += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3]; + sum += d * (isum1 + isum2); } *s = sum; @@ -4328,9 +5175,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res uint32_t utmp[4]; const uint8x16_t m3b = vdupq_n_u8(0x3); -#ifdef __ARM_FEATURE_DOTPROD const int32x4_t vzero = vdupq_n_s32(0); -#endif const uint8x16_t m0 = vdupq_n_u8(1); const uint8x16_t m1 = vshlq_n_u8(m0, 1); @@ -4382,22 +5227,11 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2])); q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; -#else - int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_1.val[0])), - vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_1.val[0]))); - int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_1.val[1])), - vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_1.val[1]))); - int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_1.val[2])), - vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_1.val[2]))); - int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_1.val[3])), - vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_1.val[3]))); - isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3]; -#endif + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; + scale += 4; q3h.val[0] = vbicq_u8(m2, qhbits.val[0]); @@ -4410,22 +5244,11 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2])); q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; -#else - p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_2.val[0])), - vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_2.val[0]))); - p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_2.val[1])), - vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_2.val[1]))); - p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_2.val[2])), - vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_2.val[2]))); - p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_2.val[3])), - vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_2.val[3]))); - isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3]; -#endif + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; + scale += 4; if (j == 0) { @@ -4864,10 +5687,7 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res const int nb = n / QK_K; #ifdef __ARM_NEON - -#ifdef __ARM_FEATURE_DOTPROD - const int32x4_t vzero = vdupq_n_s32(0); -#endif + const int32x4_t vzero = vdupq_n_s32(0); const uint8x16_t m3b = vdupq_n_u8(0x3); const uint8x16_t mh = vdupq_n_u8(4); @@ -4908,22 +5728,10 @@ void lm_ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * res q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2])); q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1]; - isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3]; -#else - const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - isum += vaddvq_s16(p0) * scales[0] + vaddvq_s16(p1) * scales[2] + vaddvq_s16(p2) * scales[1] + vaddvq_s16(p3) * scales[3]; -#endif + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3]; sum += d * isum; @@ -5228,11 +6036,8 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res uint32_t utmp[4]; #ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); -#ifdef __ARM_FEATURE_DOTPROD const int32x4_t mzero = vdupq_n_s32(0); -#endif lm_ggml_int8x16x2_t q4bytes; lm_ggml_int8x16x2_t q8bytes; @@ -5269,44 +6074,22 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res int32_t sumi2 = 0; for (int j = 0; j < QK_K/64; ++j) { - const lm_ggml_uint8x16x2_t q4bits = lm_ggml_vld1q_u8_x2(q4); q4 += 32; -#ifdef __ARM_FEATURE_DOTPROD q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); - const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + const int32x4_t p1 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); sumi1 += vaddvq_s32(p1) * scales[2*j+0]; q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); - const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + const int32x4_t p2 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); sumi2 += vaddvq_s32(p2) * scales[2*j+1]; -#else - q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; - q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); - q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); - const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0]; - - q8bytes = lm_ggml_vld1q_s8_x2(q8); q8 += 32; - q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); - q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - sumi2 += vaddvq_s16(vaddq_s16(p2, p3)) * scales[2*j+1]; - -#endif } sumf += d * (sumi1 + sumi2); @@ -5603,12 +6386,9 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res const int nb = n / QK_K; #ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); -#ifdef __ARM_FEATURE_DOTPROD const int32x4_t mzero = vdupq_n_s32(0); -#endif float sumf = 0; @@ -5636,41 +6416,20 @@ void lm_ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * res const lm_ggml_uint8x16x2_t q4bits = lm_ggml_vld1q_u8_x2(q4); -#ifdef __ARM_FEATURE_DOTPROD - q8bytes = lm_ggml_vld1q_s8_x4(q8); - q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); - q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); - - const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); - const int32_t sumi1 = vaddvq_s32(p1) * scales[0]; - - q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); - q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); - - const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]); - const int32_t sumi2 = vaddvq_s32(p2) * scales[1]; - -#else q8bytes = lm_ggml_vld1q_s8_x4(q8); q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b)); q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b)); - const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - int32_t sumi1 = vaddvq_s16(vaddq_s16(p0, p1)) * scales[0]; + + const int32x4_t p1 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]); + const int32_t sumi1 = vaddvq_s32(p1) * scales[0]; q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4)); q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4)); - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[2]))); - const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[3]))); - int32_t sumi2 = vaddvq_s16(vaddq_s16(p2, p3)) * scales[1]; -#endif - sumf += d * (sumi1 + sumi2); + const int32x4_t p2 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]); + const int32_t sumi2 = vaddvq_s32(p2) * scales[1]; + sumf += d * (sumi1 + sumi2); } *s = sumf - sum_mins; @@ -5875,15 +6634,11 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res uint32_t utmp[4]; - #ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); const uint8x16_t mone = vdupq_n_u8(1); const uint8x16_t mtwo = vdupq_n_u8(2); -#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t mzero = vdupq_n_s32(0); -#endif lm_ggml_int8x16x4_t q5bytes; @@ -5938,28 +6693,11 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2])); q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) - - sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++; - sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++; -#else - - const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - sumi += vaddvq_s16(vaddq_s16(p0, p1)) * *scales++; - - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - sumi += vaddvq_s16(vaddq_s16(p2, p3)) * *scales++; -#endif + sumi += vaddvq_s32(lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++; + sumi += vaddvq_s32(lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++; } sumf += d * sumi - dmin * sumi_mins; - } *s = sumf; @@ -6311,12 +7049,9 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res const int nb = n / QK_K; #ifdef __ARM_NEON - const uint8x16_t m4b = vdupq_n_u8(0xf); const uint8x16_t mh = vdupq_n_u8(16); -#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t mzero = vdupq_n_s32(0); -#endif lm_ggml_int8x16x4_t q5bytes; lm_ggml_uint8x16x4_t q5h; @@ -6348,32 +7083,12 @@ void lm_ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * res q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2])); q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) - - int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0])); - int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1])); - int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2])); - int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3])); + int32_t sumi1 = sc[0] * vaddvq_s32(lm_ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0])); + int32_t sumi2 = sc[1] * vaddvq_s32(lm_ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1])); + int32_t sumi3 = sc[2] * vaddvq_s32(lm_ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2])); + int32_t sumi4 = sc[3] * vaddvq_s32(lm_ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3])); sumf += d * (sumi1 + sumi2 + sumi3 + sumi4); - -#else - - const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - int32_t sumi = sc[0] * vaddvq_s16(p0) + sc[1] * vaddvq_s16(p1); - - const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - sumi += sc[2] * vaddvq_s16(p2) + sc[3] * vaddvq_s16(p3); - - sumf += d*sumi; -#endif - } *s = sumf; @@ -6600,13 +7315,10 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res const int nb = n / QK_K; #ifdef __ARM_NEON - float sum = 0; const uint8x16_t m4b = vdupq_n_u8(0xF); -#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t vzero = vdupq_n_s32(0); -#endif //const int8x16_t m32s = vdupq_n_s8(32); const uint8x16_t mone = vdupq_n_u8(3); @@ -6626,7 +7338,7 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res const lm_ggml_int16x8x2_t q8sums = lm_ggml_vld1q_s16_x2(y[i].bsums); const int8x16_t scales = vld1q_s8(scale); - const lm_ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}; + const lm_ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}}; const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])), vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))), @@ -6658,31 +7370,13 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])); q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + + vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; - isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; scale += 4; -#else - - int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1]; - scale += 2; - - int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1]; - scale += 2; -#endif - q8bytes = lm_ggml_vld1q_s8_x4(q8); q8 += 64; shifted = vshrq_n_u8(qhbits.val[0], 4); @@ -6703,34 +7397,11 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])); q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])); -#if defined(__ARM_FEATURE_DOTPROD) - - isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + + vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; scale += 4; - - //for (int l = 0; l < 4; ++l) { - // const int32x4_t p = vdotq_s32(vzero, q6bytes.val[l], q8bytes.val[l]); - // isum += vaddvq_s32(p) * *scale++; - //} -#else - p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1]; - scale += 2; - - p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1]; - scale += 2; -#endif - } //sum += isum * d_all * y[i].d; sum += d_all * y[i].d * (isum - 32 * isum_mins); @@ -7076,14 +7747,11 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res const int nb = n / QK_K; #ifdef __ARM_NEON - float sum = 0; const uint8x16_t m4b = vdupq_n_u8(0xF); const int8x16_t m32s = vdupq_n_s8(32); -#if defined(__ARM_FEATURE_DOTPROD) const int32x4_t vzero = vdupq_n_s32(0); -#endif const uint8x16_t mone = vdupq_n_u8(3); @@ -7119,26 +7787,10 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s); q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s); -#if defined(__ARM_FEATURE_DOTPROD) - - isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + - vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; -#else - - int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])), - vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0]))); - int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])), - vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1]))); - isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1]; - - int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])), - vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2]))); - int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])), - vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3]))); - isum += vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3]; -#endif + isum += vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + + vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + + vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + + vaddvq_s32(lm_ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; sum += isum * d_all * y[i].d; @@ -7380,3 +8032,982 @@ void lm_ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * res } #endif + +static const int8_t keven_signs_q2xs[1024] = { + 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, + 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, + 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, + 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, + 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, + 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, + 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, + 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, + 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, + 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, + 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, + 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, + 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, + 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, + 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, + 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, + 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, + 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, + 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, + 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, + 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, + 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, + 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, + 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, + 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, + 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, + 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, + 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, + 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, + 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, + 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, + 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, +}; + +void lm_ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_iq2_xxs * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + lm_ggml_int8x16x4_t q2u; + lm_ggml_int8x16x4_t q2s; + lm_ggml_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + float sumf1 = 0, sumf2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = lm_ggml_vld1q_s8_x4(q8); q8 += 64; + memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; + q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1]))); + q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3]))); + q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9]))); + q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11]))); + q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127)))); + q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127)))); + q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 7) & 127)))); + q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127)))); + q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); + q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); + q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]); + q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); + const int32x4_t p1 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]); + const int32x4_t p2 = lm_ggml_vdotq_s32(lm_ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]); + sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28)); + sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28)); + } + sumf += d*(sumf1 + sumf2); + } + *s = 0.25f * sumf; + +#elif defined(__AVX2__) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[4]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; + const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); + const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]); + const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127], + signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const uint16_t ls1 = aux32[1] >> 28; + const uint16_t ls2 = aux32[3] >> 28; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + uint32_t aux32[2]; + const uint8_t * aux8 = (const uint8_t *)aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(aux32, q2, 2*sizeof(uint32_t)); + q2 += 4; + const uint32_t ls = 2*(aux32[1] >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +void lm_ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_iq2_xs * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + lm_ggml_int8x16x4_t q2u; + lm_ggml_int8x16x4_t q2s; + lm_ggml_int8x16x4_t q8b; + + int32x4x4_t scales32; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + const uint8x8_t scales8 = vld1_u8(x[i].scales); + const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf)); + const uint8x8_t scales_h = vshr_n_u8(scales8, 4); + uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h)); + scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1)); + const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales)); + const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales)); + scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1))); + scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1))); + scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2))); + scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2))); + int32x4_t sumi = vdupq_n_s32(0); + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + q8b = lm_ggml_vld1q_s8_x4(q8); q8 += 64; + q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511)))); + q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511)))); + q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511)))); + q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511)))); + q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9)))); + q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9)))); + q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9)))); + q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9)))); + q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); + q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); + q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]); + q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); + const int32x4_t p1 = lm_ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]); + const int32x4_t p2 = lm_ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]); + const int32x4_t p3 = lm_ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]); + const int32x4_t p4 = lm_ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]); + const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4)); + sumi = vmlaq_s32(sumi, p, scales32.val[ib64]); + q2 += 8; + } + sumf += d*vaddvq_s32(sumi); + } + *s = 0.125f * sumf; + +#elif defined(__AVX2__) + + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + const __m128i m511 = _mm_set1_epi16(511); + const __m128i m127 = _mm_set1_epi16(127); + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint64_t aux64; + + // somewhat hacky, but gives a significant boost in performance + __m128i aux_gindex, aux_sindex; + const uint16_t * gindex = (const uint16_t *)&aux_gindex; + const uint16_t * sindex = (const uint16_t *)&aux_sindex; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + __m128i stmp = _mm_set1_epi64x(aux64); + stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); + const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); + + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2); q2 += 8; + aux_gindex = _mm_and_si128(q2_data, m511); + aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127); + const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]); + const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]); + const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]); + const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + + const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); + const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1))); + + sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2)); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = LM_GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const uint8_t * restrict sc = x[i].scales; + const int8_t * restrict q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} + +// ================================ IQ2 quantization ============================================= + +typedef struct { + uint64_t * grid; + int * map; + uint16_t * neighbours; +} iq2_entry_t; + +static iq2_entry_t iq2_data[2] = { + {NULL, NULL, NULL}, + {NULL, NULL, NULL}, +}; + +static inline int iq2_data_index(int grid_size) { + LM_GGML_ASSERT(grid_size == 256 || grid_size == 512); + return grid_size == 256 ? 0 : 1; +} + +static int iq2_compare_func(const void * left, const void * right) { + const int * l = (const int *)left; + const int * r = (const int *)right; + return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0; +} + +static void q2xs_init_impl(int grid_size) { + const int gindex = iq2_data_index(grid_size); + if (iq2_data[gindex].grid) { + return; + } + static const uint16_t kgrid_256[256] = { + 0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97, + 100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642, + 1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288, + 1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113, + 2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240, + 4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400, + 5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260, + 8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872, + 10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516, + 16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561, + 17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488, + 20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545, + 22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874, + 25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856, + 33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142, + 37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268, + }; + static const uint16_t kgrid_512[512] = { + 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70, + 73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257, + 260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340, + 352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597, + 640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096, + 1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348, + 1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065, + 2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441, + 2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160, + 4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372, + 4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125, + 5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652, + 5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197, + 8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549, + 8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894, + 10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388, + 16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480, + 16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773, + 16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473, + 17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436, + 18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497, + 20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162, + 21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528, + 22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745, + 24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234, + 32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025, + 33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810, + 33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984, + 35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462, + 37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960, + 40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048, + 42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690, + }; + const int kmap_size = 43692; + const int nwant = 2; + const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512; + uint64_t * kgrid_q2xs; + int * kmap_q2xs; + uint16_t * kneighbors_q2xs; + + printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); + uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t)); + for (int k = 0; k < grid_size; ++k) { + int8_t * pos = (int8_t *)(the_grid + k); + for (int i = 0; i < 8; ++i) { + int l = (kgrid[k] >> 2*i) & 0x3; + pos[i] = 2*l + 1; + } + } + kgrid_q2xs = the_grid; + iq2_data[gindex].grid = the_grid; + kmap_q2xs = (int *)malloc(kmap_size*sizeof(int)); + iq2_data[gindex].map = kmap_q2xs; + for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1; + uint64_t aux64; + uint8_t * aux8 = (uint8_t *)&aux64; + for (int i = 0; i < grid_size; ++i) { + aux64 = kgrid_q2xs[i]; + uint16_t index = 0; + for (int k=0; k<8; ++k) { + uint16_t q = (aux8[k] - 1)/2; + index |= (q << 2*k); + } + kmap_q2xs[index] = i; + } + int8_t pos[8]; + int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + int num_neighbors = 0, num_not_in_map = 0; + for (int i = 0; i < kmap_size; ++i) { + if (kmap_q2xs[i] >= 0) continue; + ++num_not_in_map; + for (int k = 0; k < 8; ++k) { + int l = (i >> 2*k) & 0x3; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q2xs + j); + int d2 = 0; + for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func); + int n = 0; int d2 = dist2[0]; + int nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + ++n; + } + num_neighbors += n; + } + printf("%s: %d neighbours in total\n", __func__, num_neighbors); + kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); + iq2_data[gindex].neighbours = kneighbors_q2xs; + int counter = 0; + for (int i = 0; i < kmap_size; ++i) { + if (kmap_q2xs[i] >= 0) continue; + for (int k = 0; k < 8; ++k) { + int l = (i >> 2*k) & 0x3; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q2xs + j); + int d2 = 0; + for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func); + kmap_q2xs[i] = -(counter + 1); + int d2 = dist2[0]; + uint16_t * start = &kneighbors_q2xs[counter++]; + int n = 0, nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + kneighbors_q2xs[counter++] = dist2[2*j+1]; + ++n; + } + *start = n; + } + free(dist2); +} + +void lm_ggml_init_iq2_quantization(enum lm_ggml_type type) { + if (type == LM_GGML_TYPE_IQ2_XXS) { + q2xs_init_impl(256); + } + else if (type == LM_GGML_TYPE_IQ2_XS) { + q2xs_init_impl(512); + } + else { + fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type); + } +} + +static void q2xs_deinit_impl(int grid_size) { + LM_GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024); + const int gindex = iq2_data_index(grid_size); + if (iq2_data[gindex].grid) { + free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL; + free(iq2_data[gindex].map); iq2_data[gindex].map = NULL; + free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL; + } +} + +void lm_ggml_deinit_iq2_quantization(enum lm_ggml_type type) { + if (type == LM_GGML_TYPE_IQ2_XXS) { + q2xs_deinit_impl(256); + } + else if (type == LM_GGML_TYPE_IQ2_XS) { + q2xs_deinit_impl(512); + } + else { + fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type); + } +} + +static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid, + const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) { + int num_neighbors = neighbours[0]; + LM_GGML_ASSERT(num_neighbors > 0); + float best_d2 = FLT_MAX; + int grid_index = -1; + for (int j = 1; j <= num_neighbors; ++j) { + const int8_t * pg = (const int8_t *)(grid + neighbours[j]); + float d2 = 0; + for (int i = 0; i < 8; ++i) { + float q = pg[i]; + float diff = scale*q - xval[i]; + d2 += weight[i]*diff*diff; + } + if (d2 < best_d2) { + best_d2 = d2; grid_index = neighbours[j]; + } + } + LM_GGML_ASSERT(grid_index >= 0); + const int8_t * pg = (const int8_t *)(grid + grid_index); + for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2; + return grid_index; +} + +static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { + + const int gindex = iq2_data_index(256); + + const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; + const int * kmap_q2xs = iq2_data[gindex].map; + const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours; + + LM_GGML_ASSERT(quant_weights); + LM_GGML_ASSERT(kgrid_q2xs); + LM_GGML_ASSERT(kmap_q2xs); + LM_GGML_ASSERT(kneighbors_q2xs); + LM_GGML_ASSERT(n%QK_K == 0); + + const int kMaxQ = 3; + + const int nbl = n/256; + + block_iq2_xxs * y = vy; + + float scales[QK_K/32]; + float weight[32]; + float xval[32]; + int8_t L[32]; + int8_t Laux[32]; + float waux[32]; + bool is_on_grid[4]; + bool is_on_grid_aux[4]; + uint8_t block_signs[4]; + uint32_t q2[2*(QK_K/32)]; + + for (int ibl = 0; ibl < nbl; ++ibl) { + + y[ibl].d = LM_GGML_FP32_TO_FP16(0.f); + memset(q2, 0, QK_K/4); + + float max_scale = 0; + + const float * xbl = x + QK_K*ibl; + float sumx2 = 0; + for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; + float sigma2 = sumx2/QK_K; + + for (int ib = 0; ib < QK_K/32; ++ib) { + const float * xb = xbl + 32*ib; + const float * qw = quant_weights + QK_K*ibl + 32*ib; + for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]); + for (int k = 0; k < 4; ++k) { + int nflip = 0; + uint8_t s = 0; + for (int i = 0; i < 8; ++i) { + if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i]; + else { + xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i); + } + } + if (nflip%2) { + int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin]; + for (int i = 1; i < 8; ++i) { + float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i]; + if (ax < min) { + min = ax; imin = i; + } + } + xval[8*k+imin] = -xval[8*k+imin]; + s ^= (1 << imin); + } + block_signs[k] = s & 127; + } + float max = xval[0]; + for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]); + if (!max) { + scales[ib] = 0; + memset(L, 0, 32); + continue; + } + float best = 0; + float scale = max/(2*kMaxQ-1); + for (int is = -9; is <= 9; ++is) { + float id = (2*kMaxQ-1+is*0.1f)/max; + float this_scale = 1/id; + for (int k = 0; k < 4; ++k) { + for (int i = 0; i < 8; ++i) { + int l = nearest_int(0.5f*(id*xval[8*k+i]-1)); + Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l)); + } + uint16_t u = 0; + for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i); + int grid_index = kmap_q2xs[u]; + is_on_grid_aux[k] = true; + if (grid_index < 0) { + is_on_grid_aux[k] = false; + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k); + } + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 32; ++i) { + float w = weight[i]; + float q = 2*Laux[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + scale = sumqx/sumq2; best = scale*sumqx; + for (int i = 0; i < 32; ++i) L[i] = Laux[i]; + for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k]; + } + } + int n_not_ongrid = 0; + for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid; + if (n_not_ongrid > 0 && scale > 0) { + float id = 1/scale; + for (int k = 0; k < 4; ++k) { + if (is_on_grid[k]) continue; + uint16_t u = 0; + for (int i = 0; i < 8; ++i) { + int l = nearest_int(0.5f*(id*xval[8*k+i]-1)); + l = MAX(0, MIN(kMaxQ-1, l)); + u |= (l << 2*i); + } + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k); + } + const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index); + for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2; + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 32; ++i) { + float w = weight[i]; + float q = 2*L[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0) scale = sumqx/sumq2; + } + if (scale < 0) { + // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale) + // and correspondingly flip quant signs. + scale = -scale; + for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127; + } + for (int k = 0; k < 4; ++k) { + uint16_t u = 0; + for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i); + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + printf("Oops: found point %u not on grid:", u); + for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); + printf("\n"); + LM_GGML_ASSERT(false); + } + q2[2*ib+0] |= (grid_index << 8*k); + q2[2*ib+1] |= (block_signs[k] << 7*k); + } + LM_GGML_ASSERT(scale >= 0); + scales[ib] = scale; + max_scale = MAX(max_scale, scale); + } + + if (!max_scale) { + memset(y[ibl].qs, 0, QK_K/4); + continue; + } + + float d = max_scale/31; + y[ibl].d = LM_GGML_FP32_TO_FP16(d); + float id = 1/d; + float sumqx = 0, sumq2 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + int l = nearest_int(0.5f*(id*scales[ib]-1)); + l = MAX(0, MIN(15, l)); + q2[2*ib+1] |= ((uint32_t)l << 28); + const float * xb = xbl + 32*ib; + const float * qw = quant_weights + QK_K*ibl + 32*ib; + for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib); + const float db = d * (1 + 2*l); + uint32_t u = 0; + for (int k = 0; k < 4; ++k) { + const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127); + const float * xk = xb + 8*k; + const float * wk = weight + 8*k; + const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]); + float best_mse = 0; int best_index = aux8[k]; + for (int j = 0; j < 8; ++j) { + float diff = db * grid[j] * signs[j] - xk[j]; + best_mse += wk[j] * diff * diff; + } + for (int idx = 0; idx < 256; ++idx) { + grid = (const uint8_t *)(kgrid_q2xs + idx); + float mse = 0; + for (int j = 0; j < 8; ++j) { + float diff = db * grid[j] * signs[j] - xk[j]; + mse += wk[j] * diff * diff; + } + if (mse < best_mse) { + best_mse = mse; best_index = idx; + } + } + u |= (best_index << 8*k); + grid = (const uint8_t *)(kgrid_q2xs + best_index); + //grid = (const uint8_t *)(kgrid_q2xs + aux8[k]); + for (int j = 0; j < 8; ++j) { + float q = db * grid[j] * signs[j]; + sumqx += wk[j] * q * xk[j]; + sumq2 += wk[j] * q * q; + } + } + q2[2*ib] = u; + if (sumq2 > 0) y[ibl].d = LM_GGML_FP32_TO_FP16(d*sumqx/sumq2); + } + memcpy(y[ibl].qs, q2, QK_K/4); + } +} + +static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { + + const int gindex = iq2_data_index(512); + + const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; + const int * kmap_q2xs = iq2_data[gindex].map; + const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours; + + LM_GGML_ASSERT(quant_weights); + LM_GGML_ASSERT(kmap_q2xs); + LM_GGML_ASSERT(kgrid_q2xs); + LM_GGML_ASSERT(kneighbors_q2xs); + LM_GGML_ASSERT(n%QK_K == 0); + + const int kMaxQ = 3; + + const int nbl = n/256; + + block_iq2_xs * y = vy; + + float scales[QK_K/16]; + float weight[16]; + float xval[16]; + int8_t L[16]; + int8_t Laux[16]; + float waux[16]; + bool is_on_grid[2]; + bool is_on_grid_aux[2]; + uint8_t block_signs[2]; + uint16_t q2[2*(QK_K/16)]; + + for (int ibl = 0; ibl < nbl; ++ibl) { + + y[ibl].d = LM_GGML_FP32_TO_FP16(0.f); + memset(q2, 0, QK_K/4); + memset(y[ibl].scales, 0, QK_K/32); + + float max_scale = 0; + + const float * xbl = x + QK_K*ibl; + float sumx2 = 0; + for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; + float sigma2 = sumx2/QK_K; + + for (int ib = 0; ib < QK_K/16; ++ib) { + const float * xb = xbl + 16*ib; + const float * qw = quant_weights + QK_K*ibl + 16*ib; + for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]); + for (int k = 0; k < 2; ++k) { + int nflip = 0; + uint8_t s = 0; + for (int i = 0; i < 8; ++i) { + if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i]; + else { + xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i); + } + } + if (nflip%2) { + int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin]; + for (int i = 1; i < 8; ++i) { + float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i]; + if (ax < min) { + min = ax; imin = i; + } + } + xval[8*k+imin] = -xval[8*k+imin]; + s ^= (1 << imin); + } + block_signs[k] = s & 127; + } + float max = xval[0]; + for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]); + if (!max) { + scales[ib] = 0; + memset(L, 0, 16); + continue; + } + float best = 0; + float scale = max/(2*kMaxQ-1); + is_on_grid[0] = is_on_grid[1] = true; + for (int is = -9; is <= 9; ++is) { + float id = (2*kMaxQ-1+is*0.1f)/max; + float this_scale = 1/id; + for (int k = 0; k < 2; ++k) { + for (int i = 0; i < 8; ++i) { + int l = nearest_int(0.5f*(id*xval[8*k+i]-1)); + Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l)); + } + uint16_t u = 0; + for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i); + int grid_index = kmap_q2xs[u]; + is_on_grid_aux[k] = true; + if (grid_index < 0) { + is_on_grid_aux[k] = false; + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k); + } + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 16; ++i) { + float w = weight[i]; + float q = 2*Laux[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + scale = sumqx/sumq2; best = scale*sumqx; + for (int i = 0; i < 16; ++i) L[i] = Laux[i]; + for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k]; + } + } + int n_not_ongrid = 0; + for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid; + if (n_not_ongrid > 0 && scale > 0) { + float id = 1/scale; + for (int k = 0; k < 2; ++k) { + if (is_on_grid[k]) continue; + uint16_t u = 0; + for (int i = 0; i < 8; ++i) { + int l = nearest_int(0.5f*(id*xval[8*k+i]-1)); + l = MAX(0, MIN(kMaxQ-1, l)); + u |= (l << 2*i); + L[8*k + i] = l; + } + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k); + } + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 16; ++i) { + float w = weight[i]; + float q = 2*L[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0) scale = sumqx/sumq2; + } + if (scale < 0) { + scale = -scale; + for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127; + } + for (int k = 0; k < 2; ++k) { + uint16_t u = 0; + for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i); + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + printf("Oops: found point %u not on grid:", u); + for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); + printf("\n"); + LM_GGML_ASSERT(false); + } + q2[2*ib+k] = grid_index | (block_signs[k] << 9); + } + LM_GGML_ASSERT(scale >= 0); + scales[ib] = scale; + max_scale = MAX(max_scale, scale); + } + + if (!max_scale) { + memset(y[ibl].qs, 0, QK_K/4); + continue; + } + + float d = max_scale/31; + y[ibl].d = LM_GGML_FP32_TO_FP16(d); + float id = 1/d; + for (int ib = 0; ib < QK_K/16; ++ib) { + int l = nearest_int(0.5f*(id*scales[ib]-1)); + l = MAX(0, MIN(15, l)); + if (ib%2 == 0) y[ibl].scales[ib/2] = l; + else y[ibl].scales[ib/2] |= (l << 4); + } + memcpy(y[ibl].qs, q2, QK_K/4); + + } +} + +size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + LM_GGML_ASSERT(n_per_row%QK_K == 0); + int nblock = n_per_row/QK_K; + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock*sizeof(block_iq2_xxs); + } + return nrow * nblock * sizeof(block_iq2_xxs); +} + +size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + LM_GGML_ASSERT(n_per_row%QK_K == 0); + int nblock = n_per_row/QK_K; + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock*sizeof(block_iq2_xs); + } + return nrow * nblock * sizeof(block_iq2_xs); +} + diff --git a/cpp/ggml-quants.h b/cpp/ggml-quants.h index 73e7a206..20bfc0aa 100644 --- a/cpp/ggml-quants.h +++ b/cpp/ggml-quants.h @@ -70,7 +70,7 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s // 2-bit quantization // weight is represented as x = a * q + b // 16 blocks of 16 elements each -// Effectively 2.5625 bits per weight +// Effectively 2.625 bits per weight typedef struct { uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits uint8_t qs[QK_K/4]; // quants @@ -165,6 +165,22 @@ typedef struct { } block_q8_K; static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding"); +// (Almost) "true" 2-bit quantization. +// Due to the need to use blocks as per ggml dsign, it ends up using +// 2.0625 bpw because of the 16-bit scale for each block of 256. +typedef struct { + lm_ggml_fp16_t d; + uint16_t qs[QK_K/8]; +} block_iq2_xxs; +static_assert(sizeof(block_iq2_xxs) == sizeof(lm_ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); + +// 2.3125 bpw quants +typedef struct { + lm_ggml_fp16_t d; + uint16_t qs[QK_K/8]; + uint8_t scales[QK_K/32]; +} block_iq2_xs; +static_assert(sizeof(block_iq2_xs) == sizeof(lm_ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); // Quantization void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); @@ -209,6 +225,8 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k); void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); +void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k); +void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k); // Dot product void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); @@ -222,3 +240,16 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict void lm_ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void lm_ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void lm_ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void lm_ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy); + +// +// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") +// +size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); diff --git a/cpp/ggml.c b/cpp/ggml.c index dbc6be84..f9fcd4b7 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -132,7 +132,7 @@ void lm_ggml_print_backtrace(void) { "-ex", "bt -frame-info source-and-location", "-ex", "detach", "-ex", "quit", - NULL); + (char *) NULL); } else { waitpid(pid, NULL, 0); } @@ -394,6 +394,12 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); static void lm_ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); static void lm_ggml_vec_dot_f16(const int n, float * restrict s, lm_ggml_fp16_t * restrict x, lm_ggml_fp16_t * restrict y); +lm_ggml_collect_imatrix_t g_imatrix_collect = NULL; + +void lm_ggml_set_imatrix_collection(lm_ggml_collect_imatrix_t imatrix_collect) { + g_imatrix_collect = imatrix_collect; +} + static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { [LM_GGML_TYPE_I8] = { .type_name = "i8", @@ -573,6 +579,28 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = { .vec_dot = lm_ggml_vec_dot_q6_K_q8_K, .vec_dot_type = LM_GGML_TYPE_Q8_K, }, + [LM_GGML_TYPE_IQ2_XXS] = { + .type_name = "iq2_xxs", + .blck_size = QK_K, + .type_size = sizeof(block_iq2_xxs), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_iq2_xxs, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = lm_ggml_vec_dot_iq2_xxs_q8_K, + .vec_dot_type = LM_GGML_TYPE_Q8_K, + }, + [LM_GGML_TYPE_IQ2_XS] = { + .type_name = "iq2_xs", + .blck_size = QK_K, + .type_size = sizeof(block_iq2_xs), + .is_quantized = true, + .to_float = (lm_ggml_to_float_t) dequantize_row_iq2_xs, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = lm_ggml_vec_dot_iq2_xs_q8_K, + .vec_dot_type = LM_GGML_TYPE_Q8_K, + }, [LM_GGML_TYPE_Q8_K] = { .type_name = "q8_K", .blck_size = QK_K, @@ -2111,6 +2139,8 @@ enum lm_ggml_type lm_ggml_ftype_to_lm_ggml_type(enum lm_ggml_ftype ftype) { case LM_GGML_FTYPE_MOSTLY_Q4_K: wtype = LM_GGML_TYPE_Q4_K; break; case LM_GGML_FTYPE_MOSTLY_Q5_K: wtype = LM_GGML_TYPE_Q5_K; break; case LM_GGML_FTYPE_MOSTLY_Q6_K: wtype = LM_GGML_TYPE_Q6_K; break; + case LM_GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = LM_GGML_TYPE_IQ2_XXS; break; + case LM_GGML_FTYPE_MOSTLY_IQ2_XS: wtype = LM_GGML_TYPE_IQ2_XS; break; case LM_GGML_FTYPE_UNKNOWN: wtype = LM_GGML_TYPE_COUNT; break; case LM_GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = LM_GGML_TYPE_COUNT; break; } @@ -2324,6 +2354,10 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) { } void lm_ggml_free(struct lm_ggml_context * ctx) { + if (ctx == NULL) { + return; + } + // make this function thread safe lm_ggml_critical_section_start(); @@ -2383,20 +2417,8 @@ size_t lm_ggml_get_mem_size(const struct lm_ggml_context * ctx) { size_t lm_ggml_get_max_tensor_size(const struct lm_ggml_context * ctx) { size_t max_size = 0; - struct lm_ggml_object * obj = ctx->objects_begin; - - while (obj != NULL) { - if (obj->type == LM_GGML_OBJECT_TENSOR) { - struct lm_ggml_tensor * tensor = (struct lm_ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs); - - const size_t size = lm_ggml_nbytes(tensor); - - if (max_size < size) { - max_size = size; - } - } - - obj = obj->next; + for (struct lm_ggml_tensor * tensor = lm_ggml_get_first_tensor(ctx); tensor != NULL; tensor = lm_ggml_get_next_tensor(ctx, tensor)) { + max_size = MAX(max_size, lm_ggml_nbytes(tensor)); } return max_size; @@ -3093,7 +3115,7 @@ struct lm_ggml_tensor * lm_ggml_view_tensor( return result; } -struct lm_ggml_tensor * lm_ggml_get_first_tensor(struct lm_ggml_context * ctx) { +struct lm_ggml_tensor * lm_ggml_get_first_tensor(const struct lm_ggml_context * ctx) { struct lm_ggml_object * obj = ctx->objects_begin; char * const mem_buffer = ctx->mem_buffer; @@ -3109,7 +3131,7 @@ struct lm_ggml_tensor * lm_ggml_get_first_tensor(struct lm_ggml_context * ctx) { return NULL; } -struct lm_ggml_tensor * lm_ggml_get_next_tensor(struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor) { +struct lm_ggml_tensor * lm_ggml_get_next_tensor(const struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor) { struct lm_ggml_object * obj = (struct lm_ggml_object *) ((char *)tensor - LM_GGML_OBJECT_SIZE); obj = obj->next; @@ -4053,7 +4075,6 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl( result->op = LM_GGML_OP_GROUP_NORM; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } @@ -4183,23 +4204,23 @@ struct lm_ggml_tensor * lm_ggml_out_prod( static struct lm_ggml_tensor * lm_ggml_scale_impl( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b, + float s, bool inplace) { - LM_GGML_ASSERT(lm_ggml_is_scalar(b)); LM_GGML_ASSERT(lm_ggml_is_padded_1d(a)); bool is_node = false; - if (a->grad || b->grad) { + if (a->grad) { is_node = true; } struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a); + lm_ggml_set_op_params(result, &s, sizeof(s)); + result->op = LM_GGML_OP_SCALE; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -4207,15 +4228,15 @@ static struct lm_ggml_tensor * lm_ggml_scale_impl( struct lm_ggml_tensor * lm_ggml_scale( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b) { - return lm_ggml_scale_impl(ctx, a, b, false); + float s) { + return lm_ggml_scale_impl(ctx, a, s, false); } struct lm_ggml_tensor * lm_ggml_scale_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b) { - return lm_ggml_scale_impl(ctx, a, b, true); + float s) { + return lm_ggml_scale_impl(ctx, a, s, true); } // lm_ggml_set @@ -4312,13 +4333,13 @@ struct lm_ggml_tensor * lm_ggml_set_2d_inplace( static struct lm_ggml_tensor * lm_ggml_cpy_impl( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b, - bool inplace) { + struct lm_ggml_tensor * b) { LM_GGML_ASSERT(lm_ggml_nelements(a) == lm_ggml_nelements(b)); bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { + // inplace is false and either one have a grad is_node = true; } @@ -4342,29 +4363,38 @@ struct lm_ggml_tensor * lm_ggml_cpy( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b) { - return lm_ggml_cpy_impl(ctx, a, b, false); + return lm_ggml_cpy_impl(ctx, a, b); } -struct lm_ggml_tensor * lm_ggml_cpy_inplace( +struct lm_ggml_tensor * lm_ggml_cast( struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b) { - return lm_ggml_cpy_impl(ctx, a, b, true); + struct lm_ggml_tensor * a, + enum lm_ggml_type type) { + bool is_node = false; + + struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, type, LM_GGML_MAX_DIMS, a->ne); + lm_ggml_format_name(result, "%s (copy)", a->name); + + result->op = LM_GGML_OP_CPY; + result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = result; + + return result; } // lm_ggml_cont static struct lm_ggml_tensor * lm_ggml_cont_impl( struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a, - bool inplace) { + struct lm_ggml_tensor * a) { bool is_node = false; - if (!inplace && a->grad) { + if (a->grad) { is_node = true; } - struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a); + struct lm_ggml_tensor * result = lm_ggml_dup_tensor(ctx, a); lm_ggml_format_name(result, "%s (cont)", a->name); result->op = LM_GGML_OP_CONT; @@ -4377,13 +4407,7 @@ static struct lm_ggml_tensor * lm_ggml_cont_impl( struct lm_ggml_tensor * lm_ggml_cont( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a) { - return lm_ggml_cont_impl(ctx, a, false); -} - -struct lm_ggml_tensor * lm_ggml_cont_inplace( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a) { - return lm_ggml_cont_impl(ctx, a, true); + return lm_ggml_cont_impl(ctx, a); } // make contiguous, with new shape @@ -4779,8 +4803,11 @@ struct lm_ggml_tensor * lm_ggml_get_rows( } // TODO: implement non F32 return - //struct lm_ggml_tensor * result = lm_ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); - struct lm_ggml_tensor * result = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]); + enum lm_ggml_type type = LM_GGML_TYPE_F32; + if (a->type == LM_GGML_TYPE_I32) { + type = a->type; + } + struct lm_ggml_tensor * result = lm_ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]); result->op = LM_GGML_OP_GET_ROWS; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; @@ -5553,7 +5580,6 @@ static struct lm_ggml_tensor * lm_ggml_upscale_impl( result->op_params[0] = scale_factor; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5858,7 +5884,6 @@ struct lm_ggml_tensor * lm_ggml_get_rel_pos( result->op = LM_GGML_OP_GET_REL_POS; result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6953,14 +6978,165 @@ static void lm_ggml_compute_forward_dup_f32( } } -static void lm_ggml_compute_forward_dup( +// A simplified version of lm_ggml_compute_forward_dup that doesn't do float upcasting, and just plain old memcpy. +static void lm_ggml_compute_forward_dup_bytes( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, struct lm_ggml_tensor * dst) { - if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(dst) && src0->type == dst->type) { + LM_GGML_ASSERT(lm_ggml_nelements(dst) == lm_ggml_nelements(src0)); + LM_GGML_ASSERT(src0->type == dst->type); + + if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { + return; + } + + if (lm_ggml_is_contiguous(src0) && lm_ggml_is_contiguous(dst)) { lm_ggml_compute_forward_dup_same_cont(params, src0, dst); return; } + + LM_GGML_TENSOR_UNARY_OP_LOCALS; + + const size_t type_size = lm_ggml_type_size(src0->type); + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == type_size && nb0 == type_size) { + // copy by rows + const size_t rs = ne00 * type_size; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + if (lm_ggml_is_contiguous(dst)) { + size_t id = 0; + char * dst_ptr = (char *) dst->data; + const size_t rs = ne00 * type_size; + + if (nb00 == type_size) { + // src0 is contigous on first dimension, copy by rows + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int64_t i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = (char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, type_size); + + id += type_size; + } + } + id += rs * (ne01 - ir1); + } + } + } + + return; + } + + // dst counters + + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, type_size); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } +} + +static void lm_ggml_compute_forward_dup( + const struct lm_ggml_compute_params * params, + const struct lm_ggml_tensor * src0, + struct lm_ggml_tensor * dst) { + if (src0->type == dst->type) { + lm_ggml_compute_forward_dup_bytes(params, src0, dst); + return; + } + switch (src0->type) { case LM_GGML_TYPE_F16: { @@ -7297,6 +7473,8 @@ static void lm_ggml_compute_forward_add( case LM_GGML_TYPE_Q4_K: case LM_GGML_TYPE_Q5_K: case LM_GGML_TYPE_Q6_K: + case LM_GGML_TYPE_IQ2_XXS: + case LM_GGML_TYPE_IQ2_XS: { lm_ggml_compute_forward_add_q_f32(params, src0, src1, dst); } break; @@ -7561,6 +7739,8 @@ static void lm_ggml_compute_forward_add1( case LM_GGML_TYPE_Q4_K: case LM_GGML_TYPE_Q5_K: case LM_GGML_TYPE_Q6_K: + case LM_GGML_TYPE_IQ2_XXS: + case LM_GGML_TYPE_IQ2_XS: { lm_ggml_compute_forward_add1_q_f32(params, src0, src1, dst); } break; @@ -7675,6 +7855,8 @@ static void lm_ggml_compute_forward_acc( case LM_GGML_TYPE_Q4_K: case LM_GGML_TYPE_Q5_K: case LM_GGML_TYPE_Q6_K: + case LM_GGML_TYPE_IQ2_XXS: + case LM_GGML_TYPE_IQ2_XS: default: { LM_GGML_ASSERT(false); @@ -8419,10 +8601,12 @@ static void lm_ggml_compute_forward_repeat( struct lm_ggml_tensor * dst) { switch (src0->type) { case LM_GGML_TYPE_F16: + case LM_GGML_TYPE_I16: { lm_ggml_compute_forward_repeat_f16(params, src0, dst); } break; case LM_GGML_TYPE_F32: + case LM_GGML_TYPE_I32: { lm_ggml_compute_forward_repeat_f32(params, src0, dst); } break; @@ -8565,6 +8749,7 @@ static void lm_ggml_compute_forward_concat( struct lm_ggml_tensor* dst) { switch (src0->type) { case LM_GGML_TYPE_F32: + case LM_GGML_TYPE_I32: { lm_ggml_compute_forward_concat_f32(params, src0, src1, dst); } break; @@ -9562,10 +9747,10 @@ static void lm_ggml_compute_forward_group_norm( #if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster -static bool lm_ggml_compute_forward_mul_mat_use_blas( - const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, - struct lm_ggml_tensor * dst) { +static bool lm_ggml_compute_forward_mul_mat_use_blas(struct lm_ggml_tensor * dst) { + const struct lm_ggml_tensor * src0 = dst->src[0]; + const struct lm_ggml_tensor * src1 = dst->src[1]; + //const int64_t ne00 = src0->ne[0]; //const int64_t ne01 = src0->ne[1]; @@ -9605,6 +9790,10 @@ static void lm_ggml_compute_forward_mul_mat( const int ith = params->ith; const int nth = params->nth; + if (ith == 1 && g_imatrix_collect) { + g_imatrix_collect(src0, src1); + } + const enum lm_ggml_type type = src0->type; const bool src1_cont = lm_ggml_is_contiguous(src1); @@ -9645,7 +9834,7 @@ static void lm_ggml_compute_forward_mul_mat( #endif #if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) - if (lm_ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + if (lm_ggml_compute_forward_mul_mat_use_blas(dst)) { if (params->ith != 0) { return; } @@ -9702,7 +9891,7 @@ static void lm_ggml_compute_forward_mul_mat( const size_t row_size = lm_ggml_row_size(vec_dot_type, ne10); assert(params->wsize >= ne11*ne12*ne13*row_size); - assert(src1->type == LM_GGML_TYPE_F32); + LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32); for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { @@ -9908,6 +10097,10 @@ static void lm_ggml_compute_forward_mul_mat_id( const struct lm_ggml_tensor * src0_cur = dst->src[cur_a + 2]; + if (ith == 1 && g_imatrix_collect) { + g_imatrix_collect(src0_cur, src1); + } + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t row_size = lm_ggml_row_size(vec_dot_type, ne10); @@ -10313,6 +10506,8 @@ static void lm_ggml_compute_forward_out_prod( case LM_GGML_TYPE_Q4_K: case LM_GGML_TYPE_Q5_K: case LM_GGML_TYPE_Q6_K: + case LM_GGML_TYPE_IQ2_XXS: + case LM_GGML_TYPE_IQ2_XS: { lm_ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); } break; @@ -10337,19 +10532,18 @@ static void lm_ggml_compute_forward_out_prod( static void lm_ggml_compute_forward_scale_f32( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { LM_GGML_ASSERT(lm_ggml_is_contiguous(src0)); LM_GGML_ASSERT(lm_ggml_is_contiguous(dst)); LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, dst)); - LM_GGML_ASSERT(lm_ggml_is_scalar(src1)); if (params->type == LM_GGML_TASK_INIT || params->type == LM_GGML_TASK_FINALIZE) { return; } // scale factor - const float v = *(float *) src1->data; + float v; + memcpy(&v, dst->op_params, sizeof(float)); const int ith = params->ith; const int nth = params->nth; @@ -10380,12 +10574,11 @@ static void lm_ggml_compute_forward_scale_f32( static void lm_ggml_compute_forward_scale( const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0, - const struct lm_ggml_tensor * src1, struct lm_ggml_tensor * dst) { switch (src0->type) { case LM_GGML_TYPE_F32: { - lm_ggml_compute_forward_scale_f32(params, src0, src1, dst); + lm_ggml_compute_forward_scale_f32(params, src0, dst); } break; default: { @@ -10489,6 +10682,8 @@ static void lm_ggml_compute_forward_set( case LM_GGML_TYPE_Q4_K: case LM_GGML_TYPE_Q5_K: case LM_GGML_TYPE_Q6_K: + case LM_GGML_TYPE_IQ2_XXS: + case LM_GGML_TYPE_IQ2_XS: default: { LM_GGML_ASSERT(false); @@ -10683,6 +10878,8 @@ static void lm_ggml_compute_forward_get_rows( case LM_GGML_TYPE_Q4_K: case LM_GGML_TYPE_Q5_K: case LM_GGML_TYPE_Q6_K: + case LM_GGML_TYPE_IQ2_XXS: + case LM_GGML_TYPE_IQ2_XS: { lm_ggml_compute_forward_get_rows_q(params, src0, src1, dst); } break; @@ -10691,6 +10888,7 @@ static void lm_ggml_compute_forward_get_rows( lm_ggml_compute_forward_get_rows_f16(params, src0, src1, dst); } break; case LM_GGML_TYPE_F32: + case LM_GGML_TYPE_I32: { lm_ggml_compute_forward_get_rows_f32(params, src0, src1, dst); } break; @@ -11318,6 +11516,8 @@ static void lm_ggml_compute_forward_alibi( case LM_GGML_TYPE_Q4_K: case LM_GGML_TYPE_Q5_K: case LM_GGML_TYPE_Q6_K: + case LM_GGML_TYPE_IQ2_XXS: + case LM_GGML_TYPE_IQ2_XS: case LM_GGML_TYPE_Q8_K: case LM_GGML_TYPE_I8: case LM_GGML_TYPE_I16: @@ -11392,6 +11592,8 @@ static void lm_ggml_compute_forward_clamp( case LM_GGML_TYPE_Q4_K: case LM_GGML_TYPE_Q5_K: case LM_GGML_TYPE_Q6_K: + case LM_GGML_TYPE_IQ2_XXS: + case LM_GGML_TYPE_IQ2_XS: case LM_GGML_TYPE_Q8_K: case LM_GGML_TYPE_I8: case LM_GGML_TYPE_I16: @@ -11436,6 +11638,21 @@ static float lm_ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); } +static void lm_ggml_rope_cache_init( + float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale, + float * cache, float sin_sign, float theta_scale +) { + float theta = theta_base; + for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + rope_yarn( + theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] + ); + cache[i0 + 1] *= sin_sign; + + theta *= theta_scale; + } +} + void lm_ggml_rope_yarn_corr_dims( int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] ) { @@ -11518,6 +11735,12 @@ static void lm_ggml_compute_forward_rope_f32( for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = 0; i2 < ne2; i2++) { const int64_t p = pos[i2]; + + float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; + if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox + lm_ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); + } + for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -11551,18 +11774,13 @@ static void lm_ggml_compute_forward_rope_f32( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - float cos_theta, sin_theta; - rope_yarn( - theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta - ); - sin_theta *= sin_sign; + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; // zeta scaling for xPos only: float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; - theta_base *= theta_scale; - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -11686,6 +11904,12 @@ static void lm_ggml_compute_forward_rope_f16( for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = 0; i2 < ne2; i2++) { const int64_t p = pos[i2]; + + float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; + if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox + lm_ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); + } + for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -11719,13 +11943,8 @@ static void lm_ggml_compute_forward_rope_f16( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - float cos_theta, sin_theta; - rope_yarn( - theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta - ); - sin_theta *= sin_sign; - - theta_base *= theta_scale; + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; const lm_ggml_fp16_t * const src = (lm_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); lm_ggml_fp16_t * dst_data = (lm_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -14395,7 +14614,7 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru } break; case LM_GGML_OP_SCALE: { - lm_ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor); + lm_ggml_compute_forward_scale(params, tensor->src[0], tensor); } break; case LM_GGML_OP_SET: { @@ -14690,7 +14909,7 @@ size_t lm_ggml_hash_find_or_insert(struct lm_ggml_hash_set hash_set, struct lm_g return i; } -static struct lm_ggml_hash_set lm_ggml_hash_set_new(size_t size) { +struct lm_ggml_hash_set lm_ggml_hash_set_new(size_t size) { size = lm_ggml_hash_size(size); struct lm_ggml_hash_set result; result.size = size; @@ -14851,7 +15070,7 @@ static struct lm_ggml_tensor * lm_ggml_add_or_set(struct lm_ggml_context * ctx, static struct lm_ggml_tensor * lm_ggml_acc_or_set(struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, struct lm_ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct lm_ggml_hash_set zero_table) { if (lm_ggml_hash_contains(zero_table, a)) { - struct lm_ggml_tensor * a_zero = lm_ggml_scale(ctx, a, lm_ggml_new_f32(ctx, 0)); + struct lm_ggml_tensor * a_zero = lm_ggml_scale(ctx, a, 0.0f); return lm_ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); } else { return lm_ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); @@ -14987,7 +15206,7 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm src0->grad, lm_ggml_scale(ctx, lm_ggml_mul(ctx, src0, tensor->grad), - lm_ggml_new_f32(ctx, 2.0f)), + 2.0f), zero_table); } } break; @@ -15001,7 +15220,7 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm lm_ggml_div(ctx, tensor->grad, tensor), - lm_ggml_new_f32(ctx, 0.5f)), + 0.5f), zero_table); } } break; @@ -15167,17 +15386,13 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm { // necessary for llama if (src0->grad) { + float s; + memcpy(&s, tensor->op_params, sizeof(float)); + src0->grad = lm_ggml_add_or_set(ctx, src0->grad, - lm_ggml_scale_impl(ctx, tensor->grad, src1, false), - zero_table); - } - if (src1->grad) { - src1->grad = - lm_ggml_add_or_set(ctx, - src1->grad, - lm_ggml_sum(ctx, lm_ggml_mul_impl(ctx, tensor->grad, src0, false)), + lm_ggml_scale_impl(ctx, tensor->grad, s, false), zero_table); } } break; @@ -15355,6 +15570,8 @@ static void lm_ggml_compute_backward(struct lm_ggml_context * ctx, struct lm_ggm const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = lm_ggml_add_or_set(ctx, src0->grad, + /* lm_ggml_diag_mask_inf_impl() shouldn't be here */ + /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */ lm_ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), zero_table); } @@ -16162,24 +16379,6 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) { //n_tasks = MIN(n_threads, MAX(1, nr0/128)); //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); - -#if defined(LM_GGML_USE_CUBLAS) - if (lm_ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } -#elif defined(LM_GGML_USE_CLBLAST) - if (lm_ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } -#endif -#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) - if (lm_ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } -#endif } break; case LM_GGML_OP_MUL_MAT_ID: { @@ -16352,6 +16551,7 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { state->shared->node_n += 1; return (thread_ret_t) LM_GGML_EXIT_ABORTED; } + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again @@ -16417,14 +16617,18 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { } else { // wait for other threads to finish const int last = node_n; + + const bool do_yield = last < 0 || cgraph->nodes[last]->op == LM_GGML_OP_MUL_MAT; + while (true) { // TODO: this sched_yield can have significant impact on the performance - either positive or negative // depending on the workload and the operating system. // since it is not clear what is the best approach, it should potentially become user-configurable // ref: https://github.com/ggerganov/ggml/issues/291 -#if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) - sched_yield(); -#endif + // UPD: adding the do_yield flag seems to resolve the issue universally + if (do_yield) { + sched_yield(); + } node_n = atomic_load(&state->shared->node_n); if (node_n != last) break; @@ -16454,7 +16658,7 @@ static thread_ret_t lm_ggml_graph_compute_thread(void * data) { return LM_GGML_EXIT_SUCCESS; } -struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_threads) { +struct lm_ggml_cplan lm_ggml_graph_plan(const struct lm_ggml_cgraph * cgraph, int n_threads) { if (n_threads <= 0) { n_threads = LM_GGML_DEFAULT_N_THREADS; } @@ -16503,7 +16707,7 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th } else #endif #if defined(LM_GGML_USE_ACCELERATE) || defined(LM_GGML_USE_OPENBLAS) - if (lm_ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { + if (lm_ggml_compute_forward_mul_mat_use_blas(node)) { if (node->src[0]->type != LM_GGML_TYPE_F32) { // here we need memory just for single 2D matrix from src0 cur = lm_ggml_type_size(LM_GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]); @@ -16516,14 +16720,15 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th } break; case LM_GGML_OP_MUL_MAT_ID: { + cur = 0; const struct lm_ggml_tensor * src0 = node->src[2]; const struct lm_ggml_tensor * src1 = node->src[1]; const enum lm_ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type; if (src1->type != vec_dot_type) { - cur = lm_ggml_row_size(vec_dot_type, lm_ggml_nelements(src1)); + cur += lm_ggml_row_size(vec_dot_type, lm_ggml_nelements(src1)); } const int n_as = lm_ggml_get_op_params_i32(node, 1); - cur = LM_GGML_PAD(cur, sizeof(int64_t)); // align + cur += LM_GGML_PAD(cur, sizeof(int64_t)); // align cur += n_as * sizeof(int64_t); // matrix_row_counts cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows } break; @@ -16534,6 +16739,7 @@ struct lm_ggml_cplan lm_ggml_graph_plan(struct lm_ggml_cgraph * cgraph, int n_th } } break; case LM_GGML_OP_SOFT_MAX: + case LM_GGML_OP_ROPE: { cur = lm_ggml_type_size(LM_GGML_TYPE_F32) * node->ne[0] * n_tasks; } break; @@ -17472,9 +17678,9 @@ static void lm_ggml_opt_acc_grad(int np, struct lm_ggml_tensor * const ps[], flo } // -// ADAM +// Using AdamW - ref: https://arxiv.org/pdf/1711.05101v3.pdf // -// ref: https://arxiv.org/pdf/1412.6980.pdf +// (Original Adam - ref: https://arxiv.org/pdf/1412.6980.pdf) // static enum lm_ggml_opt_result lm_ggml_opt_adam( @@ -18459,8 +18665,11 @@ size_t lm_ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_ return (n/QK8_0*sizeof(block_q8_0)); } -size_t lm_ggml_quantize_chunk(enum lm_ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { +size_t lm_ggml_quantize_chunk(enum lm_ggml_type type, const float * src, void * dst, int start, + int nrows, int n_per_row, int64_t * hist, const float * imatrix) { + (void)imatrix; size_t result = 0; + int n = nrows * n_per_row; switch (type) { case LM_GGML_TYPE_Q4_0: { @@ -18495,32 +18704,67 @@ size_t lm_ggml_quantize_chunk(enum lm_ggml_type type, const float * src, void * case LM_GGML_TYPE_Q2_K: { LM_GGML_ASSERT(start % QK_K == 0); - block_q2_K * block = (block_q2_K*)dst + start / QK_K; - result = lm_ggml_quantize_q2_K(src + start, block, n, n, hist); + LM_GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = lm_ggml_row_size(type, n_per_row); + result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + LM_GGML_ASSERT(result == row_size * nrows); } break; case LM_GGML_TYPE_Q3_K: { LM_GGML_ASSERT(start % QK_K == 0); - block_q3_K * block = (block_q3_K*)dst + start / QK_K; - result = lm_ggml_quantize_q3_K(src + start, block, n, n, hist); + LM_GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = lm_ggml_row_size(type, n_per_row); + result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + LM_GGML_ASSERT(result == row_size * nrows); } break; case LM_GGML_TYPE_Q4_K: { LM_GGML_ASSERT(start % QK_K == 0); - block_q4_K * block = (block_q4_K*)dst + start / QK_K; - result = lm_ggml_quantize_q4_K(src + start, block, n, n, hist); + LM_GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = lm_ggml_row_size(type, n_per_row); + result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + LM_GGML_ASSERT(result == row_size * nrows); } break; case LM_GGML_TYPE_Q5_K: { LM_GGML_ASSERT(start % QK_K == 0); - block_q5_K * block = (block_q5_K*)dst + start / QK_K; - result = lm_ggml_quantize_q5_K(src + start, block, n, n, hist); + LM_GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = lm_ggml_row_size(type, n_per_row); + result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + LM_GGML_ASSERT(result == row_size * nrows); } break; case LM_GGML_TYPE_Q6_K: { LM_GGML_ASSERT(start % QK_K == 0); - block_q6_K * block = (block_q6_K*)dst + start / QK_K; - result = lm_ggml_quantize_q6_K(src + start, block, n, n, hist); + LM_GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = lm_ggml_row_size(type, n_per_row); + result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + LM_GGML_ASSERT(result == row_size * nrows); + } break; + case LM_GGML_TYPE_IQ2_XXS: + { + LM_GGML_ASSERT(start % QK_K == 0); + LM_GGML_ASSERT(start % n_per_row == 0); + LM_GGML_ASSERT(imatrix); + size_t start_row = start / n_per_row; + size_t row_size = lm_ggml_row_size(type, n_per_row); + result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + LM_GGML_ASSERT(result == row_size * nrows); + } break; + case LM_GGML_TYPE_IQ2_XS: + { + LM_GGML_ASSERT(start % QK_K == 0); + LM_GGML_ASSERT(start % n_per_row == 0); + LM_GGML_ASSERT(imatrix); + size_t start_row = start / n_per_row; + size_t row_size = lm_ggml_row_size(type, n_per_row); + result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + LM_GGML_ASSERT(result == row_size * nrows); } break; case LM_GGML_TYPE_F16: { @@ -18877,8 +19121,8 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg (int64_t) info->ne[3]; if (ne % lm_ggml_blck_size(info->type) != 0) { - fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", - __func__, info->name.data, ne, lm_ggml_blck_size(info->type)); + fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", + __func__, info->name.data, (int)info->type, lm_ggml_type_name(info->type), ne, lm_ggml_blck_size(info->type)); fclose(file); lm_gguf_free(ctx); return NULL; @@ -18984,7 +19228,7 @@ void lm_gguf_free(struct lm_gguf_context * ctx) { if (ctx->kv) { // free string memory - not great.. - for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { struct lm_gguf_kv * kv = &ctx->kv[i]; if (kv->key.data) { @@ -19000,7 +19244,7 @@ void lm_gguf_free(struct lm_gguf_context * ctx) { if (kv->type == LM_GGUF_TYPE_ARRAY) { if (kv->value.arr.data) { if (kv->value.arr.type == LM_GGUF_TYPE_STRING) { - for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + for (uint64_t j = 0; j < kv->value.arr.n; ++j) { struct lm_gguf_str * str = &((struct lm_gguf_str *) kv->value.arr.data)[j]; if (str->data) { free(str->data); @@ -19016,7 +19260,7 @@ void lm_gguf_free(struct lm_gguf_context * ctx) { } if (ctx->infos) { - for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct lm_gguf_tensor_info * info = &ctx->infos[i]; if (info->name.data) { @@ -19213,6 +19457,10 @@ char * lm_gguf_get_tensor_name(const struct lm_gguf_context * ctx, int i) { return ctx->infos[i].name.data; } +enum lm_ggml_type lm_gguf_get_tensor_type(const struct lm_gguf_context * ctx, int i) { + return ctx->infos[i].type; +} + // returns the index static int lm_gguf_get_or_add_key(struct lm_gguf_context * ctx, const char * key) { const int idx = lm_gguf_find_key(ctx, key); @@ -19363,7 +19611,7 @@ void lm_gguf_set_kv(struct lm_gguf_context * ctx, struct lm_gguf_context * src) data[j] = ((struct lm_gguf_str *)src->kv[i].value.arr.data)[j].data; } lm_gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); - free(data); + free((void *)data); } else if (src->kv[i].value.arr.type == LM_GGUF_TYPE_ARRAY) { LM_GGML_ASSERT(false && "nested arrays not supported"); } else { @@ -19653,6 +19901,14 @@ int lm_ggml_cpu_has_avx(void) { #endif } +int lm_ggml_cpu_has_avx_vnni(void) { +#if defined(__AVXVNNI__) + return 1; +#else + return 0; +#endif +} + int lm_ggml_cpu_has_avx2(void) { #if defined(__AVX2__) return 1; diff --git a/cpp/ggml.h b/cpp/ggml.h index e9f15cb1..a2d1a140 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -218,7 +218,9 @@ #define LM_GGML_MAX_PARAMS 2048 #define LM_GGML_MAX_CONTEXTS 64 #define LM_GGML_MAX_SRC 10 +#ifndef LM_GGML_MAX_NAME #define LM_GGML_MAX_NAME 64 +#endif #define LM_GGML_MAX_OP_PARAMS 64 #define LM_GGML_DEFAULT_N_THREADS 4 #define LM_GGML_DEFAULT_GRAPH_SIZE 2048 @@ -255,6 +257,8 @@ #define LM_GGML_UNREACHABLE() LM_GGML_ASSERT(!"statement should not be reached") #elif defined(__GNUC__) #define LM_GGML_UNREACHABLE() __builtin_unreachable() +#elif defined(_MSC_VER) +#define LM_GGML_UNREACHABLE() __assume(0) #else #define LM_GGML_UNREACHABLE() ((void) 0) #endif @@ -303,7 +307,7 @@ extern "C" { #if defined(__ARM_NEON) && defined(__CUDACC__) typedef half lm_ggml_fp16_t; -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) && !defined(_MSC_VER) typedef __fp16 lm_ggml_fp16_t; #else typedef uint16_t lm_ggml_fp16_t; @@ -337,6 +341,8 @@ extern "C" { LM_GGML_TYPE_Q5_K = 13, LM_GGML_TYPE_Q6_K = 14, LM_GGML_TYPE_Q8_K = 15, + LM_GGML_TYPE_IQ2_XXS = 16, + LM_GGML_TYPE_IQ2_XS = 17, LM_GGML_TYPE_I8, LM_GGML_TYPE_I16, LM_GGML_TYPE_I32, @@ -371,6 +377,8 @@ extern "C" { LM_GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors LM_GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors LM_GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors + LM_GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors + LM_GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors }; // available tensor operations: @@ -484,7 +492,8 @@ extern "C" { enum lm_ggml_log_level { LM_GGML_LOG_LEVEL_ERROR = 2, LM_GGML_LOG_LEVEL_WARN = 3, - LM_GGML_LOG_LEVEL_INFO = 4 + LM_GGML_LOG_LEVEL_INFO = 4, + LM_GGML_LOG_LEVEL_DEBUG = 5 }; // ggml object @@ -735,8 +744,8 @@ extern "C" { LM_GGML_API struct lm_ggml_tensor * lm_ggml_view_tensor(struct lm_ggml_context * ctx, struct lm_ggml_tensor * src); // Context tensor enumeration and lookup - LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_first_tensor(struct lm_ggml_context * ctx); - LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_next_tensor (struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_first_tensor(const struct lm_ggml_context * ctx); + LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_next_tensor (const struct lm_ggml_context * ctx, struct lm_ggml_tensor * tensor); LM_GGML_API struct lm_ggml_tensor * lm_ggml_get_tensor(struct lm_ggml_context * ctx, const char * name); LM_GGML_API struct lm_ggml_tensor * lm_ggml_set_zero(struct lm_ggml_tensor * tensor); @@ -1094,13 +1103,13 @@ extern "C" { LM_GGML_API struct lm_ggml_tensor * lm_ggml_scale( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b); + float s); // in-place, returns view(a) LM_GGML_API struct lm_ggml_tensor * lm_ggml_scale_inplace( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b); + float s); // b -> view(a,offset,nb1,nb2,3), return modified a LM_GGML_API struct lm_ggml_tensor * lm_ggml_set( @@ -1156,22 +1165,16 @@ extern "C" { struct lm_ggml_tensor * a, struct lm_ggml_tensor * b); - // a -> b, in-place, return view(b) - LM_GGML_API struct lm_ggml_tensor * lm_ggml_cpy_inplace( + LM_GGML_API struct lm_ggml_tensor * lm_ggml_cast( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a, - struct lm_ggml_tensor * b); + enum lm_ggml_type type); // make contiguous LM_GGML_API struct lm_ggml_tensor * lm_ggml_cont( struct lm_ggml_context * ctx, struct lm_ggml_tensor * a); - // make contiguous, in-place - LM_GGML_API struct lm_ggml_tensor * lm_ggml_cont_inplace( - struct lm_ggml_context * ctx, - struct lm_ggml_tensor * a); - // make contiguous, with new shape LM_GGML_API struct lm_ggml_tensor * lm_ggml_cont_1d( struct lm_ggml_context * ctx, @@ -1844,8 +1847,8 @@ extern "C" { // lm_ggml_graph_plan() has to be called before lm_ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - LM_GGML_API struct lm_ggml_cplan lm_ggml_graph_plan (struct lm_ggml_cgraph * cgraph, int n_threads /*= LM_GGML_DEFAULT_N_THREADS*/); - LM_GGML_API int lm_ggml_graph_compute(struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * cplan); + LM_GGML_API struct lm_ggml_cplan lm_ggml_graph_plan (const struct lm_ggml_cgraph * cgraph, int n_threads /*= LM_GGML_DEFAULT_N_THREADS*/); + LM_GGML_API int lm_ggml_graph_compute( struct lm_ggml_cgraph * cgraph, struct lm_ggml_cplan * cplan); // same as lm_ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data @@ -2065,7 +2068,18 @@ extern "C" { LM_GGML_API size_t lm_ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); LM_GGML_API size_t lm_ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); - LM_GGML_API size_t lm_ggml_quantize_chunk(enum lm_ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + LM_GGML_API size_t lm_ggml_quantize_chunk(enum lm_ggml_type type, const float * src, void * dst, + int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix); + + // These are needed for IQ2_XS and IQ2_XXS quantizations + LM_GGML_API void lm_ggml_init_iq2_quantization(enum lm_ggml_type type); + LM_GGML_API void lm_ggml_deinit_iq2_quantization(enum lm_ggml_type type); + + // + // Importance matrix + // + typedef void(*lm_ggml_collect_imatrix_t)(const struct lm_ggml_tensor * src0, const struct lm_ggml_tensor * src1); + LM_GGML_API void lm_ggml_set_imatrix_collection(lm_ggml_collect_imatrix_t imatrix_collect); // // gguf @@ -2135,10 +2149,11 @@ extern "C" { LM_GGML_API const void * lm_gguf_get_arr_data(const struct lm_gguf_context * ctx, int key_id); LM_GGML_API const char * lm_gguf_get_arr_str (const struct lm_gguf_context * ctx, int key_id, int i); - LM_GGML_API int lm_gguf_get_n_tensors (const struct lm_gguf_context * ctx); - LM_GGML_API int lm_gguf_find_tensor (const struct lm_gguf_context * ctx, const char * name); - LM_GGML_API size_t lm_gguf_get_tensor_offset(const struct lm_gguf_context * ctx, int i); - LM_GGML_API char * lm_gguf_get_tensor_name (const struct lm_gguf_context * ctx, int i); + LM_GGML_API int lm_gguf_get_n_tensors (const struct lm_gguf_context * ctx); + LM_GGML_API int lm_gguf_find_tensor (const struct lm_gguf_context * ctx, const char * name); + LM_GGML_API size_t lm_gguf_get_tensor_offset(const struct lm_gguf_context * ctx, int i); + LM_GGML_API char * lm_gguf_get_tensor_name (const struct lm_gguf_context * ctx, int i); + LM_GGML_API enum lm_ggml_type lm_gguf_get_tensor_type (const struct lm_gguf_context * ctx, int i); // overrides existing values or adds a new one LM_GGML_API void lm_gguf_set_val_u8 (struct lm_gguf_context * ctx, const char * key, uint8_t val); @@ -2194,6 +2209,7 @@ extern "C" { // LM_GGML_API int lm_ggml_cpu_has_avx (void); + LM_GGML_API int lm_ggml_cpu_has_avx_vnni (void); LM_GGML_API int lm_ggml_cpu_has_avx2 (void); LM_GGML_API int lm_ggml_cpu_has_avx512 (void); LM_GGML_API int lm_ggml_cpu_has_avx512_vbmi(void); diff --git a/cpp/llama.cpp b/cpp/llama.cpp index db1151fb..49db6f9e 100644 --- a/cpp/llama.cpp +++ b/cpp/llama.cpp @@ -4,8 +4,8 @@ #include "unicode.h" #include "ggml.h" - #include "ggml-alloc.h" +#include "ggml-backend.h" #ifdef LM_GGML_USE_CUBLAS # include "ggml-cuda.h" @@ -32,6 +32,7 @@ #include #if defined(_POSIX_MAPPED_FILES) #include + #include #endif #if defined(_POSIX_MEMLOCK_RANGE) #include @@ -161,10 +162,6 @@ static bool is_float_close(float a, float b, float abs_tol) { return std::fabs(b - a) <= abs_tol; } -#ifdef LM_GGML_USE_CPU_HBM -#include -#endif - static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -207,6 +204,7 @@ enum llm_arch { LLM_ARCH_STABLELM, LLM_ARCH_QWEN, LLM_ARCH_PHI2, + LLM_ARCH_PLAMO, LLM_ARCH_UNKNOWN, }; @@ -225,6 +223,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_QWEN, "qwen" }, { LLM_ARCH_PHI2, "phi2" }, + { LLM_ARCH_PLAMO, "plamo" }, }; enum llm_kv { @@ -252,6 +251,8 @@ enum llm_kv { LLM_KV_ATTENTION_HEAD_COUNT_KV, LLM_KV_ATTENTION_MAX_ALIBI_BIAS, LLM_KV_ATTENTION_CLAMP_KQV, + LLM_KV_ATTENTION_KEY_LENGTH, + LLM_KV_ATTENTION_VALUE_LENGTH, LLM_KV_ATTENTION_LAYERNORM_EPS, LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, @@ -304,6 +305,8 @@ static std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" }, { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" }, + { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" }, + { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" }, { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, @@ -361,6 +364,7 @@ enum llm_tensor { LLM_TENSOR_FFN_GATE, LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_ACT, LLM_TENSOR_FFN_DOWN_EXP, LLM_TENSOR_FFN_GATE_EXP, LLM_TENSOR_FFN_UP_EXP, @@ -429,6 +433,15 @@ static std::map> LLM_TENSOR_NAMES = LLM_ARCH_GPT2, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_POS_EMBD, "position_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, }, }, { @@ -480,6 +493,7 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" }, }, }, { @@ -571,7 +585,28 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, + { + LLM_ARCH_PLAMO, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, @@ -723,38 +758,6 @@ static void lm_ggml_graph_compute_helper(std::vector & buf, lm_ggml_cgr // llama helpers // -inline void * llama_host_malloc(size_t n) { -#ifdef LM_GGML_USE_CUBLAS - if (lm_ggml_cublas_loaded()) { - return lm_ggml_cuda_host_malloc(n); - } else { - return malloc(n); - } -#elif LM_GGML_USE_METAL - return lm_ggml_metal_host_malloc(n); -#elif LM_GGML_USE_CPU_HBM - return hbw_malloc(n); -#else - return malloc(n); -#endif -} - -inline void llama_host_free(void * ptr) { -#ifdef LM_GGML_USE_CUBLAS - if (lm_ggml_cublas_loaded()) { - return lm_ggml_cuda_host_free(ptr); - } else { - return free(ptr); - } -#elif LM_GGML_USE_METAL - return lm_ggml_metal_host_free(ptr); -#elif LM_GGML_USE_CPU_HBM - return hbw_free(ptr); -#else - return free(ptr); -#endif -} - #if defined(_WIN32) static std::string llama_format_win_err(DWORD err) { LPSTR buf; @@ -769,40 +772,10 @@ static std::string llama_format_win_err(DWORD err) { } #endif -struct llama_buffer { - void * data = NULL; - size_t size = 0; - - // fallback to malloc / free - // useful in cases where CUDA can try to allocate PINNED memory - bool fallback = false; - - void resize(size_t n) { - llama_host_free(data); - - data = llama_host_malloc(n); - if (!data) { - fallback = true; - data = malloc(n); - } else { - fallback = false; - } - - LM_GGML_ASSERT(data); - size = n; - } - - ~llama_buffer() { - if (data) { - if (fallback) { // NOLINT - free(data); - } else { - llama_host_free(data); - } - } - - data = NULL; - } +template +struct no_init { + T value; + no_init() { /* do nothing */ } }; struct llama_file { @@ -849,7 +822,7 @@ struct llama_file { throw std::runtime_error(format("read error: %s", strerror(errno))); } if (ret != 1) { - throw std::runtime_error(std::string("unexpectedly reached end of file")); + throw std::runtime_error("unexpectedly reached end of file"); } } @@ -890,6 +863,9 @@ struct llama_mmap { #ifdef _POSIX_MAPPED_FILES static constexpr bool SUPPORTED = true; + // list of mapped fragments (first_offset, last_offset) + std::vector> mapped_fragments; + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { size = file->size; int fd = fileno(file->fp); @@ -897,15 +873,20 @@ struct llama_mmap { // prefetch/readahead impairs performance on NUMA systems if (numa) { prefetch = 0; } #ifdef __linux__ + // advise the kernel to read the file sequentially (increases readahead) + if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) { + LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n", + strerror(errno)); + } if (prefetch) { flags |= MAP_POPULATE; } #endif addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); - if (addr == MAP_FAILED) { + if (addr == MAP_FAILED) { // NOLINT throw std::runtime_error(format("mmap failed: %s", strerror(errno))); } if (prefetch > 0) { - // Advise the kernel to preload the mapped memory + // advise the kernel to preload the mapped memory if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", strerror(errno)); @@ -919,37 +900,105 @@ struct llama_mmap { strerror(errno)); } } + + // initialize list of mapped_fragments + mapped_fragments.emplace_back(0, file->size); + } + + static void align_range(size_t * first, size_t * last, size_t page_size) { + // align first to the next page + size_t offset_in_page = *first & (page_size - 1); + size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page; + *first += offset_to_page; + + // align last to the previous page + *last = *last & ~(page_size - 1); + + if (*last <= *first) { + *last = *first; + } + } + + // partially unmap the file in the range [first, last) + void unmap_fragment(size_t first, size_t last) { + // note: this function must not be called multiple times with overlapping ranges + // otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings + int page_size = sysconf(_SC_PAGESIZE); + align_range(&first, &last, page_size); + size_t len = last - first; + + if (len == 0) { + return; + } + + LM_GGML_ASSERT(first % page_size == 0); + LM_GGML_ASSERT(last % page_size == 0); + LM_GGML_ASSERT(last > first); + + void * next_page_start = (uint8_t *) addr + first; + + // unmap the range + if (munmap(next_page_start, len)) { + LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno)); + } + + // update the list of mapped fragments to avoid unmapping the same range again in the destructor + std::vector> new_mapped_fragments; + for (const auto & frag : mapped_fragments) { + if (frag.first < first && frag.second > last) { + // the range is in the middle of the fragment, split it + new_mapped_fragments.emplace_back(frag.first, first); + new_mapped_fragments.emplace_back(last, frag.second); + } else if (frag.first < first && frag.second > first) { + // the range starts in the middle of the fragment + new_mapped_fragments.emplace_back(frag.first, first); + } else if (frag.first < last && frag.second > last) { + // the range ends in the middle of the fragment + new_mapped_fragments.emplace_back(last, frag.second); + } else if (frag.first >= first && frag.second <= last) { + // the range covers the entire fragment + } else { + // the range is outside the fragment + new_mapped_fragments.push_back(frag); + } + } + mapped_fragments = std::move(new_mapped_fragments); } ~llama_mmap() { - munmap(addr, size); + for (const auto & frag : mapped_fragments) { + if (munmap((char *) addr + frag.first, frag.second - frag.first)) { + LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno)); + } + } } #elif defined(_WIN32) static constexpr bool SUPPORTED = true; - llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { - (void) numa; + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) { + LM_GGML_UNUSED(numa); size = file->size; HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); - DWORD error = GetLastError(); if (hMapping == NULL) { + DWORD error = GetLastError(); throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); } addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); - error = GetLastError(); + DWORD error = GetLastError(); CloseHandle(hMapping); if (addr == NULL) { throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); } - if (prefetch) { + if (prefetch > 0) { +#if _WIN32_WINNT >= 0x602 // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll"); @@ -961,30 +1010,46 @@ struct llama_mmap { // advise the kernel to preload the mapped memory WIN32_MEMORY_RANGE_ENTRY range; range.VirtualAddress = addr; - range.NumberOfBytes = (SIZE_T)size; + range.NumberOfBytes = (SIZE_T) std::min(size, prefetch); if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { - fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n", llama_format_win_err(GetLastError()).c_str()); } } +#else + throw std::runtime_error("PrefetchVirtualMemory unavailable"); +#endif } } + void unmap_fragment(size_t first, size_t last) { + // not supported + LM_GGML_UNUSED(first); + LM_GGML_UNUSED(last); + } + ~llama_mmap() { if (!UnmapViewOfFile(addr)) { - fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", + LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n", llama_format_win_err(GetLastError()).c_str()); } } #else static constexpr bool SUPPORTED = false; - llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { - (void) file; - (void) prefetch; - (void) numa; + llama_mmap(struct llama_file * file, size_t prefetch = -1, bool numa = false) { + LM_GGML_UNUSED(file); + LM_GGML_UNUSED(prefetch); + LM_GGML_UNUSED(numa); + + throw std::runtime_error("mmap not supported"); + } + + void unmap_fragment(size_t first, size_t last) { + LM_GGML_UNUSED(first); + LM_GGML_UNUSED(last); - throw std::runtime_error(std::string("mmap not supported")); + throw std::runtime_error("mmap not supported"); } #endif }; @@ -1060,7 +1125,7 @@ struct llama_mlock { suggest = false; } - fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", + LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); return false; } @@ -1069,7 +1134,7 @@ struct llama_mlock { static void raw_unlock(void * addr, size_t size) { if (munlock(addr, size)) { - fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); + LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno)); } } #elif defined(_WIN32) @@ -1087,7 +1152,7 @@ struct llama_mlock { return true; } if (tries == 2) { - fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", + LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", len, size, llama_format_win_err(GetLastError()).c_str()); return false; } @@ -1096,7 +1161,7 @@ struct llama_mlock { // set size and try again. SIZE_T min_ws_size, max_ws_size; if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { - fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", + LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n", llama_format_win_err(GetLastError()).c_str()); return false; } @@ -1109,7 +1174,7 @@ struct llama_mlock { min_ws_size += increment; max_ws_size += increment; if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { - fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", + LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n", llama_format_win_err(GetLastError()).c_str()); return false; } @@ -1118,7 +1183,7 @@ struct llama_mlock { static void raw_unlock(void * ptr, size_t len) { if (!VirtualUnlock(ptr, len)) { - fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", + LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n", llama_format_win_err(GetLastError()).c_str()); } } @@ -1130,7 +1195,7 @@ struct llama_mlock { } bool raw_lock(const void * addr, size_t len) const { - fprintf(stderr, "warning: mlock not supported on this system\n"); + LLAMA_LOG_WARN("warning: mlock not supported on this system\n"); return false; } @@ -1138,12 +1203,6 @@ struct llama_mlock { #endif }; -typedef void (*offload_func_t)(struct lm_ggml_tensor * tensor); - -static void lm_ggml_offload_nop(struct lm_ggml_tensor * tensor) { - (void) tensor; -} - static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); @@ -1159,6 +1218,62 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_ return std::string(result.data(), result.size()); } +static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) { + lm_ggml_backend_buffer_type_t buft = nullptr; + +#if defined(LM_GGML_USE_CUBLAS) + // host buffers should only be used when data is expected to be copied to/from the GPU + if (host_buffer) { + buft = lm_ggml_backend_cuda_host_buffer_type(); + } +#elif defined(LM_GGML_USE_CPU_HBM) + buft = lm_ggml_backend_cpu_hbm_buffer_type(); +#endif + + if (buft == nullptr) { + buft = lm_ggml_backend_cpu_buffer_type(); + } + return buft; + + LM_GGML_UNUSED(host_buffer); +} + +static lm_ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { + lm_ggml_backend_buffer_type_t buft = nullptr; + +#ifdef LM_GGML_USE_METAL + buft = lm_ggml_backend_metal_buffer_type(); +#elif defined(LM_GGML_USE_CUBLAS) + buft = lm_ggml_backend_cuda_buffer_type(gpu); +#elif defined(LM_GGML_USE_CLBLAST) + buft = lm_ggml_backend_opencl_buffer_type(); +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_cpu(true); + } + return buft; + + LM_GGML_UNUSED(gpu); +} + +static lm_ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) { + lm_ggml_backend_buffer_type_t buft = nullptr; + +#ifdef LM_GGML_USE_CUBLAS + if (lm_ggml_backend_cuda_get_device_count() > 1) { + buft = lm_ggml_backend_cuda_split_buffer_type(tensor_split); + } +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_offload(fallback_gpu); + } + return buft; + + LM_GGML_UNUSED(tensor_split); +} + // // globals // @@ -1166,7 +1281,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_ struct llama_state { llama_state() { #ifdef LM_GGML_USE_METAL - lm_ggml_metal_log_set_callback(log_callback, log_callback_user_data); + lm_ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data); #endif } @@ -1191,6 +1306,10 @@ enum e_model { MODEL_40B, MODEL_65B, MODEL_70B, + MODEL_SMALL, + MODEL_MEDIUM, + MODEL_LARGE, + MODEL_XL, }; static const size_t kiB = 1024; @@ -1206,6 +1325,8 @@ struct llama_hparams { uint32_t n_head_kv; uint32_t n_layer; uint32_t n_rot; + uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads + uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head uint32_t n_ff; uint32_t n_expert = 0; uint32_t n_expert_used = 0; @@ -1222,6 +1343,7 @@ struct llama_hparams { float f_clamp_kqv; float f_max_alibi_bias; + bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; if (this->n_vocab != other.n_vocab) return true; @@ -1231,6 +1353,8 @@ struct llama_hparams { if (this->n_head_kv != other.n_head_kv) return true; if (this->n_layer != other.n_layer) return true; if (this->n_rot != other.n_rot) return true; + if (this->n_embd_head_k != other.n_embd_head_k) return true; + if (this->n_embd_head_v != other.n_embd_head_v) return true; if (this->n_ff != other.n_ff) return true; if (this->n_expert != other.n_expert) return true; if (this->n_expert_used != other.n_expert_used) return true; @@ -1238,7 +1362,7 @@ struct llama_hparams { if (this->rope_finetuned != other.rope_finetuned) return true; if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; - const float EPSILON = 1e-9; + const float EPSILON = 1e-9f; if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; @@ -1252,12 +1376,12 @@ struct llama_hparams { return n_head/n_head_kv; } - uint32_t n_embd_head() const { - return n_embd/n_head; + uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads + return n_embd_head_k * n_head_kv; } - uint32_t n_embd_gqa() const { - return n_embd/n_gqa(); + uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads + return n_embd_head_v * n_head_kv; } }; @@ -1325,6 +1449,7 @@ struct llama_layer { // ff bias struct lm_ggml_tensor * ffn_down_b; // b2 struct lm_ggml_tensor * ffn_up_b; // b3 + struct lm_ggml_tensor * ffn_act; }; struct llama_kv_cell { @@ -1357,23 +1482,24 @@ struct llama_kv_cache { std::vector k_l; // per layer std::vector v_l; - struct lm_ggml_context * ctx = NULL; + std::vector ctxs; + std::vector bufs; - llama_buffer buf; + size_t total_size() const { + size_t size = 0; + for (lm_ggml_backend_buffer_t buf : bufs) { + size += lm_ggml_backend_buffer_get_size(buf); + } + return size; + } ~llama_kv_cache() { - if (ctx) { + for (struct lm_ggml_context * ctx : ctxs) { lm_ggml_free(ctx); } - -#ifdef LM_GGML_USE_CUBLAS - if (lm_ggml_cublas_loaded()) { - for (size_t i = 0; i < k_l.size(); ++i) { - lm_ggml_cuda_free_data(k_l[i]); - lm_ggml_cuda_free_data(v_l[i]); - } + for (lm_ggml_backend_buffer_t buf : bufs) { + lm_ggml_backend_buffer_free(buf); } -#endif } }; @@ -1413,11 +1539,11 @@ struct llama_vocab { id special_suffix_id = 32008; id special_eot_id = 32010; - int find_bpe_rank(std::string token_left, std::string token_right) const { - LM_GGML_ASSERT(token_left.find(" ") == std::string::npos); - LM_GGML_ASSERT(token_left.find("\n") == std::string::npos); - LM_GGML_ASSERT(token_right.find(" ") == std::string::npos); - LM_GGML_ASSERT(token_right.find("\n") == std::string::npos); + int find_bpe_rank(const std::string & token_left, const std::string & token_right) const { + LM_GGML_ASSERT(token_left.find(' ') == std::string::npos); + LM_GGML_ASSERT(token_left.find('\n') == std::string::npos); + LM_GGML_ASSERT(token_right.find(' ') == std::string::npos); + LM_GGML_ASSERT(token_right.find('\n') == std::string::npos); auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); if (it == bpe_ranks.end()) { @@ -1450,16 +1576,32 @@ struct llama_model { std::vector layers; + llama_split_mode split_mode; + int main_gpu; int n_gpu_layers; // gguf metadata std::unordered_map lm_gguf_kv; - // context - struct lm_ggml_context * ctx = NULL; + // layer -> buffer type mapping + struct layer_buft { + layer_buft() : buft_matrix(nullptr), buft(nullptr) {} + layer_buft(lm_ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {} + layer_buft(lm_ggml_backend_buffer_type_t matrix, lm_ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {} + + lm_ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication + lm_ggml_backend_buffer_type_t buft; // everything else + }; + + layer_buft buft_input; + layer_buft buft_output; + std::vector buft_layer; + + // contexts where the model tensors metadata is stored + std::vector ctxs; - // the model memory buffer - llama_buffer buf; + // the model memory buffers for the tensor data + std::vector bufs; // model memory mapped file std::unique_ptr mapping; @@ -1475,42 +1617,33 @@ struct llama_model { int64_t t_start_us = 0; ~llama_model() { - if (ctx) { + for (struct lm_ggml_context * ctx : ctxs) { lm_ggml_free(ctx); } - -#ifdef LM_GGML_USE_CUBLAS - if (lm_ggml_cublas_loaded()) { - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - lm_ggml_cuda_free_data(tensors_by_name[i].second); - } - lm_ggml_cuda_free_scratch(); - } -#endif - -#if defined(LM_GGML_USE_CLBLAST) - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - lm_ggml_cl_free_data(tensors_by_name[i].second); + for (lm_ggml_backend_buffer_t buf : bufs) { + lm_ggml_backend_buffer_free(buf); } -#endif } }; struct llama_context { llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} ~llama_context() { -#ifdef LM_GGML_USE_METAL - if (ctx_metal) { - lm_ggml_metal_free(ctx_metal); - } -#endif - if (alloc) { - lm_ggml_allocr_free(alloc); + lm_ggml_backend_sched_free(sched); + + for (lm_ggml_backend_t backend : backends) { + lm_ggml_backend_free(backend); } } llama_cparams cparams; + std::vector backends; +#ifdef LM_GGML_USE_METAL + lm_ggml_backend_t backend_metal = nullptr; +#endif + lm_ggml_backend_t backend_cpu = nullptr; + const llama_model & model; // key + value cache for the self attention @@ -1541,18 +1674,14 @@ struct llama_context { // input embedding (1-dimensional array: [n_embd]) std::vector embedding; - // reusable buffer for `struct lm_ggml_graph_plan.work_data` - std::vector work_buffer; - // memory buffers used to evaluate the model - llama_buffer buf_compute; - - llama_buffer buf_alloc; - lm_ggml_allocr * alloc = NULL; + std::vector buf_compute_meta; + lm_ggml_backend_sched_t sched = nullptr; + // allocator for the input tensors + lm_ggml_tallocr * alloc = nullptr; -#ifdef LM_GGML_USE_METAL - lm_ggml_metal_context * ctx_metal = NULL; -#endif + // temporary buffer for copying data to/from the backend + std::vector> buf_copy; #ifdef LM_GGML_USE_MPI lm_ggml_mpi_context * ctx_mpi = NULL; @@ -1564,18 +1693,17 @@ struct llama_context { // static bool llama_kv_cache_init( - const struct llama_hparams & hparams, struct llama_kv_cache & cache, + const llama_model & model, lm_ggml_type ktype, lm_ggml_type vtype, uint32_t n_ctx, - int n_gpu_layers, bool offload) { - const uint32_t n_embd = hparams.n_embd_gqa(); - const uint32_t n_layer = hparams.n_layer; + const struct llama_hparams & hparams = model.hparams; - const int64_t n_mem = n_layer*n_ctx; - const int64_t n_elements = n_embd*n_mem; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const int64_t n_layer = hparams.n_layer; cache.has_shift = false; @@ -1586,55 +1714,65 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); - cache.buf.resize(lm_ggml_row_size(ktype, n_elements) + lm_ggml_row_size(vtype, n_elements) + 2u*n_layer*lm_ggml_tensor_overhead()); - memset(cache.buf.data, 0, cache.buf.size); - - struct lm_ggml_init_params params; - params.mem_size = cache.buf.size; - params.mem_buffer = cache.buf.data; - params.no_alloc = false; - - cache.ctx = lm_ggml_init(params); +#ifdef LM_GGML_USE_CLBLAST + offload = false; +#endif - size_t vram_kv_cache = 0; + // count used buffer types + std::map buft_layer_count; + if (offload) { + for (int64_t i = 0; i < n_layer; ++i) { + buft_layer_count[model.buft_layer[i].buft]++; + } + } else { + buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer; + } - if (!cache.ctx) { - LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__); - return false; + // create a context for each buffer type + std::map ctx_map; + for (auto & it : buft_layer_count) { + int n_layers = it.second; + struct lm_ggml_init_params params = { + /*.mem_size =*/ 2u*n_layers*lm_ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + lm_ggml_context * ctx = lm_ggml_init(params); + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__); + return false; + } + ctx_map[it.first] = ctx; + cache.ctxs.push_back(ctx); } cache.k_l.reserve(n_layer); cache.v_l.reserve(n_layer); - const int i_gpu_start = (int) n_layer - n_gpu_layers; LM_GGML_UNUSED(i_gpu_start); - - LM_GGML_UNUSED(offload); - for (int i = 0; i < (int) n_layer; i++) { - lm_ggml_tensor * k = lm_ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx); - lm_ggml_tensor * v = lm_ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx); + struct lm_ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); + lm_ggml_tensor * k = lm_ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx); + lm_ggml_tensor * v = lm_ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx); lm_ggml_format_name(k, "cache_k_l%d", i); lm_ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); cache.v_l.push_back(v); -#ifdef LM_GGML_USE_CUBLAS - if (i >= i_gpu_start) { - if (offload) { - lm_ggml_cuda_assign_buffers_no_scratch(k); - vram_kv_cache += lm_ggml_nbytes(k); - lm_ggml_cuda_assign_buffers_no_scratch(v); - vram_kv_cache += lm_ggml_nbytes(v); - } - } -#endif // LM_GGML_USE_CUBLAS } - if (vram_kv_cache > 0) { - LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); + // allocate tensors and initialize the buffers to avoid NaNs in the padding + for (auto it : ctx_map) { + lm_ggml_backend_buffer_type_t buft = it.first; + lm_ggml_context * ctx = it.second; + lm_ggml_backend_buffer_t buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__); + return false; + } + lm_ggml_backend_buffer_clear(buf, 0); + LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf), lm_ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + cache.bufs.push_back(buf); } - LM_GGML_UNUSED(n_gpu_layers); - return true; } @@ -1816,6 +1954,28 @@ static void llama_kv_cache_seq_shift( cache.head = new_head != cache.size ? new_head : 0; } +static void llama_kv_cache_seq_div( + struct llama_kv_cache & cache, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d) { + if (p0 < 0) p0 = 0; + if (p1 < 0) p1 = std::numeric_limits::max(); + + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + cache.has_shift = true; + + { + llama_pos p_old = cache.cells[i].pos; + cache.cells[i].pos /= d; + cache.cells[i].delta += cache.cells[i].pos - p_old; + } + } + } +} + // // model loading and saving // @@ -1936,13 +2096,13 @@ namespace GGUFMeta { __func__, override_type_to_str(override->tag), override->key); switch (override->tag) { case LLAMA_KV_OVERRIDE_BOOL: { - printf("%s\n", override->bool_value ? "true" : "false"); + LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false"); } break; case LLAMA_KV_OVERRIDE_INT: { - printf("%" PRId64 "\n", override->int_value); + LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value); } break; case LLAMA_KV_OVERRIDE_FLOAT: { - printf("%.6f\n", override->float_value); + LLAMA_LOG_INFO("%.6f\n", override->float_value); } break; default: // Shouldn't be possible to end up here, but just in case... @@ -2041,6 +2201,11 @@ struct llama_model_loader { LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") { + int trace = 0; + if (getenv("LLAMA_TRACE")) { + trace = atoi(getenv("LLAMA_TRACE")); + } + struct lm_gguf_init_params params = { /*.no_alloc = */ true, /*.ctx = */ &ctx_meta, @@ -2084,17 +2249,19 @@ struct llama_model_loader { enum lm_ggml_type type_max = LM_GGML_TYPE_F32; for (int i = 0; i < n_tensors; i++) { - const char * name = lm_gguf_get_tensor_name(ctx_gguf, i); - struct lm_ggml_tensor * meta = lm_ggml_get_tensor(ctx_meta, name); + enum lm_ggml_type type = lm_gguf_get_tensor_type(ctx_gguf, i); - n_type[meta->type]++; + n_type[type]++; - if (n_type_max < n_type[meta->type]) { - n_type_max = n_type[meta->type]; - type_max = meta->type; + if (n_type_max < n_type[type]) { + n_type_max = n_type[type]; + type_max = type; } - LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, lm_ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str()); + if (trace > 0) { + struct lm_ggml_tensor * meta = lm_ggml_get_tensor(ctx_meta, lm_gguf_get_tensor_name(ctx_gguf, i)); + LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, lm_ggml_get_name(meta), lm_ggml_type_name(type), llama_format_tensor_shape(meta).c_str()); + } } switch (type_max) { @@ -2110,6 +2277,8 @@ struct llama_model_loader { case LM_GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break; case LM_GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; case LM_GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; + case LM_GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; + case LM_GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, lm_ggml_type_name(type_max)); @@ -2232,40 +2401,24 @@ struct llama_model_loader { return lm_gguf_get_tensor_name(ctx_gguf, i); } - struct lm_ggml_tensor * get_tensor_meta(int i) const { - return lm_ggml_get_tensor(ctx_meta, get_tensor_name(i)); + struct lm_ggml_tensor * get_tensor_meta(const char * name) const { + return lm_ggml_get_tensor(ctx_meta, name); } - void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const { - ctx_size_p = 0; - mmapped_size_p = 0; - - for (int i = 0; i < n_tensors; i++) { - struct lm_ggml_tensor * meta = get_tensor_meta(i); - ctx_size_p += sizeof(struct lm_ggml_tensor) + LM_GGML_OBJECT_SIZE; - (use_mmap ? mmapped_size_p : ctx_size_p) += lm_ggml_nbytes_pad(meta); - } + struct lm_ggml_tensor * get_tensor_meta(int i) const { + return get_tensor_meta(get_tensor_name(i)); } - struct lm_ggml_tensor * create_tensor_for(struct lm_ggml_context * ctx, struct lm_ggml_tensor * meta, lm_ggml_backend_type backend) { - if (backend != LM_GGML_BACKEND_CPU) { - lm_ggml_set_no_alloc(ctx, true); - } - + struct lm_ggml_tensor * create_tensor_for(struct lm_ggml_context * ctx, struct lm_ggml_tensor * meta) { struct lm_ggml_tensor * tensor = lm_ggml_dup_tensor(ctx, meta); - tensor->backend = backend; // TODO: lm_ggml_set_backend lm_ggml_set_name(tensor, lm_ggml_get_name(meta)); - if (backend != LM_GGML_BACKEND_CPU) { - lm_ggml_set_no_alloc(ctx, use_mmap); - } - n_created++; return tensor; } - struct lm_ggml_tensor * create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::vector & ne, lm_ggml_backend_type backend, bool required = true) { + struct lm_ggml_tensor * create_tensor(struct lm_ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { struct lm_ggml_tensor * cur = lm_ggml_get_tensor(ctx_meta, name.c_str()); if (cur == NULL) { @@ -2275,12 +2428,6 @@ struct llama_model_loader { throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } - if (backend == LM_GGML_BACKEND_GPU_SPLIT) { - if (ne.size() == 1) { - throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); - } - } - { bool is_ok = true; for (size_t i = 0; i < ne.size(); ++i) { @@ -2298,7 +2445,7 @@ struct llama_model_loader { } } - return create_tensor_for(ctx, cur, backend); + return create_tensor_for(ctx, cur); } void done_getting_tensors() const { @@ -2317,93 +2464,126 @@ struct llama_model_loader { return lm_gguf_get_data_offset(ctx_gguf) + lm_gguf_get_tensor_offset(ctx_gguf, idx); } - void load_data_for(struct lm_ggml_tensor * cur) const { - const size_t offs = file_offset(lm_ggml_get_name(cur)); - + void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) { + // prefetch the whole file - all the data is needed anyway if (use_mmap) { - cur->data = (uint8_t *) mapping->addr + offs; - } else { - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, lm_ggml_nbytes(cur)); + mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, lm_ggml_is_numa())); } - } - - void load_all_data(struct lm_ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { - size_t size_data = 0; - size_t size_lock = 0; - size_t size_pref = 0; // prefetch + // compute the total size of all tensors for progress reporting for (int i = 0; i < lm_gguf_get_n_tensors(ctx_gguf); i++) { - struct lm_ggml_tensor * cur = lm_ggml_get_tensor(ctx, lm_gguf_get_tensor_name(ctx_gguf, i)); + struct lm_ggml_tensor * cur = lm_ggml_get_tensor(ctx_meta, lm_gguf_get_tensor_name(ctx_gguf, i)); size_data += lm_ggml_nbytes(cur); - if (cur->backend == LM_GGML_BACKEND_CPU) { - size_pref += lm_ggml_nbytes(cur); - } } - if (use_mmap) { - mapping.reset(new llama_mmap(&file, size_pref, lm_ggml_is_numa())); + if (use_mmap && mapping) { if (lmlock) { lmlock->init(mapping->addr); } + mmap_used_first = mapping->size; + } + } + + void get_mapping_range(size_t * first, size_t * last, lm_ggml_context * ctx) const { + LM_GGML_ASSERT(mapping); + + *first = mapping->size; + *last = 0; + for (lm_ggml_tensor * tensor = lm_ggml_get_first_tensor(ctx); tensor; tensor = lm_ggml_get_next_tensor(ctx, tensor)) { + const size_t offs = file_offset(lm_ggml_get_name(tensor)); + *first = std::min(*first, offs); + *last = std::max(*last, offs + lm_ggml_nbytes(tensor)); + } + } + + // for backwards compatibility, does not support ggml-backend + void load_data_for(struct lm_ggml_tensor * cur) const { + const size_t offs = file_offset(lm_ggml_get_name(cur)); + + if (use_mmap && mapping) { + if (cur->data == nullptr) { + cur->data = (uint8_t *)mapping->addr + offs; + } else { + memcpy(cur->data, (uint8_t *)mapping->addr + offs, lm_ggml_nbytes(cur)); + } + } else { + LM_GGML_ASSERT(cur->data != nullptr); + file.seek(offs, SEEK_SET); + file.read_raw(cur->data, lm_ggml_nbytes(cur)); } + } + + size_t size_done = 0; + size_t size_data = 0; + size_t mmap_used_first = -1; + size_t mmap_used_last = 0; + + // Returns false if cancelled by progress_callback + bool load_all_data(struct lm_ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, lm_ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) { + LM_GGML_ASSERT(size_data != 0 && "call init_mapping() first"); + + std::vector> read_buf; - size_t done_size = 0; for (int i = 0; i < lm_gguf_get_n_tensors(ctx_gguf); i++) { struct lm_ggml_tensor * cur = lm_ggml_get_tensor(ctx, lm_gguf_get_tensor_name(ctx_gguf, i)); - LM_GGML_ASSERT(cur); // unused tensors should have been caught by load_data already - - if (progress_callback) { - progress_callback((float) done_size / size_data, progress_callback_user_data); + if (!cur) { + // some tensors may be allocated in a different context + continue; } - // allocate temp buffer if not using mmap - if (!use_mmap && cur->data == NULL) { - LM_GGML_ASSERT(cur->backend != LM_GGML_BACKEND_CPU); - #ifdef LM_GGML_USE_CPU_HBM - cur->data = (uint8_t*)hbw_malloc(lm_ggml_nbytes(cur)); - #else - cur->data = (uint8_t*)malloc(lm_ggml_nbytes(cur)); - #endif + if (progress_callback) { + if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { + return false; + } } - load_data_for(cur); + const size_t offs = file_offset(lm_ggml_get_name(cur)); - switch (cur->backend) { - case LM_GGML_BACKEND_CPU: - if (use_mmap && lmlock) { - size_lock += lm_ggml_nbytes(cur); - lmlock->grow_to(size_lock); - } - break; -#ifdef LM_GGML_USE_CUBLAS - case LM_GGML_BACKEND_GPU: - case LM_GGML_BACKEND_GPU_SPLIT: - // old code: - //lm_ggml_cuda_transform_tensor(lt.data, lt.lm_ggml_tensor); - - // TODO: test if this works !! - lm_ggml_cuda_transform_tensor(cur->data, cur); - if (!use_mmap) { - free(cur->data); - } - break; -#elif defined(LM_GGML_USE_CLBLAST) - case LM_GGML_BACKEND_GPU: - lm_ggml_cl_transform_tensor(cur->data, cur); - if (!use_mmap) { - free(cur->data); + if (use_mmap && mapping) { + if (buf_mmap && cur->data == nullptr) { + lm_ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); + if (lmlock) { + lmlock->grow_to(offs + lm_ggml_nbytes(cur)); } - break; -#endif - default: - continue; + mmap_used_first = std::min(mmap_used_first, offs); + mmap_used_last = std::max(mmap_used_last, offs + lm_ggml_nbytes(cur)); + } else { + lm_ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, lm_ggml_nbytes(cur)); + } + } else { + if (lm_ggml_backend_buffer_is_host(cur->buffer)) { + file.seek(offs, SEEK_SET); + file.read_raw(cur->data, lm_ggml_nbytes(cur)); + } else { + read_buf.resize(lm_ggml_nbytes(cur)); + file.seek(offs, SEEK_SET); + file.read_raw(read_buf.data(), lm_ggml_nbytes(cur)); + lm_ggml_backend_tensor_set(cur, read_buf.data(), 0, lm_ggml_nbytes(cur)); + } } - done_size += lm_ggml_nbytes(cur); + size_done += lm_ggml_nbytes(cur); } - } -}; + + // check if this is the last call and do final cleanup + if (size_done >= size_data) { + // unmap offloaded tensors and metadata + if (use_mmap && mapping) { + mapping->unmap_fragment(0, mmap_used_first); + if (mmap_used_last != 0) { + mapping->unmap_fragment(mmap_used_last, mapping->size); + } + } + if (progress_callback) { + // Even though the model is done loading, we still honor + // cancellation since we need to free allocations. + return progress_callback(1.0f, progress_callback_user_data); + } + } + + return true; + } +}; // // load LLaMA models @@ -2434,7 +2614,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; // K-quants - case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K"; + case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; @@ -2443,6 +2624,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; + case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; default: return "unknown, may not work"; } @@ -2450,18 +2633,22 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { static const char * llama_model_type_name(e_model type) { switch (type) { - case MODEL_1B: return "1B"; - case MODEL_3B: return "3B"; - case MODEL_7B: return "7B"; - case MODEL_8B: return "8B"; - case MODEL_13B: return "13B"; - case MODEL_15B: return "15B"; - case MODEL_30B: return "30B"; - case MODEL_34B: return "34B"; - case MODEL_40B: return "40B"; - case MODEL_65B: return "65B"; - case MODEL_70B: return "70B"; - default: return "?B"; + case MODEL_1B: return "1B"; + case MODEL_3B: return "3B"; + case MODEL_7B: return "7B"; + case MODEL_8B: return "8B"; + case MODEL_13B: return "13B"; + case MODEL_15B: return "15B"; + case MODEL_30B: return "30B"; + case MODEL_34B: return "34B"; + case MODEL_40B: return "40B"; + case MODEL_65B: return "65B"; + case MODEL_70B: return "70B"; + case MODEL_SMALL: return "0.1B"; + case MODEL_MEDIUM: return "0.4B"; + case MODEL_LARGE: return "0.8B"; + case MODEL_XL: return "1.5B"; + default: return "?B"; } } @@ -2553,6 +2740,12 @@ static void llm_load_hparams( // gpt-j n_rot = rotary_dim } + hparams.n_embd_head_k = hparams.n_embd / hparams.n_head; + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false); + + hparams.n_embd_head_v = hparams.n_embd / hparams.n_head; + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false); + // arch-specific KVs switch (model.arch) { case LLM_ARCH_LLAMA: @@ -2667,10 +2860,31 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1B; break; case 32: model.type = e_model::MODEL_3B; break; default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_PLAMO: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_13B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_GPT2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 12: model.type = e_model::MODEL_SMALL; break; + case 24: model.type = e_model::MODEL_MEDIUM; break; + case 36: model.type = e_model::MODEL_LARGE; break; + case 48: model.type = e_model::MODEL_XL; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -2943,8 +3157,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head); LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); - LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim + LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); + LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k); + LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v); LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); + LLAMA_LOG_INFO("%s: n_embd_k_gqa = %u\n", __func__, hparams.n_embd_k_gqa()); + LLAMA_LOG_INFO("%s: n_embd_v_gqa = %u\n", __func__, hparams.n_embd_v_gqa()); LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); @@ -2959,7 +3177,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); - LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); + if (ml.n_elements >= 1e12) { + LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12); + } else if (ml.n_elements >= 1e9) { + LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); + } else if (ml.n_elements >= 1e6) { + LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6); + } else { + LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3); + } if (ml.n_bytes < GiB) { LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements); } else { @@ -2978,10 +3204,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } -static void llm_load_tensors( +// Returns false if cancelled by progress_callback +static bool llm_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, + enum llama_split_mode split_mode, int main_gpu, const float * tensor_split, bool use_mlock, @@ -2989,748 +3217,574 @@ static void llm_load_tensors( void * progress_callback_user_data) { model.t_start_us = lm_ggml_time_us(); - auto & ctx = model.ctx; auto & hparams = model.hparams; + model.split_mode = split_mode; + model.main_gpu = main_gpu; model.n_gpu_layers = n_gpu_layers; - size_t ctx_size; - size_t mmapped_size; + const int64_t n_layer = hparams.n_layer; + const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0); + + // there is very little benefit to offloading the input layer, so always keep it on the CPU + model.buft_input = llama_default_buffer_type_cpu(true); + + model.buft_layer.resize(n_layer); - ml.calc_sizes(ctx_size, mmapped_size); + // assign cpu layers + for (int64_t i = 0; i < i_gpu_start; ++i) { + model.buft_layer[i] = llama_default_buffer_type_cpu(true); + } + +#ifdef LM_GGML_USE_CUBLAS + if (split_mode == LLAMA_SPLIT_LAYER) { + // calculate the split points + int device_count = lm_ggml_backend_cuda_get_device_count(); + bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); + float splits[LM_GGML_CUDA_MAX_DEVICES]; + if (all_zero) { + // default split, by free memory + for (int i = 0; i < device_count; ++i) { + size_t total; + size_t free; + lm_ggml_backend_cuda_get_device_memory(i, &total, &free); + splits[i] = free; + } + } else { + std::copy(tensor_split, tensor_split + device_count, splits); + } - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); + // sum and normalize the splits to get the split points + float split_sum = 0.0f; + for (int i = 0; i < device_count; ++i) { + split_sum += splits[i]; + splits[i] = split_sum; + } + for (int i = 0; i < device_count; ++i) { + splits[i] /= split_sum; + } - // create the ggml context + // assign the repeating layers to the devices according to the splits + int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); + for (int64_t i = i_gpu_start; i < n_layer; ++i) { + int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits; + model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu); + } + // assign the output layer + if (n_gpu_layers > n_layer) { + int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits; + model.buft_output = llama_default_buffer_type_offload(layer_gpu); + } else { + model.buft_output = llama_default_buffer_type_cpu(true); + } + } else +#endif { - model.buf.resize(ctx_size); - if (use_mlock) { - model.mlock_buf.init (model.buf.data); - model.mlock_buf.grow_to(model.buf.size); + lm_ggml_backend_buffer_type_t split_buft; + if (split_mode == LLAMA_SPLIT_ROW) { + split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); + } else { + // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported + split_buft = llama_default_buffer_type_offload(main_gpu); + } + // assign the repeating layers + for (int64_t i = i_gpu_start; i < n_layer; ++i) { + model.buft_layer[i] = { + split_buft, + llama_default_buffer_type_offload(main_gpu) + }; + } + // assign the output layer + if (n_gpu_layers > n_layer) { + model.buft_output = { + split_buft, + llama_default_buffer_type_offload(main_gpu) + }; + } else { + model.buft_output = llama_default_buffer_type_cpu(true); } + } + + // count used buffer types + std::map buft_layer_count; + buft_layer_count[model.buft_input.buft]++; + buft_layer_count[model.buft_input.buft_matrix]++; + buft_layer_count[model.buft_output.buft]++; + buft_layer_count[model.buft_output.buft_matrix]++; + for (int64_t i = 0; i < n_layer; ++i) { + buft_layer_count[model.buft_layer[i].buft]++; + buft_layer_count[model.buft_layer[i].buft_matrix]++; + } + // create one context per buffer type + size_t ctx_size = lm_ggml_tensor_overhead()*ml.n_tensors; + std::map ctx_map; + for (auto & it : buft_layer_count) { struct lm_ggml_init_params params = { - /*.mem_size =*/ model.buf.size, - /*.mem_buffer =*/ model.buf.data, - /*.no_alloc =*/ ml.use_mmap, + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, }; - - model.ctx = lm_ggml_init(params); - if (!model.ctx) { - throw std::runtime_error(format("lm_ggml_init() failed")); + lm_ggml_context * ctx = lm_ggml_init(params); + if (!ctx) { + throw std::runtime_error(format("failed to create context")); } + ctx_map[it.first] = ctx; + model.ctxs.push_back(ctx); } - (void) main_gpu; + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0); - enum lm_ggml_backend_type llama_backend_offload = LM_GGML_BACKEND_CPU; - enum lm_ggml_backend_type llama_backend_offload_split = LM_GGML_BACKEND_CPU; + // create tensors for the weights + { + const int64_t n_embd = hparams.n_embd; + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const int64_t n_embd_gqa = n_embd_v_gqa; + const int64_t n_vocab = hparams.n_vocab; + const int64_t n_ff = hparams.n_ff; -#ifdef LM_GGML_USE_CUBLAS - if (lm_ggml_cublas_loaded()) { - LLAMA_LOG_INFO("%s: using " LM_GGML_CUDA_NAME " for GPU acceleration\n", __func__); - lm_ggml_cuda_set_main_device(main_gpu); + LM_GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - llama_backend_offload = LM_GGML_BACKEND_GPU; - llama_backend_offload_split = LM_GGML_BACKEND_GPU_SPLIT; - } -#elif defined(LM_GGML_USE_CLBLAST) - LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); - llama_backend_offload = LM_GGML_BACKEND_GPU; - llama_backend_offload_split = LM_GGML_BACKEND_GPU; -#endif + lm_ggml_context * ctx_input = ctx_map.at(model.buft_input.buft); + lm_ggml_context * ctx_output = ctx_map.at(model.buft_output.buft); + lm_ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix); + auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); }; + auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); }; - // prepare memory for the weights - size_t vram_weights = 0; - { - const int64_t n_embd = hparams.n_embd; - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - const int64_t n_layer = hparams.n_layer; - const int64_t n_vocab = hparams.n_vocab; + model.layers.resize(n_layer); const auto tn = LLM_TN(model.arch); switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - lm_ggml_backend_type backend_norm; - lm_ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = LM_GGML_BACKEND_CPU; - backend_output = LM_GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == LM_GGML_BACKEND_GPU) { - vram_weights += lm_ggml_nbytes(model.output_norm); - } - if (backend_output == LM_GGML_BACKEND_GPU_SPLIT) { - vram_weights += lm_ggml_nbytes(model.output); - } + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); // optional bias tensors - layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false); - layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false); - layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false); + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false); if (layer.ffn_gate_inp == nullptr) { LM_GGML_ASSERT(hparams.n_expert == 0); LM_GGML_ASSERT(hparams.n_expert_used == 0); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } else { LM_GGML_ASSERT(hparams.n_expert > 0); LM_GGML_ASSERT(hparams.n_expert_used > 0); // MoE branch for (uint32_t x = 0; x < hparams.n_expert; ++x) { - layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); - layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split); - layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); - } - } - - if (backend == LM_GGML_BACKEND_GPU) { - vram_weights += - lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wq) + lm_ggml_nbytes(layer.wk) + - lm_ggml_nbytes(layer.wv) + lm_ggml_nbytes(layer.wo) + - (layer.bq ? lm_ggml_nbytes(layer.bq) : 0) + - (layer.bk ? lm_ggml_nbytes(layer.bk) : 0) + - (layer.bv ? lm_ggml_nbytes(layer.bv) : 0) + - (layer.bo ? lm_ggml_nbytes(layer.bo) : 0) + - lm_ggml_nbytes(layer.ffn_norm); - - if (layer.ffn_gate_inp == nullptr) { - vram_weights += - lm_ggml_nbytes(layer.ffn_gate) + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); - } else { - vram_weights += lm_ggml_nbytes(layer.ffn_gate_inp); - for (uint32_t x = 0; x < hparams.n_expert; ++x) { - vram_weights += - lm_ggml_nbytes(layer.ffn_gate_exp[x]) + lm_ggml_nbytes(layer.ffn_down_exp[x]) + lm_ggml_nbytes(layer.ffn_up_exp[x]); - } + layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}); + layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}); + layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}); } } } } break; case LLM_ARCH_BAICHUAN: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); { - lm_ggml_backend_type backend_norm; - lm_ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = LM_GGML_BACKEND_CPU; - backend_output = LM_GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == LM_GGML_BACKEND_GPU) { - vram_weights += lm_ggml_nbytes(model.output_norm); - } - if (backend_output == LM_GGML_BACKEND_GPU_SPLIT) { - vram_weights += lm_ggml_nbytes(model.output); - } + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - if (backend == LM_GGML_BACKEND_GPU) { - vram_weights += - lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wq) + lm_ggml_nbytes(layer.wk) + - lm_ggml_nbytes(layer.wv) + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + - lm_ggml_nbytes(layer.ffn_gate) + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); - } + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_FALCON: { - // TODO: CPU-only for now - - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - lm_ggml_backend_type backend_norm; - lm_ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = LM_GGML_BACKEND_CPU; - backend_output = LM_GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == LM_GGML_BACKEND_GPU) { - vram_weights += lm_ggml_nbytes(model.output_norm); - vram_weights += lm_ggml_nbytes(model.output_norm_b); - } - if (backend_output == LM_GGML_BACKEND_GPU_SPLIT) { - vram_weights += lm_ggml_nbytes(model.output); - } + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); if (lm_gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { - layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend); - layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); - - if (backend == LM_GGML_BACKEND_GPU) { - vram_weights += lm_ggml_nbytes(layer.attn_norm_2); - vram_weights += lm_ggml_nbytes(layer.attn_norm_2_b); - } + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); + layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); } - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == LM_GGML_BACKEND_GPU) { - vram_weights += - lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.attn_norm_b) + - lm_ggml_nbytes(layer.wqkv) + lm_ggml_nbytes(layer.wo) + - lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); - } + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_STARCODER: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); - model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); // output { - lm_ggml_backend_type backend_norm; - lm_ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = LM_GGML_BACKEND_CPU; - backend_output = LM_GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == LM_GGML_BACKEND_GPU) { - vram_weights += lm_ggml_nbytes(model.output_norm); - vram_weights += lm_ggml_nbytes(model.output_norm_b); - } - if (backend_output == LM_GGML_BACKEND_GPU_SPLIT) { - vram_weights += lm_ggml_nbytes(model.output); - } + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - - if (backend == LM_GGML_BACKEND_GPU) { - vram_weights += - lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.attn_norm_b) + - lm_ggml_nbytes(layer.wqkv) + lm_ggml_nbytes(layer.bqkv) + - lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.bo) + - lm_ggml_nbytes(layer.ffn_norm) + lm_ggml_nbytes(layer.ffn_norm_b) + - lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_down_b) + - lm_ggml_nbytes(layer.ffn_up) + lm_ggml_nbytes(layer.ffn_up_b); - } + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_PERSIMMON: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); { - lm_ggml_backend_type backend_norm; - lm_ggml_backend_type backend_output; + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = LM_GGML_BACKEND_CPU; - backend_output = LM_GGML_BACKEND_CPU; - } + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + auto & layer = model.layers[i]; - if (backend_norm == LM_GGML_BACKEND_GPU) { - vram_weights += lm_ggml_nbytes(model.output_norm); - vram_weights += lm_ggml_nbytes(model.output_norm_b); - } - if (backend_output == LM_GGML_BACKEND_GPU_SPLIT) { - vram_weights += lm_ggml_nbytes(model.output); - } - } + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - const uint32_t n_ff = hparams.n_ff; - const int i_gpu_start = n_layer - n_gpu_layers; - model.layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; - auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); - layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); - layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); - layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}); + layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}); + + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}); + layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}); } } break; case LLM_ARCH_BLOOM: { - // TODO: CPU-only for now - - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); - model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, LM_GGML_BACKEND_CPU); - model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); + model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); // output { - lm_ggml_backend_type backend_norm; - lm_ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = LM_GGML_BACKEND_CPU; - backend_output = LM_GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == LM_GGML_BACKEND_GPU) { - vram_weights += lm_ggml_nbytes(model.output_norm); - vram_weights += lm_ggml_nbytes(model.output_norm_b); - } - if (backend_output == LM_GGML_BACKEND_GPU_SPLIT) { - vram_weights += lm_ggml_nbytes(model.output); - } + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - if (backend == LM_GGML_BACKEND_GPU) { - vram_weights += - lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.attn_norm_b) + - lm_ggml_nbytes(layer.wqkv) + lm_ggml_nbytes(layer.bqkv) + - lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.bo) + - lm_ggml_nbytes(layer.ffn_norm) + lm_ggml_nbytes(layer.ffn_norm_b) + - lm_ggml_nbytes(layer.ffn_up) + lm_ggml_nbytes(layer.ffn_up_b) + - lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_down_b); - } + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_MPT: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - lm_ggml_backend_type backend_norm; - lm_ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = LM_GGML_BACKEND_CPU; - backend_output = LM_GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == LM_GGML_BACKEND_GPU) { - vram_weights += lm_ggml_nbytes(model.output_norm); - } - if (backend_output == LM_GGML_BACKEND_GPU_SPLIT) { - vram_weights += lm_ggml_nbytes(model.output); - } + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); - if (backend == LM_GGML_BACKEND_GPU) { - vram_weights += - lm_ggml_nbytes(layer.attn_norm) + - lm_ggml_nbytes(layer.wqkv) + - lm_ggml_nbytes(layer.wo) + - lm_ggml_nbytes(layer.ffn_norm) + - lm_ggml_nbytes(layer.ffn_down) + - lm_ggml_nbytes(layer.ffn_up); - } + // AWQ ScaleActivation layer + layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false); } } break; case LLM_ARCH_STABLELM: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - lm_ggml_backend_type backend_norm; - lm_ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = LM_GGML_BACKEND_CPU; - backend_output = LM_GGML_BACKEND_CPU; - } - - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - - if (backend_norm == LM_GGML_BACKEND_GPU) { - vram_weights += lm_ggml_nbytes(model.output_norm); - } - if (backend_output == LM_GGML_BACKEND_GPU_SPLIT) { - vram_weights += lm_ggml_nbytes(model.output); - } + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - /* - llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ] - */ - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - - if (backend == LM_GGML_BACKEND_GPU) { - vram_weights += - lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wq) + lm_ggml_nbytes(layer.wk) + - lm_ggml_nbytes(layer.wv) + lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + - lm_ggml_nbytes(layer.ffn_gate) + lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); - } + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_QWEN: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output { - lm_ggml_backend_type backend_norm; - lm_ggml_backend_type backend_output; + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = LM_GGML_BACKEND_CPU; - backend_output = LM_GGML_BACKEND_CPU; - } + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + auto & layer = model.layers[i]; - if (backend_norm == LM_GGML_BACKEND_GPU) { - vram_weights += lm_ggml_nbytes(model.output_norm); - } - if (backend_output == LM_GGML_BACKEND_GPU_SPLIT) { - vram_weights += lm_ggml_nbytes(model.output); - } - } + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - const uint32_t n_ff = hparams.n_ff / 2; + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - const int i_gpu_start = n_layer - n_gpu_layers; + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}); + } + } break; + case LLM_ARCH_PHI2: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - model.layers.resize(n_layer); + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + model.output_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}); + } - for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + if (layer.wqkv == nullptr) { + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); - if (backend == LM_GGML_BACKEND_GPU) { - vram_weights += - lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.wqkv) + lm_ggml_nbytes(layer.bqkv) + - lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.ffn_norm) + lm_ggml_nbytes(layer.ffn_gate) + - lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_up); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); } + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; - case LLM_ARCH_PHI2: + case LLM_ARCH_PLAMO: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, LM_GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - lm_ggml_backend_type backend_norm; - lm_ggml_backend_type backend_output; + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload; - } else { - backend_norm = LM_GGML_BACKEND_CPU; - backend_output = LM_GGML_BACKEND_CPU; - } + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output); + auto & layer = model.layers[i]; - if (backend_norm == LM_GGML_BACKEND_GPU) { - vram_weights += lm_ggml_nbytes(model.output_norm); - vram_weights += lm_ggml_nbytes(model.output_norm_b); - vram_weights += lm_ggml_nbytes(model.output); - vram_weights += lm_ggml_nbytes(model.output_b); - } - } + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - const uint32_t n_ff = hparams.n_ff; + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - const int i_gpu_start = n_layer - n_gpu_layers; + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; + case LLM_ARCH_GPT2: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); - model.layers.resize(n_layer); + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } - for (uint32_t i = 0; i < n_layer; ++i) { - const lm_ggml_backend_type backend = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const lm_ggml_backend_type backend_split = int(i) < i_gpu_start ? LM_GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + lm_ggml_context * ctx_layer = ctx_for_layer(i); + lm_ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - if (backend == LM_GGML_BACKEND_GPU) { - vram_weights += - lm_ggml_nbytes(layer.attn_norm) + lm_ggml_nbytes(layer.attn_norm_b) + - lm_ggml_nbytes(layer.wqkv) + lm_ggml_nbytes(layer.bqkv) + - lm_ggml_nbytes(layer.wo) + lm_ggml_nbytes(layer.bo) + - lm_ggml_nbytes(layer.ffn_up) + lm_ggml_nbytes(layer.ffn_up_b) + - lm_ggml_nbytes(layer.ffn_down) + lm_ggml_nbytes(layer.ffn_down_b); - } + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; default: @@ -3740,16 +3794,51 @@ static void llm_load_tensors( ml.done_getting_tensors(); - // print memory requirements - { - // this is the total memory required to run the inference - size_t mem_required = - ctx_size + - mmapped_size - vram_weights; // weights in VRAM not in memory + ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr); - LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0); + // create the backend buffers + std::vector> ctx_bufs; + + for (auto & it : ctx_map) { + lm_ggml_backend_buffer_type_t buft = it.first; + lm_ggml_context * ctx = it.second; + lm_ggml_backend_buffer_t buf = nullptr; + + // only the mmap region containing the tensors in the model is mapped to the backend buffer + // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers + // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size + if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) { + size_t first, last; + ml.get_mapping_range(&first, &last, ctx); + buf = lm_ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); + } +#ifdef LM_GGML_USE_METAL + else if (ml.use_mmap && buft == lm_ggml_backend_metal_buffer_type()) { + const size_t max_size = lm_ggml_get_max_tensor_size(ctx); + size_t first, last; + ml.get_mapping_range(&first, &last, ctx); + buf = lm_ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size); + } +#endif + else { + buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (buf != nullptr && use_mlock && lm_ggml_backend_buffer_is_host(buf)) { + model.mlock_buf.init (lm_ggml_backend_buffer_get_base(buf)); + model.mlock_buf.grow_to(lm_ggml_backend_buffer_get_size(buf)); + } + } + if (buf == nullptr) { + throw std::runtime_error("failed to allocate buffer"); + } + // indicate that this buffer contains weights + // this is used by lm_ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight + lm_ggml_backend_buffer_set_usage(buf, LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + model.bufs.push_back(buf); + ctx_bufs.emplace_back(ctx, buf); + } -#if defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) + // print memory requirements + { const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); @@ -3757,38 +3846,30 @@ static void llm_load_tensors( LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__); } -#ifdef LM_GGML_USE_CUBLAS - const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; -#elif LM_GGML_USE_CLBLAST const int max_backend_supported_layers = hparams.n_layer + 1; const int max_offloadable_layers = hparams.n_layer + 1; -#endif // LM_GGML_USE_CUBLAS LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); - LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); -#else - (void) n_gpu_layers; -#endif // defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) - } - // populate `tensors_by_name` - for (int i = 0; i < ml.n_tensors; ++i) { - struct lm_ggml_tensor * cur = lm_ggml_get_tensor(ctx, ml.get_tensor_name(i)); - model.tensors_by_name.emplace_back(lm_ggml_get_name(cur), cur); + for (lm_ggml_backend_buffer_t buf : model.bufs) { + LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf), lm_ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); + } } - (void) tensor_split; -#ifdef LM_GGML_USE_CUBLAS - { - lm_ggml_cuda_set_tensor_split(tensor_split); + // populate tensors_by_name + for (lm_ggml_context * ctx : model.ctxs) { + for (auto * cur = lm_ggml_get_first_tensor(ctx); cur != NULL; cur = lm_ggml_get_next_tensor(ctx, cur)) { + model.tensors_by_name.emplace_back(lm_ggml_get_name(cur), cur); + } } -#endif - - ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); - if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); + // load tensor data + for (auto & it : ctx_bufs) { + lm_ggml_context * ctx = it.first; + lm_ggml_backend_buffer_t buf = it.second; + if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) { + return false; + } } model.mapping = std::move(ml.mapping); @@ -3796,9 +3877,11 @@ static void llm_load_tensors( // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = lm_ggml_time_us() - model.t_start_us; + return true; } -static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { +// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback +static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); @@ -3816,19 +3899,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return true; + return 0; } - llm_load_tensors( - ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, + if (!llm_load_tensors( + ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data - ); + )) { + return -2; + } } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); - return false; + LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); + return -1; } - return true; + return 0; } // @@ -3886,8 +3971,8 @@ static struct lm_ggml_tensor * llm_build_inp_embd( return inpL; } -// Persimmon: n_rot = n_embd_head/2 -// Other: n_rot = n_embd_head +// Persimmon: n_rot = n_embd_head_k/2 +// Other: n_rot = n_embd_head_k static void llm_build_k_shift( struct lm_ggml_context * ctx, const llama_hparams & hparams, @@ -3896,21 +3981,19 @@ static void llm_build_k_shift( struct lm_ggml_cgraph * graph, llm_rope_type type, int64_t n_ctx, - int n_rot, float freq_base, float freq_scale, const llm_build_cb & cb) { - const int64_t n_layer = hparams.n_layer; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - const int64_t n_embd_head = hparams.n_embd_head(); - const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; - const float ext_factor = cparams.yarn_ext_factor; - const float attn_factor = cparams.yarn_attn_factor; - const float beta_fast = cparams.yarn_beta_fast; - const float beta_slow = cparams.yarn_beta_slow; - - LM_GGML_ASSERT(n_embd_head % n_rot == 0); + const int64_t n_layer = hparams.n_layer; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const int32_t n_rot = hparams.n_rot; + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; + const float ext_factor = cparams.yarn_ext_factor; + const float attn_factor = cparams.yarn_attn_factor; + const float beta_fast = cparams.yarn_beta_fast; + const float beta_slow = cparams.yarn_beta_slow; struct lm_ggml_tensor * K_shift = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, n_ctx); cb(K_shift, "K_shift", -1); @@ -3928,9 +4011,9 @@ static void llm_build_k_shift( // we rotate only the first n_rot dimensions lm_ggml_rope_custom_inplace(ctx, lm_ggml_view_3d(ctx, kv.k_l[il], - n_embd_head, n_head_kv, n_ctx, - lm_ggml_row_size(kv.k_l[il]->type, n_embd_head), - lm_ggml_row_size(kv.k_l[il]->type, n_embd_gqa), + n_embd_head_k, n_head_kv, n_ctx, + lm_ggml_row_size(kv.k_l[il]->type, n_embd_head_k), + lm_ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), 0), K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); @@ -3951,18 +4034,19 @@ static void llm_build_kv_store( int32_t kv_head, const llm_build_cb & cb, int64_t il) { - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); // compute the transposed [n_tokens, n_embd] V matrix - struct lm_ggml_tensor * v_cur_t = lm_ggml_transpose(ctx, lm_ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens)); + struct lm_ggml_tensor * v_cur_t = lm_ggml_transpose(ctx, lm_ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens)); //struct lm_ggml_tensor * v_cur_t = lm_ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed cb(v_cur_t, "v_cur_t", il); - struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa, - (lm_ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head); + struct lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, + (lm_ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head); cb(k_cache_view, "k_cache_view", il); - struct lm_ggml_tensor * v_cache_view = lm_ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa, + struct lm_ggml_tensor * v_cache_view = lm_ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa, ( n_ctx)*lm_ggml_element_size(kv.v_l[il]), (kv_head)*lm_ggml_element_size(kv.v_l[il])); cb(v_cache_view, "v_cache_view", il); @@ -4013,6 +4097,7 @@ static struct lm_ggml_tensor * llm_build_ffn( struct lm_ggml_tensor * gate_b, struct lm_ggml_tensor * down, struct lm_ggml_tensor * down_b, + struct lm_ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, const llm_build_cb & cb, @@ -4057,6 +4142,10 @@ static struct lm_ggml_tensor * llm_build_ffn( { cur = lm_ggml_gelu(ctx, cur); cb(cur, "ffn_gelu", il); + if (act_scales != NULL) { + cur = lm_ggml_div(ctx, cur, act_scales); + cb(cur, "ffn_act", il); + } } break; case LLM_FFN_RELU: { @@ -4099,29 +4188,28 @@ static struct lm_ggml_tensor * llm_build_kqv( struct lm_ggml_tensor * wo, struct lm_ggml_tensor * wo_b, struct lm_ggml_tensor * q_cur, - struct lm_ggml_tensor * kq_scale, struct lm_ggml_tensor * kq_mask, int64_t n_ctx, int32_t n_tokens, int32_t n_kv, float max_alibi_bias, - float scale, + float kq_scale, const llm_build_cb & cb, int il) { - const int64_t n_embd = hparams.n_embd; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const int64_t n_embd_head_v = hparams.n_embd_head_v; struct lm_ggml_tensor * q = lm_ggml_permute(ctx, q_cur, 0, 2, 1, 3); cb(q, "q", il); struct lm_ggml_tensor * k = lm_ggml_view_3d(ctx, kv.k_l[il], - n_embd_head, n_kv, n_head_kv, - lm_ggml_row_size(kv.k_l[il]->type, n_embd_gqa), - lm_ggml_row_size(kv.k_l[il]->type, n_embd_head), + n_embd_head_k, n_kv, n_head_kv, + lm_ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa), + lm_ggml_row_size(kv.k_l[il]->type, n_embd_head_k), 0); cb(k, "k", il); @@ -4153,16 +4241,16 @@ static struct lm_ggml_tensor * llm_build_kqv( kq = lm_ggml_soft_max(ctx, kq); cb(kq, "kq_soft_max", il); } else { - kq = lm_ggml_soft_max_ext(ctx, kq, kq_mask, scale); + kq = lm_ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale); cb(kq, "kq_soft_max_ext", il); } // split cached v into n_head heads struct lm_ggml_tensor * v = lm_ggml_view_3d(ctx, kv.v_l[il], - n_kv, n_embd_head, n_head_kv, + n_kv, n_embd_head_v, n_head_kv, lm_ggml_element_size(kv.v_l[il])*n_ctx, - lm_ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head, + lm_ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, 0); cb(v, "v", il); @@ -4172,7 +4260,7 @@ static struct lm_ggml_tensor * llm_build_kqv( struct lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx, kqv, 0, 2, 1, 3); cb(kqv_merged, "kqv_merged", il); - struct lm_ggml_tensor * cur = lm_ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens); + struct lm_ggml_tensor * cur = lm_ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens); cb(cur, "kqv_merged_cont", il); cur = lm_ggml_mul_mat(ctx, wo, cur); @@ -4199,8 +4287,10 @@ struct llm_build_context { const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) const int64_t n_head; const int64_t n_head_kv; - const int64_t n_embd_head; - const int64_t n_embd_gqa; + const int64_t n_embd_head_k; + const int64_t n_embd_k_gqa; + const int64_t n_embd_head_v; + const int64_t n_embd_v_gqa; const int64_t n_expert; const int64_t n_expert_used; @@ -4222,7 +4312,7 @@ struct llm_build_context { const llm_build_cb & cb; - llama_buffer & buf_compute; + std::vector & buf_compute_meta; struct lm_ggml_context * ctx0 = nullptr; @@ -4232,44 +4322,44 @@ struct llm_build_context { const llama_batch & batch, const llm_build_cb & cb, bool worst_case) : - model (lctx.model), - hparams (model.hparams), - cparams (lctx.cparams), - batch (batch), - kv_self (lctx.kv_self), - n_embd (hparams.n_embd), - n_layer (hparams.n_layer), - n_ctx (cparams.n_ctx), - n_head (hparams.n_head), - n_head_kv (hparams.n_head_kv), - n_embd_head (hparams.n_embd_head()), - n_embd_gqa (hparams.n_embd_gqa()), - n_expert (hparams.n_expert), - n_expert_used (hparams.n_expert_used), - freq_base (cparams.rope_freq_base), - freq_scale (cparams.rope_freq_scale), - ext_factor (cparams.yarn_ext_factor), - attn_factor (cparams.yarn_attn_factor), - beta_fast (cparams.yarn_beta_fast), - beta_slow (cparams.yarn_beta_slow), - norm_eps (hparams.f_norm_eps), - norm_rms_eps (hparams.f_norm_rms_eps), - n_tokens (batch.n_tokens), - n_kv (worst_case ? n_ctx : kv_self.n), - kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), - n_orig_ctx (cparams.n_yarn_orig_ctx), - do_rope_shift (worst_case || kv_self.has_shift), - cb (cb), - buf_compute (lctx.buf_compute) { - LM_GGML_ASSERT(!!kv_self.ctx); - + model (lctx.model), + hparams (model.hparams), + cparams (lctx.cparams), + batch (batch), + kv_self (lctx.kv_self), + n_embd (hparams.n_embd), + n_layer (hparams.n_layer), + n_ctx (cparams.n_ctx), + n_head (hparams.n_head), + n_head_kv (hparams.n_head_kv), + n_embd_head_k (hparams.n_embd_head_k), + n_embd_k_gqa (hparams.n_embd_k_gqa()), + n_embd_head_v (hparams.n_embd_head_v), + n_embd_v_gqa (hparams.n_embd_v_gqa()), + n_expert (hparams.n_expert), + n_expert_used (hparams.n_expert_used), + freq_base (cparams.rope_freq_base), + freq_scale (cparams.rope_freq_scale), + ext_factor (cparams.yarn_ext_factor), + attn_factor (cparams.yarn_attn_factor), + beta_fast (cparams.yarn_beta_fast), + beta_slow (cparams.yarn_beta_slow), + norm_eps (hparams.f_norm_eps), + norm_rms_eps (hparams.f_norm_rms_eps), + n_tokens (batch.n_tokens), + n_kv (worst_case ? n_ctx : kv_self.n), + kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), + n_orig_ctx (cparams.n_yarn_orig_ctx), + do_rope_shift (worst_case || kv_self.has_shift), + cb (cb), + buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() } void init() { struct lm_ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), /*.no_alloc =*/ true, }; @@ -4286,6 +4376,8 @@ struct llm_build_context { struct lm_ggml_cgraph * build_llama() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); LM_GGML_ASSERT(n_embd_head == hparams.n_rot); struct lm_ggml_tensor * cur; @@ -4298,17 +4390,13 @@ struct llm_build_context { struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4344,16 +4432,22 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + lm_ggml_build_forward_expand(gf, Qcur); + lm_ggml_build_forward_expand(gf, Kcur); + lm_ggml_build_forward_expand(gf, Vcur); + Qcur = lm_ggml_rope_custom( ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = lm_ggml_rope_custom( ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -4362,7 +4456,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4380,6 +4474,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } else { @@ -4473,6 +4568,10 @@ struct llm_build_context { struct lm_ggml_cgraph * build_baichuan() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + LM_GGML_ASSERT(n_embd_head == hparams.n_rot); + struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; @@ -4483,17 +4582,13 @@ struct llm_build_context { struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4519,12 +4614,12 @@ struct llm_build_context { case MODEL_7B: Qcur = lm_ggml_rope_custom( ctx0, lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = lm_ggml_rope_custom( ctx0, lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); break; @@ -4545,7 +4640,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4563,6 +4658,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -4593,6 +4689,11 @@ struct llm_build_context { struct lm_ggml_cgraph * build_falcon() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + LM_GGML_ASSERT(n_embd_head == hparams.n_rot); + struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; @@ -4603,17 +4704,13 @@ struct llm_build_context { struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4654,13 +4751,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = lm_ggml_rope_custom( - ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = lm_ggml_rope_custom( - ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -4669,7 +4766,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4681,6 +4778,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -4715,6 +4813,10 @@ struct llm_build_context { struct lm_ggml_cgraph * build_starcoder() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + struct lm_ggml_tensor * cur; struct lm_ggml_tensor * pos; struct lm_ggml_tensor * inpL; @@ -4726,10 +4828,6 @@ struct llm_build_context { struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -4769,7 +4867,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4789,6 +4887,7 @@ struct llm_build_context { model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -4814,28 +4913,26 @@ struct llm_build_context { struct lm_ggml_cgraph * build_persimmon() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - const int64_t n_rot = n_embd_head / 2; + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + LM_GGML_ASSERT(n_embd_head/2 == hparams.n_rot); struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); - cb(inpL, "imp_embd", -1); + cb(inpL, "inp_embd", -1); // inp_pos - contains the positions struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4895,7 +4992,7 @@ struct llm_build_context { // RoPE the first n_rot of q/k, pass the other half, and concat. struct lm_ggml_tensor * qrot = lm_ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, + ctx0, tmpq, hparams.n_rot, n_head, n_tokens, lm_ggml_element_size(tmpq) * n_embd_head, lm_ggml_element_size(tmpq) * n_embd_head * n_head, 0 @@ -4903,7 +5000,7 @@ struct llm_build_context { cb(qrot, "qrot", il); struct lm_ggml_tensor * krot = lm_ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, + ctx0, tmpk, hparams.n_rot, n_head, n_tokens, lm_ggml_element_size(tmpk) * n_embd_head, lm_ggml_element_size(tmpk) * n_embd_head * n_head, 0 @@ -4912,29 +5009,29 @@ struct llm_build_context { // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct lm_ggml_tensor * qpass = lm_ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, + ctx0, tmpq, hparams.n_rot, n_head, n_tokens, lm_ggml_element_size(tmpq) * n_embd_head, lm_ggml_element_size(tmpq) * n_embd_head * n_head, - lm_ggml_element_size(tmpq) * n_rot + lm_ggml_element_size(tmpq) * hparams.n_rot ); cb(qpass, "qpass", il); struct lm_ggml_tensor * kpass = lm_ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, + ctx0, tmpk, hparams.n_rot, n_head, n_tokens, lm_ggml_element_size(tmpk) * n_embd_head, lm_ggml_element_size(tmpk) * n_embd_head * n_head, - lm_ggml_element_size(tmpk) * n_rot + lm_ggml_element_size(tmpk) * hparams.n_rot ); cb(kpass, "kpass", il); struct lm_ggml_tensor * qrotated = lm_ggml_rope_custom( - ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx, + ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(qrotated, "qrotated", il); struct lm_ggml_tensor * krotated = lm_ggml_rope_custom( - ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx, + ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(krotated, "krotated", il); @@ -4978,7 +5075,7 @@ struct llm_build_context { // TODO: not tested, could be broken cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -4997,6 +5094,7 @@ struct llm_build_context { model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -5026,16 +5124,15 @@ struct llm_build_context { struct lm_ggml_cgraph * build_refact() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "inp_embd", -1); - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -5069,7 +5166,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5087,6 +5184,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -5117,16 +5215,16 @@ struct llm_build_context { struct lm_ggml_cgraph * build_bloom() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "inp_embd", -1); - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -5166,7 +5264,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5186,6 +5284,7 @@ struct llm_build_context { model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -5211,16 +5310,16 @@ struct llm_build_context { struct lm_ggml_cgraph * build_mpt() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "inp_embd", -1); - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); @@ -5260,7 +5359,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5275,11 +5374,11 @@ struct llm_build_context { NULL, LLM_NORM, cb, il); cb(cur, "ffn_norm", il); - cur = llm_build_ffn(ctx0, cur, model.layers[il].ffn_up, NULL, NULL, NULL, model.layers[il].ffn_down, NULL, + model.layers[il].ffn_act, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(cur, "ffn_out", il); } @@ -5310,6 +5409,9 @@ struct llm_build_context { struct lm_ggml_cgraph * build_stablelm() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; @@ -5320,17 +5422,13 @@ struct llm_build_context { struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5373,7 +5471,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5392,6 +5490,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -5423,6 +5522,9 @@ struct llm_build_context { struct lm_ggml_cgraph * build_qwen() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + struct lm_ggml_tensor * cur; struct lm_ggml_tensor * inpL; @@ -5433,17 +5535,13 @@ struct llm_build_context { struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5475,13 +5573,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = lm_ggml_rope_custom( - ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = lm_ggml_rope_custom( - ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5490,7 +5588,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, NULL, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); cb(cur, "kqv_out", il); } @@ -5508,6 +5606,7 @@ struct llm_build_context { model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, model.layers[il].ffn_down, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, cb, il); cb(cur, "ffn_out", il); } @@ -5537,6 +5636,10 @@ struct llm_build_context { struct lm_ggml_cgraph * build_phi2() { struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + struct lm_ggml_tensor * cur; struct lm_ggml_tensor * attn_norm_output; struct lm_ggml_tensor * ffn_output; @@ -5549,21 +5652,13 @@ struct llm_build_context { struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); cb(inp_pos, "inp_pos", -1); - // Q_scale - struct lm_ggml_tensor * Q_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - cb(Q_scale, "Q_scale", -1); - - // KQ_scale - struct lm_ggml_tensor * KQ_scale = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale", -1); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5575,15 +5670,25 @@ struct llm_build_context { // self-attention { - cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); - cb(cur, "wqkv", il); + struct lm_ggml_tensor * Qcur = nullptr; + struct lm_ggml_tensor * Kcur = nullptr; + struct lm_ggml_tensor * Vcur = nullptr; - cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv", il); + if (model.layers[il].wqkv) { + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); + cb(cur, "wqkv", il); - struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + } else { + Qcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = lm_ggml_add(ctx0, lm_ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -5598,7 +5703,9 @@ struct llm_build_context { ); cb(Qcur, "Qcur", il); - Qcur = lm_ggml_scale(ctx0, Qcur, Q_scale); + // with phi2, we scale the Q to avoid precision issues + // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 + Qcur = lm_ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head))); cb(Qcur, "Qcur", il); Kcur = lm_ggml_rope_custom( @@ -5611,7 +5718,7 @@ struct llm_build_context { cur = llm_build_kqv(ctx0, model, hparams, kv_self, model.layers[il].wo, model.layers[il].bo, - Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il); + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il); cb(cur, "kqv_out", il); } @@ -5621,6 +5728,7 @@ struct llm_build_context { model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); cb(ffn_output, "ffn_out", il); } @@ -5650,194 +5758,215 @@ struct llm_build_context { return gf; } -}; -// -// tensor offloading helpers -// -// TODO: will be removed with backend v2 - -enum llm_offload_func_e { - OFFLOAD_FUNC_NOP, - OFFLOAD_FUNC, - OFFLOAD_FUNC_FRC, // force offload - OFFLOAD_FUNC_KQV, - OFFLOAD_FUNC_NR, - OFFLOAD_FUNC_EMB, // embeddings - OFFLOAD_FUNC_OUT, -}; + struct lm_ggml_cgraph * build_plamo() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph(ctx0); -// TODO: will be removed with backend v2 -struct llm_offload_trie { - struct node { - ~node() { - for (int i = 0; i < 256; ++i) { - if (children[i]) { - delete children[i]; - } - } - } + const int64_t n_embd_head = hparams.n_embd_head_v; + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + LM_GGML_ASSERT(n_embd_head == hparams.n_rot); - node * children[256] = { nullptr }; - llm_offload_func_e func = OFFLOAD_FUNC_NOP; - }; + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * inpL; - llm_offload_trie() { - root = new node; - } + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); - llm_offload_trie(const std::unordered_map & map) { - root = new node; + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); - for (const auto & kv : map) { - add(kv.first, kv.second); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } - } - ~llm_offload_trie() { - delete root; - } + for (int il = 0; il < n_layer; ++il) { - void add(const char * name, llm_offload_func_e func) { - node * cur = root; + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); - for (int i = 0; ; ++i) { - const uint8_t c = name[i]; + struct lm_ggml_tensor * attention_norm = cur; - if (!c) { - break; + // self-attention + { + // compute Q and K and RoPE them + struct lm_ggml_tensor * Qcur = lm_ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct lm_ggml_tensor * Kcur = lm_ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct lm_ggml_tensor * Vcur = lm_ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos, + n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + + Kcur = lm_ggml_rope_custom( + ctx0, lm_ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos, + n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, model, hparams, kv_self, + model.layers[il].wo, NULL, + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); } + struct lm_ggml_tensor * sa_out = cur; + + cur = attention_norm; - if (!cur->children[c]) { - cur->children[c] = new node; + // feed-forward network + { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); } - cur = cur->children[c]; + cur = lm_ggml_add(ctx0, cur, sa_out); + cb(cur, "l_out", il); + + cur = lm_ggml_add(ctx0, cur, inpL); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } - cur->func = func; + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + lm_ggml_build_forward_expand(gf, cur); + + return gf; } - llm_offload_func_e find(const char * name) const { - const node * cur = root; + struct lm_ggml_cgraph * build_gpt2() { + struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - for (int i = 0; ; ++i) { - const uint8_t c = name[i]; + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - if (!c) { - break; + struct lm_ggml_tensor * cur; + struct lm_ggml_tensor * pos; + struct lm_ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct lm_ggml_tensor * inp_pos = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct lm_ggml_tensor * KQ_mask = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask", -1); + + pos = lm_ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); + + inpL = lm_ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = lm_ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = lm_ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct lm_ggml_tensor * Qcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Kcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct lm_ggml_tensor * Vcur = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il); + + cur = llm_build_kqv(ctx0, model, hparams, kv_self, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); } - if (!cur->children[c]) { - return OFFLOAD_FUNC_NOP; + // add the input + struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - cur = cur->children[c]; + inpL = lm_ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } - return cur->func; - } + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); - node * root = nullptr; -}; + cur = lm_ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); -// TODO: will be removed with backend v2 -static const std::unordered_map k_offload_map = { - //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel - //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel - { "pos_embd", OFFLOAD_FUNC_NR }, - - { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope) - { "Q_scale", OFFLOAD_FUNC_FRC }, - { "KQ_scale", OFFLOAD_FUNC_FRC }, - { "KQ_mask", OFFLOAD_FUNC_FRC }, - { "K_shift", OFFLOAD_FUNC_FRC }, - - { "K_shifted", OFFLOAD_FUNC }, - - { "inp_norm", OFFLOAD_FUNC_NR }, - { "inp_norm_w", OFFLOAD_FUNC_NR }, - { "inp_norm_wb", OFFLOAD_FUNC_NR }, - - { "norm", OFFLOAD_FUNC }, - { "norm_w", OFFLOAD_FUNC }, - { "norm_wb", OFFLOAD_FUNC }, - - { "attn_norm", OFFLOAD_FUNC }, - { "attn_norm_2", OFFLOAD_FUNC }, - - { "wqkv", OFFLOAD_FUNC_KQV }, - { "bqkv", OFFLOAD_FUNC_KQV }, - { "wqkv_clamped", OFFLOAD_FUNC_KQV }, - - { "tmpk", OFFLOAD_FUNC_KQV }, - { "tmpq", OFFLOAD_FUNC_KQV }, - { "tmpv", OFFLOAD_FUNC_KQV }, - { "Kcur", OFFLOAD_FUNC_KQV }, - { "Qcur", OFFLOAD_FUNC_KQV }, - { "Vcur", OFFLOAD_FUNC_KQV }, - - { "krot", OFFLOAD_FUNC_KQV }, - { "qrot", OFFLOAD_FUNC_KQV }, - { "kpass", OFFLOAD_FUNC_KQV }, - { "qpass", OFFLOAD_FUNC_KQV }, - { "krotated", OFFLOAD_FUNC_KQV }, - { "qrotated", OFFLOAD_FUNC_KQV }, - - { "q", OFFLOAD_FUNC_KQV }, - { "k", OFFLOAD_FUNC_KQV }, - { "kq", OFFLOAD_FUNC_KQV }, - { "kq_scaled", OFFLOAD_FUNC_KQV }, - { "kq_scaled_alibi", OFFLOAD_FUNC_KQV }, - { "kq_masked", OFFLOAD_FUNC_KQV }, - { "kq_soft_max", OFFLOAD_FUNC_KQV }, - { "kq_soft_max_ext", OFFLOAD_FUNC_KQV }, - { "v", OFFLOAD_FUNC_KQV }, - { "kqv", OFFLOAD_FUNC_KQV }, - { "kqv_merged", OFFLOAD_FUNC_KQV }, - { "kqv_merged_cont", OFFLOAD_FUNC_KQV }, - { "kqv_wo", OFFLOAD_FUNC_KQV }, - { "kqv_out", OFFLOAD_FUNC_KQV }, - - { "ffn_inp", OFFLOAD_FUNC }, - { "ffn_norm", OFFLOAD_FUNC }, - - { "ffn_up", OFFLOAD_FUNC }, - { "ffn_up_b", OFFLOAD_FUNC }, - { "ffn_gate", OFFLOAD_FUNC }, - { "ffn_gate_b", OFFLOAD_FUNC }, - { "ffn_gate_par", OFFLOAD_FUNC }, - { "ffn_down", OFFLOAD_FUNC }, - { "ffn_down_b", OFFLOAD_FUNC }, - { "ffn_out", OFFLOAD_FUNC }, - - { "ffn_silu", OFFLOAD_FUNC }, - { "ffn_gelu", OFFLOAD_FUNC }, - { "ffn_relu", OFFLOAD_FUNC }, - { "ffn_sqr(relu)", OFFLOAD_FUNC }, - - { "ffn_moe_logits", OFFLOAD_FUNC }, - { "ffn_moe_probs", OFFLOAD_FUNC }, - { "ffn_moe_argsort", OFFLOAD_FUNC }, - { "ffn_moe_weights", OFFLOAD_FUNC }, - { "ffn_moe_weights_sum", OFFLOAD_FUNC }, - { "ffn_moe_weights_norm", OFFLOAD_FUNC }, - { "ffn_moe_weighted", OFFLOAD_FUNC }, - { "ffn_moe_up", OFFLOAD_FUNC }, - { "ffn_moe_gate", OFFLOAD_FUNC }, - { "ffn_moe_silu", OFFLOAD_FUNC }, - { "ffn_moe_gate_par", OFFLOAD_FUNC }, - { "ffn_moe_down", OFFLOAD_FUNC }, - { "ffn_moe_out", OFFLOAD_FUNC }, - - { "l_out", OFFLOAD_FUNC }, - - { "result_norm", OFFLOAD_FUNC_EMB }, - { "result_output_no_bias", OFFLOAD_FUNC_EMB }, - { "result_output", OFFLOAD_FUNC_OUT }, -}; + lm_ggml_build_forward_expand(gf, cur); -static llm_offload_trie k_offload_func_trie(k_offload_map); + return gf; + } +}; static struct lm_ggml_cgraph * llama_build_graph( llama_context & lctx, @@ -5845,27 +5974,17 @@ static struct lm_ggml_cgraph * llama_build_graph( const auto & model = lctx.model; // check if we should build the worst-case graph (for memory measurement) - const bool worst_case = lm_ggml_allocr_is_measure(lctx.alloc); + const bool worst_case = lm_ggml_tallocr_is_measure(lctx.alloc); // keep track of the input that has already been allocated bool alloc_inp_tokens = false; bool alloc_inp_embd = false; bool alloc_inp_pos = false; - bool alloc_inp_Q_scale = false; - bool alloc_inp_KQ_scale = false; bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; -#ifdef LM_GGML_USE_CUBLAS - const bool do_offload = true; -#else - const bool do_offload = true; // TODO: set to false after finishing refactoring -#endif - - int n_non_view = 0; // number of non-view tensors that have been processed by the callback - // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) - // TODO: will be removed with backend v2 + // TODO: improve handling of input and output tensors, then replace this with lm_ggml_set_name llm_build_cb cb = [&](struct lm_ggml_tensor * cur, const char * name, int il) { if (il >= 0) { lm_ggml_format_name(cur, "%s-%d", name, il); @@ -5876,86 +5995,59 @@ static struct lm_ggml_cgraph * llama_build_graph( // // allocate input tensors and set input data // - // TODO: will be removed with backend v2 if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { - lm_ggml_allocr_alloc(lctx.alloc, cur); + lm_ggml_tallocr_alloc(lctx.alloc, cur); - if (!lm_ggml_allocr_is_measure(lctx.alloc) && batch.token) { + if (!lm_ggml_tallocr_is_measure(lctx.alloc) && batch.token) { const int64_t n_tokens = cur->ne[0]; - memcpy(cur->data, batch.token, n_tokens*lm_ggml_element_size(cur)); + lm_ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*lm_ggml_element_size(cur)); } alloc_inp_tokens = true; } - if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) { - lm_ggml_allocr_alloc(lctx.alloc, cur); + if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) { + lm_ggml_tallocr_alloc(lctx.alloc, cur); - if (!lm_ggml_allocr_is_measure(lctx.alloc) && batch.embd) { + if (!lm_ggml_tallocr_is_measure(lctx.alloc) && batch.embd) { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; - memcpy(cur->data, batch.embd, n_tokens*n_embd*lm_ggml_element_size(cur)); + lm_ggml_backend_tensor_set(cur, batch.embd, 0, n_tokens*n_embd*lm_ggml_element_size(cur)); } alloc_inp_embd = true; } if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { - lm_ggml_allocr_alloc(lctx.alloc, cur); + lm_ggml_tallocr_alloc(lctx.alloc, cur); - if (!lm_ggml_allocr_is_measure(lctx.alloc) && batch.pos) { + if (!lm_ggml_tallocr_is_measure(lctx.alloc) && batch.pos) { const int64_t n_tokens = cur->ne[0]; - int32_t * data = (int32_t *) cur->data; - - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } + static_assert(std::is_same::value, "llama_pos must be int32_t"); + lm_ggml_backend_tensor_set(cur, batch.pos, 0, n_tokens*lm_ggml_element_size(cur)); } alloc_inp_pos = true; } - if (!alloc_inp_Q_scale && strcmp(name, "Q_scale") == 0) { - lm_ggml_allocr_alloc(lctx.alloc, cur); - - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_embd_head = model.hparams.n_embd_head(); - lm_ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); - } - - alloc_inp_Q_scale = true; - } - - if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { - lm_ggml_allocr_alloc(lctx.alloc, cur); - - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_embd_head = model.hparams.n_embd_head(); - if (model.arch == LLM_ARCH_PHI2) { - // with phi2, we scale the Q to avoid precision issues - // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66 - lm_ggml_set_f32(cur, 1.0f); - } else { - lm_ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); - } - } - - alloc_inp_KQ_scale = true; - } - if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { - lm_ggml_allocr_alloc(lctx.alloc, cur); + lm_ggml_tallocr_alloc(lctx.alloc, cur); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + if (!lm_ggml_tallocr_is_measure(lctx.alloc)) { const int64_t n_kv = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; - float * data = (float *) cur->data; - memset(data, 0, lm_ggml_nbytes(cur)); + float * data; + if (lm_ggml_backend_buffer_is_host(cur->buffer)) { + data = (float *) cur->data; + } else { + lctx.buf_copy.resize(lm_ggml_nbytes(cur)); + data = (float *) lctx.buf_copy.data(); + } for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -5963,162 +6055,50 @@ static struct lm_ggml_cgraph * llama_build_graph( const llama_seq_id seq_id = batch.seq_id[j][0]; for (int i = 0; i < n_kv; ++i) { + float f; if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + f = -INFINITY; + } else { + f = 0; } + data[h*(n_kv*n_tokens) + j*n_kv + i] = f; } } } + + if (data != cur->data) { + lm_ggml_backend_tensor_set(cur, data, 0, lm_ggml_nbytes(cur)); + } } alloc_inp_KQ_mask = true; } if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { - lm_ggml_allocr_alloc(lctx.alloc, cur); + lm_ggml_tallocr_alloc(lctx.alloc, cur); - if (!lm_ggml_allocr_is_measure(lctx.alloc)) { + if (!lm_ggml_tallocr_is_measure(lctx.alloc)) { const int64_t n_ctx = cur->ne[0]; - int32_t * data = (int32_t *) cur->data; + int32_t * data; + if (lm_ggml_backend_buffer_is_host(cur->buffer)) { + data = (int32_t *) cur->data; + } else { + lctx.buf_copy.resize(lm_ggml_nbytes(cur)); + data = (int32_t *) lctx.buf_copy.data(); + } for (int i = 0; i < n_ctx; ++i) { data[i] = lctx.kv_self.cells[i].delta; } - } - - alloc_inp_K_shift = true; - } - - // view tensors are not processed further - if (cur->view_src != nullptr) { - return; - } - - if (cur->op != LM_GGML_OP_NONE) { - n_non_view++; - } - - // - // offload layers - // - // TODO: will be removed with backend v2 - -//#define LLAMA_OFFLOAD_DEBUG - - if (!do_offload) { - return; - } - - const int n_layer = model.hparams.n_layer; - - const int n_gpu_layers = model.n_gpu_layers; - const int i_gpu_start = n_layer - n_gpu_layers; - - // should we offload the final norm? yes if we are not computing embeddings - const bool offload_emb = lctx.embedding.empty(); - - static const std::unordered_map> k_offload_func_name = { - { OFFLOAD_FUNC_NOP, "CPU" }, - { OFFLOAD_FUNC_OUT, "CPU" }, -#ifdef LM_GGML_USE_CUBLAS - { OFFLOAD_FUNC, "GPU (CUDA)" }, - { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" }, - { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" }, - { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, - { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, -#else - { OFFLOAD_FUNC, "CPU" }, - { OFFLOAD_FUNC_FRC, "CPU" }, - { OFFLOAD_FUNC_KQV, "CPU" }, - { OFFLOAD_FUNC_NR, "CPU" }, - { OFFLOAD_FUNC_EMB, "CPU" }, -#endif // LM_GGML_USE_CUBLAS - }; - - // check the global map for what offload function to use for this tensor - llm_offload_func_e func_e = k_offload_func_trie.find(name); - if (func_e == OFFLOAD_FUNC_NOP) { -#ifdef LLAMA_OFFLOAD_DEBUG - // if a tensor hasn't been offloaded, we warn the user - if (worst_case) { - LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, - cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); - } -#endif - - return; - } - - // count the number of layers and respect the provided n_gpu_layers - switch (func_e) { - case OFFLOAD_FUNC_NOP: - case OFFLOAD_FUNC_OUT: - break; - case OFFLOAD_FUNC: - if (n_gpu_layers < n_layer) { - if (il < i_gpu_start) { - func_e = OFFLOAD_FUNC_NOP; - } + if (data != cur->data) { + lm_ggml_backend_tensor_set(cur, data, 0, lm_ggml_nbytes(cur)); } - break; - case OFFLOAD_FUNC_FRC: - if (!lctx.cparams.offload_kqv) { - func_e = OFFLOAD_FUNC_NOP; - } break; - case OFFLOAD_FUNC_KQV: - if (!lctx.cparams.offload_kqv) { - func_e = OFFLOAD_FUNC_NOP; - } else { - if (n_gpu_layers < n_layer) { - if (il < i_gpu_start) { - func_e = OFFLOAD_FUNC_NOP; - } - } - } - break; - case OFFLOAD_FUNC_NR: - if (n_gpu_layers <= n_layer + 0) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - case OFFLOAD_FUNC_EMB: - if (!offload_emb || n_gpu_layers < n_layer) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - default: LM_GGML_ASSERT(false); - } - - offload_func_t func = lm_ggml_offload_nop; - - // this is needed for compatibility with Metal for example -#ifdef LM_GGML_USE_CUBLAS - static offload_func_t lm_ggml_offload_gpu = lm_ggml_cuda_assign_buffers_no_alloc; -#else - static offload_func_t lm_ggml_offload_gpu = lm_ggml_offload_nop; -#endif - - switch (func_e) { - case OFFLOAD_FUNC_NOP: - case OFFLOAD_FUNC_OUT: func = lm_ggml_offload_nop; break; - case OFFLOAD_FUNC: - case OFFLOAD_FUNC_KQV: - case OFFLOAD_FUNC_FRC: - case OFFLOAD_FUNC_NR: - case OFFLOAD_FUNC_EMB: func = lm_ggml_offload_gpu; break; - default: LM_GGML_ASSERT(false); - } - - // apply offload function to the tensor - func(cur); + } -#ifdef LLAMA_OFFLOAD_DEBUG - if (worst_case) { - LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); + alloc_inp_K_shift = true; } -#endif }; struct lm_ggml_cgraph * result = NULL; @@ -6172,33 +6152,20 @@ static struct lm_ggml_cgraph * llama_build_graph( { result = llm.build_phi2(); } break; + case LLM_ARCH_PLAMO: + { + result = llm.build_plamo(); + } break; + case LLM_ARCH_GPT2: + { + result = llm.build_gpt2(); + } break; default: - LM_GGML_ASSERT(false); - } - - llm.free(); - - if (worst_case) { - int n_non_view_total = 0; - - for (int i = 0; i < result->n_nodes; ++i) { - if (result->nodes[i]->view_src == nullptr) { - n_non_view_total++; - } - } - - LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); - - if (n_non_view != n_non_view_total) { - LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); - LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); - LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__); - LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__); - LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); - LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); - } + LM_GGML_ASSERT(false); } + llm.free(); + return result; } @@ -6244,8 +6211,6 @@ static int llama_decode_internal( auto & kv_self = lctx.kv_self; - LM_GGML_ASSERT(!!kv_self.ctx); - const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; @@ -6299,12 +6264,10 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - lm_ggml_allocr_reset(lctx.alloc); + lm_ggml_backend_sched_reset(lctx.sched); lm_ggml_cgraph * gf = llama_build_graph(lctx, batch); - lm_ggml_allocr_alloc_graph(lctx.alloc, gf); - // the output is always the last tensor in the graph struct lm_ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; LM_GGML_ASSERT(strcmp(res->name, "result_output") == 0); @@ -6316,29 +6279,6 @@ static int llama_decode_internal( LM_GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); } -#ifdef LM_GGML_USE_CUBLAS - for (int i = 0; i < gf->n_leafs; i++) { - lm_ggml_tensor * node = gf->leafs[i]; - if (node->backend == LM_GGML_BACKEND_GPU && node->extra == NULL) { - lm_ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); - lm_ggml_cuda_copy_to_device(node); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - lm_ggml_tensor * node = gf->nodes[i]; - if (node->backend == LM_GGML_BACKEND_GPU && node->extra == NULL) { - lm_ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); - } - } - - // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed - if (!lctx.embedding.empty()) { - embeddings->backend = LM_GGML_BACKEND_CPU; - } - res->backend = LM_GGML_BACKEND_CPU; -#endif - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (lm_ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); // for big prompts, if BLAS is enabled, it is better to use only one thread @@ -6355,23 +6295,25 @@ static int llama_decode_internal( n_threads = 1; } -#if LM_GGML_USE_MPI +#ifdef LM_GGML_USE_MPI const int64_t n_layer = hparams.n_layer; lm_ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); #endif #ifdef LM_GGML_USE_METAL - if (lctx.ctx_metal) { - lm_ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); - lm_ggml_metal_graph_compute(lctx.ctx_metal, gf); - } else { - lm_ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); + if (lm_ggml_backend_is_metal(lctx.backend_metal)) { + lm_ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); } -#else - lm_ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); #endif -#if LM_GGML_USE_MPI + if (lctx.backend_cpu != nullptr) { + lm_ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); + } + lm_ggml_backend_sched_graph_compute(lctx.sched, gf); + + // fprintf(stderr, "splits: %d\n", lm_ggml_backend_sched_get_n_splits(lctx.sched)); + +#ifdef LM_GGML_USE_MPI lm_ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); #endif @@ -6417,30 +6359,33 @@ static int llama_decode_internal( logits_out.clear(); #endif + lm_ggml_backend_t res_backend = lm_ggml_backend_sched_get_node_backend(lctx.sched, res); + LM_GGML_ASSERT(res_backend != nullptr); if (batch.logits) { logits_out.resize(n_vocab * n_tokens); for (uint32_t i = 0; i < n_tokens; i++) { if (batch.logits[i] == 0) { continue; } - memcpy(logits_out.data() + (n_vocab*i), (float *) lm_ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); + lm_ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float)); #ifndef NDEBUG logits_valid[i] = true; #endif } } else if (lctx.logits_all) { logits_out.resize(n_vocab * n_tokens); - memcpy(logits_out.data(), (float *) lm_ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); + lm_ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float)); #ifndef NDEBUG std::fill(logits_valid.begin(), logits_valid.end(), true); #endif } else { logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) lm_ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); + lm_ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float)); #ifndef NDEBUG logits_valid[0] = true; #endif } + lm_ggml_backend_synchronize(res_backend); } // extract embeddings @@ -6448,7 +6393,9 @@ static int llama_decode_internal( auto & embedding_out = lctx.embedding; embedding_out.resize(n_embd); - memcpy(embedding_out.data(), (float *) lm_ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd); + lm_ggml_backend_t embeddings_backend = lm_ggml_backend_sched_get_node_backend(lctx.sched, embeddings); + lm_ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float)); + lm_ggml_backend_synchronize(embeddings_backend); } // measure the performance only for the single-token evals @@ -6519,15 +6466,15 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { static const char * hex = "0123456789ABCDEF"; switch (llama_vocab_get_type(vocab)) { - case LLAMA_VOCAB_TYPE_SPM: { - const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; - return vocab.token_to_id.at(buf); - } - case LLAMA_VOCAB_TYPE_BPE: { - return vocab.token_to_id.at(bytes_to_unicode_bpe(ch)); - } - default: - LM_GGML_ASSERT(false); + case LLAMA_VOCAB_TYPE_SPM: { + const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; + return vocab.token_to_id.at(buf); + } + case LLAMA_VOCAB_TYPE_BPE: { + return vocab.token_to_id.at(bytes_to_unicode_bpe(ch)); + } + default: + LM_GGML_ASSERT(false); } } @@ -7061,7 +7008,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break; #ifdef PRETOKENIZERDEBUG - fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); + LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); #endif auto source = std::distance(buffer.begin(), it); @@ -7074,7 +7021,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length); #ifdef PRETOKENIZERDEBUG - fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str()); + LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str()); #endif it++; } @@ -7090,7 +7037,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length); #ifdef PRETOKENIZERDEBUG - fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str()); + LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str()); #endif it++; @@ -7106,7 +7053,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< raw_text_base_length = right_reminder_length; #ifdef PRETOKENIZERDEBUG - fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); + LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str()); #endif } else { if (source == 0) { @@ -7163,7 +7110,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & } #ifdef PRETOKENIZERDEBUG - fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif llm_tokenizer_spm tokenizer(vocab); llama_escape_whitespace(raw_text); @@ -7184,7 +7131,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); #ifdef PRETOKENIZERDEBUG - fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); + LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str()); #endif llm_tokenizer_bpe tokenizer(vocab); tokenizer.tokenize(raw_text, output); @@ -7637,7 +7584,7 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c } } -void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) { +void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) { const int64_t t_start_sample_us = lm_ggml_time_us(); k = std::max(k, (int) min_keep); @@ -7997,7 +7944,7 @@ void llama_sample_classifier_free_guidance( } } -llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) { +llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) { LM_GGML_ASSERT(ctx); auto N = float(llama_n_vocab(llama_get_model(ctx))); @@ -8406,12 +8353,6 @@ void llama_beam_search(llama_context * ctx, // quantization // -template -struct no_init { - T value; - no_init() { /* do nothing */ } -}; - struct quantize_state_internal { const llama_model & model; const llama_model_quantize_params * params; @@ -8507,9 +8448,23 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = LM_GGML_TYPE_Q8_0; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + new_type = LM_GGML_TYPE_Q5_K; + } else if (new_type != LM_GGML_TYPE_Q8_0) { new_type = LM_GGML_TYPE_Q6_K; } + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (name.find("attn_v.weight") != std::string::npos) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = LM_GGML_TYPE_Q4_K; + else new_type = LM_GGML_TYPE_Q2_K; + ++qs.i_attention_wv; + } + else if (name.find("ffn_down") != std::string::npos) { + if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = LM_GGML_TYPE_Q2_K; + ++qs.i_feed_forward_w2; + } + else if (name == "token_embd.weight") new_type = LM_GGML_TYPE_Q2_K; } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = LM_GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { @@ -8539,11 +8494,32 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type // TODO: explore better strategies new_type = LM_GGML_TYPE_Q8_0; } - } else if (name.find("ffn_down.weight") != std::string::npos) { + } else if (name.find("ffn_down") != std::string::npos) { + const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); + int i_layer, n_layer; + if (n_expert == 1) { + i_layer = qs.i_feed_forward_w2; + n_layer = qs.n_feed_forward_w2; + } else { + // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly + // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work + // for getting the current layer as I initially thought, and we need to resort to parsing the + // tensor name. + n_layer = qs.n_feed_forward_w2 / n_expert; + if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) { + throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str())); + } + if (i_layer < 0 || i_layer >= n_layer) { + throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer)); + } + } if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = LM_GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { + if (i_layer < n_layer/8) new_type = LM_GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = qs.i_feed_forward_w2 < 2 ? LM_GGML_TYPE_Q5_K - : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? LM_GGML_TYPE_Q4_K + new_type = i_layer < n_layer/16 ? LM_GGML_TYPE_Q5_K + : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? LM_GGML_TYPE_Q4_K : LM_GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { @@ -8551,22 +8527,29 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { if (arch == LLM_ARCH_FALCON) { - new_type = qs.i_feed_forward_w2 < 2 ? LM_GGML_TYPE_Q6_K : - use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? LM_GGML_TYPE_Q5_K : LM_GGML_TYPE_Q4_K; + new_type = i_layer < n_layer/16 ? LM_GGML_TYPE_Q6_K : + use_more_bits(i_layer, n_layer) ? LM_GGML_TYPE_Q5_K : LM_GGML_TYPE_Q4_K; } else { - if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = LM_GGML_TYPE_Q6_K; + if (use_more_bits(i_layer, n_layer)) new_type = LM_GGML_TYPE_Q6_K; } } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = LM_GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = LM_GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { new_type = LM_GGML_TYPE_Q5_K; } ++qs.i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = LM_GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = LM_GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = LM_GGML_TYPE_Q5_K; + if (qs.model.hparams.n_expert == 8) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || + ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { + new_type = LM_GGML_TYPE_Q5_K; + } + } else { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = LM_GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = LM_GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = LM_GGML_TYPE_Q5_K; + } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = LM_GGML_TYPE_Q4_K; } @@ -8576,9 +8559,10 @@ static lm_ggml_type get_k_quant_type(quantize_state_internal & qs, lm_ggml_type else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = LM_GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = LM_GGML_TYPE_Q6_K; } - else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = LM_GGML_TYPE_Q3_K; - } + // IK: let's remove this, else Q2_K is almost the same as Q3_K_S + //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) { + // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = LM_GGML_TYPE_Q3_K; + //} // This can be used to reduce the size of the Q5_K_S model. // The associated PPL increase is fully in line with the size reduction //else { @@ -8627,6 +8611,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = LM_GGML_TYPE_Q2_K; break; + case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = LM_GGML_TYPE_Q2_K; break; case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_M: case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = LM_GGML_TYPE_Q3_K; break; @@ -8635,6 +8620,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q5_K_S: case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = LM_GGML_TYPE_Q5_K; break; case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = LM_GGML_TYPE_Q6_K; break; + case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = LM_GGML_TYPE_IQ2_XXS; break; + case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = LM_GGML_TYPE_IQ2_XS; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -8654,9 +8641,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s #endif llama_model_loader ml(fname_inp, use_mmap, NULL); - if (ml.use_mmap) { - ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, lm_ggml_is_numa())); - } + ml.init_mapping(false); // no prefetching? llama_model model; llm_load_arch(ml, model); @@ -8667,6 +8652,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (params->only_copy) { ftype = model.ftype; } + const std::unordered_map> * imatrix_data = nullptr; + if (params->imatrix) { + imatrix_data = static_cast>*>(params->imatrix); + if (imatrix_data) { + LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size())); + } + } const size_t align = LM_GGUF_DEFAULT_ALIGNMENT; struct lm_gguf_context * ctx_out = lm_gguf_init_empty(); @@ -8685,7 +8677,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { ++qs.n_attention_wv; } - else if (name.find("ffn_down.weight") != std::string::npos) { + else if (name.find("ffn_down") != std::string::npos) { ++qs.n_feed_forward_w2; } } @@ -8724,6 +8716,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // placeholder for the meta data ::zeros(fout, meta_size); + std::set used_iq2; + for (int i = 0; i < ml.n_tensors; ++i) { struct lm_ggml_tensor * tensor = ml.get_tensor_meta(i); @@ -8776,6 +8770,35 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else { const size_t nelements = lm_ggml_nelements(tensor); + if ((new_type == LM_GGML_TYPE_IQ2_XXS || new_type == LM_GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) { + lm_ggml_init_iq2_quantization(new_type); + used_iq2.insert(new_type); + } + + const float * imatrix = nullptr; + if (imatrix_data) { + auto it = imatrix_data->find(tensor->name); + if (it == imatrix_data->end()) { + LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); + } else { + if (it->second.size() == (size_t)tensor->ne[0]) { + imatrix = it->second.data(); + } else { + LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__, + int(it->second.size()), int(tensor->ne[0]), tensor->name); + } + } + } + if ((new_type == LM_GGML_TYPE_IQ2_XXS || + new_type == LM_GGML_TYPE_IQ2_XS || + (new_type == LM_GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { + LLAMA_LOG_ERROR("\n\n============================================================\n"); + LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); + LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); + LLAMA_LOG_ERROR("============================================================\n\n"); + throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name)); + } + float * f32_data; if (tensor->type == LM_GGML_TYPE_F32) { @@ -8796,21 +8819,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_data = work.data(); std::array hist_cur = {}; - static const int chunk_size = 32 * 512; + const int n_per_row = tensor->ne[0]; + const int nrows = nelements / n_per_row; + + static const int min_chunk_size = 32 * 512; + const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row); + const int nchunk = (nelements + chunk_size - 1)/chunk_size; const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; if (nthread_use < 2) { - new_size = lm_ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); + new_size = lm_ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix); } else { - size_t counter = 0; + int counter = 0; new_size = 0; - auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() { + auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size, + nrows, n_per_row, imatrix]() { std::array local_hist = {}; + const int nrows_per_chunk = chunk_size / n_per_row; size_t local_size = 0; while (true) { std::unique_lock lock(mutex); - size_t first = counter; counter += chunk_size; - if (first >= nelements) { + int first_row = counter; counter += nrows_per_chunk; + if (first_row >= nrows) { if (local_size > 0) { for (int j=0; j %8.2f MiB | hist: ", lm_ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", lm_ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); int64_t tot_count = 0; for (size_t i = 0; i < hist_cur.size(); i++) { hist_all[i] += hist_cur[i]; @@ -8840,6 +8871,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } if (tot_count > 0) { + LLAMA_LOG_INFO(" | hist: "); for (size_t i = 0; i < hist_cur.size(); i++) { LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements)); } @@ -8868,6 +8900,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s fout.close(); + for (auto type : used_iq2) { + lm_ggml_deinit_iq2_quantization(type); + } + lm_gguf_free(ctx_out); LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); @@ -8925,67 +8961,23 @@ static int llama_apply_lora_from_file_internal( LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); - // create a name -> tensor map of the model to accelerate lookups - // find the max tensor size to estimate the required temporary buffer size - size_t max_tensor_size = 0; - std::unordered_map model_tensors; - for (const auto & kv : model.tensors_by_name) { - model_tensors.insert(kv); - size_t f32_size = lm_ggml_nelements(kv.second) * sizeof(float); - max_tensor_size = std::max(max_tensor_size, f32_size); - } - - // create a temporary ggml context to store the lora tensors - // TODO: use ggml-alloc - size_t lora_ctx_size = max_tensor_size * 3; - LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0); - std::vector lora_buf(lora_ctx_size); - - struct lm_ggml_init_params params; - params.mem_size = lora_buf.size(); - params.mem_buffer = lora_buf.data(); - params.no_alloc = false; - - using unique_context = std::unique_ptr; - - unique_context lora_ctx(nullptr, lm_ggml_free); - lora_ctx.reset(lm_ggml_init(params)); - std::unordered_map lora_tensors; - // load base model std::unique_ptr ml; - - unique_context base_ctx(nullptr, lm_ggml_free); - std::vector base_buf; if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); - ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL)); - - size_t ctx_size; - size_t mmapped_size; - ml->calc_sizes(ctx_size, mmapped_size); - - base_buf.resize(ctx_size); - - lm_ggml_init_params base_params; - base_params.mem_size = base_buf.size(); - base_params.mem_buffer = base_buf.data(); - base_params.no_alloc = ml->use_mmap; - - base_ctx.reset(lm_ggml_init(base_params)); - - // maybe this should be in llama_model_loader - if (ml->use_mmap) { - ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, lm_ggml_is_numa())); - } + ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr)); + ml->init_mapping(/*prefetch*/ false); // no prefetching } - // read tensors and apply - bool warned = false; - int n_tensors = 0; - - std::vector work_buffer; + struct tensor_meta { + std::string name; + lm_ggml_type type; + int32_t ne[2]; + size_t offset; + }; + std::map tensor_meta_map; + // load all tensor meta while (true) { if (fin.tell() == fin.size) { // eof @@ -8998,7 +8990,7 @@ static int llama_apply_lora_from_file_internal( fin.read_raw(&n_dims, sizeof(n_dims)); fin.read_raw(&name_len, sizeof(name_len)); - fin.read_raw(&ftype, sizeof(ftype)); + fin.read_raw(&ftype, sizeof(ftype)); if (n_dims != 1 && n_dims != 2) { LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); @@ -9012,31 +9004,23 @@ static int llama_apply_lora_from_file_internal( std::string name; { - LM_GGML_ASSERT(name_len <= 1024); - char buf[1024]; + LM_GGML_ASSERT(name_len < LM_GGML_MAX_NAME); + char buf[LM_GGML_MAX_NAME]; fin.read_raw(buf, name_len); name = std::string(buf, name_len); } - // check for lora suffix and get the type of tensor - const std::string lora_suffix = ".lora"; - size_t pos = name.rfind(lora_suffix); - if (pos == std::string::npos) { - LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); - return 1; + // check for lora suffix + std::string lora_suffix; + if (name.length() > 6) { + lora_suffix = name.substr(name.length() - 6); } - - std::string lora_type = name.substr(pos + lora_suffix.length()); - std::string base_name = name; - base_name.erase(pos); - // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str()); - - if (model_tensors.find(base_name) == model_tensors.end()) { - LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); + if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { + LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); return 1; } - // create ggml tensor + // tensor type lm_ggml_type wtype; switch (ftype) { case 0: wtype = LM_GGML_TYPE_F32; break; @@ -9048,125 +9032,177 @@ static int llama_apply_lora_from_file_internal( return false; } } - lm_ggml_tensor * lora_tensor = lm_ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]); - lm_ggml_set_name(lora_tensor, name.c_str()); - // load tensor data + // data offset size_t offset = fin.tell(); - size_t tensor_data_size = lm_ggml_nbytes(lora_tensor); offset = (offset + 31) & -32; - fin.seek(offset, SEEK_SET); - fin.read_raw(lora_tensor->data, tensor_data_size); - lora_tensors[name] = lora_tensor; + // skip tensor data + fin.seek(offset + lm_ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); - // check if we have both A and B tensors and apply - if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && - lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { + tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); + } - lm_ggml_tensor * dest_t = model_tensors[base_name]; + bool warned = false; + int n_tensors = 0; - offload_func_t offload_func = lm_ggml_offload_nop; - offload_func_t offload_func_force_inplace = lm_ggml_offload_nop; + // apply + lm_ggml_backend_t backend_cpu = lm_ggml_backend_cpu_init(); + if (backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); + return 1; + } + lm_ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); -#ifdef LM_GGML_USE_CUBLAS - if (dest_t->backend == LM_GGML_BACKEND_GPU || dest_t->backend == LM_GGML_BACKEND_GPU_SPLIT) { - if (dest_t->type != LM_GGML_TYPE_F16) { - throw std::runtime_error(format( - "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type)); - } - offload_func = lm_ggml_cuda_assign_buffers; - offload_func_force_inplace = lm_ggml_cuda_assign_buffers_force_inplace; - } -#endif // LM_GGML_USE_CUBLAS + std::vector> read_buf; + for (const auto & it : model.tensors_by_name) { + const std::string & base_name = it.first; + lm_ggml_tensor * model_t = it.second; - lm_ggml_tensor * base_t; - if (ml) { - struct lm_gguf_context * ctx_gguf = ml->ctx_gguf; + if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || + tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { + continue; + } - // load from base model - if (lm_gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { - LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); - return 1; - } + tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); + tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); - base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, LM_GGML_BACKEND_CPU); - ml->load_data_for(base_t); - } else { - base_t = dest_t; - } + lm_ggml_init_params lora_init_params = { + /* .mem_size */ lm_ggml_tensor_overhead()*128 + lm_ggml_graph_overhead(), + /* .mem_buffer */ nullptr, + /* .no_alloc */ true, + }; + lm_ggml_context * lora_ctx = lm_ggml_init(lora_init_params); + if (lora_ctx == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); + lm_ggml_backend_free(backend_cpu); + return 1; + } - if (lm_ggml_is_quantized(base_t->type)) { - if (!warned) { - LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " - "use a f16 or f32 base model with --lora-base\n", __func__); - warned = true; - } + // create tensors + lm_ggml_tensor * loraA = lm_ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]); + lm_ggml_tensor * loraB = lm_ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); + lm_ggml_set_name(loraA, metaA.name.c_str()); + lm_ggml_set_name(loraB, metaB.name.c_str()); + + lm_ggml_tensor * base_t; + if (ml) { + if (lm_gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) { + LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); + return 1; } + base_t = lm_ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); + } else { + base_t = lm_ggml_dup_tensor(lora_ctx, model_t); + } + lm_ggml_set_name(base_t, base_name.c_str()); + + // allocate in backend buffer + lm_ggml_backend_buffer_t lora_buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, lm_ggml_backend_cpu_buffer_type()); + if (lora_buf == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); + return 1; + } - lm_ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; - LM_GGML_ASSERT(loraA->type == LM_GGML_TYPE_F32); - lm_ggml_set_name(loraA, "loraA"); + // load tensor data + auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, lm_ggml_tensor * tensor) { + read_buf.resize(lm_ggml_nbytes(tensor)); + fin.seek(tensor_meta.offset, SEEK_SET); + fin.read_raw(read_buf.data(), lm_ggml_nbytes(tensor)); + lm_ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); + }; + load_tensor(metaA, loraA); + load_tensor(metaB, loraB); + + // load base model tensor data + if (ml) { + ml->load_data_for(base_t); + } else { + lm_ggml_backend_tensor_copy(model_t, base_t); + } - lm_ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; - LM_GGML_ASSERT(loraB->type == LM_GGML_TYPE_F32); - lm_ggml_set_name(loraB, "loraB"); + if (lm_ggml_is_quantized(base_t->type) && !warned) { + LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } - if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { - LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" - " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); - return 1; - } + if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { + LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" + " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + lm_ggml_free(lora_ctx); + lm_ggml_backend_buffer_free(lora_buf); + lm_ggml_backend_free(backend_cpu); + return 1; + } + auto build_lora_graph = [&]() { // w = w + BA*s - lm_ggml_tensor * BA = lm_ggml_mul_mat(lora_ctx.get(), loraA, loraB); - offload_func(BA); + lm_ggml_tensor * BA = lm_ggml_mul_mat(lora_ctx, loraA, loraB); lm_ggml_set_name(BA, "BA"); if (scaling != 1.0f) { - lm_ggml_tensor * scale_tensor = lm_ggml_new_f32(lora_ctx.get(), scaling); - lm_ggml_set_name(scale_tensor, "scale_tensor"); - - BA = lm_ggml_scale_inplace(lora_ctx.get(), BA, scale_tensor); - offload_func(BA); + BA = lm_ggml_scale(lora_ctx, BA, scaling); lm_ggml_set_name(BA, "BA_scaled"); } lm_ggml_tensor * r; - if (base_t == dest_t) { - r = lm_ggml_add_inplace(lora_ctx.get(), dest_t, BA); - offload_func_force_inplace(r); - lm_ggml_set_name(r, "r_add_inplace"); - } - else { - r = lm_ggml_add(lora_ctx.get(), base_t, BA); - offload_func(r); - lm_ggml_set_name(r, "r_add"); + r = lm_ggml_add_inplace(lora_ctx, base_t, BA); + lm_ggml_set_name(r, "r_add"); - r = lm_ggml_cpy(lora_ctx.get(), r, dest_t); - offload_func(r); - lm_ggml_set_name(r, "r_cpy"); + if (base_t->type != model_t->type) { + // convert the result to the model type + r = lm_ggml_cast(lora_ctx, r, model_t->type); + lm_ggml_set_name(r, "r_cast"); } - struct lm_ggml_cgraph * gf = lm_ggml_new_graph(lora_ctx.get()); - lm_ggml_build_forward_expand(gf, r); + return r; + }; + + lm_ggml_cgraph * gf = lm_ggml_new_graph(lora_ctx); + lm_ggml_tensor * r = build_lora_graph(); + lm_ggml_build_forward_expand(gf, r); + + lm_ggml_backend_buffer_t graph_buf = lm_ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, lm_ggml_backend_cpu_buffer_type()); + if (graph_buf == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); + lm_ggml_free(lora_ctx); + lm_ggml_backend_buffer_free(lora_buf); + lm_ggml_backend_free(backend_cpu); + return 1; + } - lm_ggml_graph_compute_helper(work_buffer, gf, n_threads); + lm_ggml_backend_graph_compute(backend_cpu, gf); - // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other - LM_GGML_ASSERT(lora_tensors.size() == 2); + lm_ggml_backend_tensor_set(model_t, r->data, 0, lm_ggml_nbytes(r)); - // we won't need these tensors again, reset the context to save memory - lora_ctx.reset(lm_ggml_init(params)); - lora_tensors.clear(); +#if 0 + // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU + //lm_ggml_backend_sched_t sched = lm_ggml_backend_sched_new(backends.data(), backends.size(), LM_GGML_DEFAULT_GRAPH_SIZE); - n_tensors++; - if (n_tensors % 4 == 0) { - LLAMA_LOG_INFO("."); - } + // sched compute + lm_ggml_build_forward_expand(gf, build_graph()); + lm_ggml_backend_sched_init_measure(sched, gf); + + // create the graph again, since the previous one was destroyed by the measure + lm_ggml_graph_clear(gf); + lm_ggml_build_forward_expand(gf, build_graph()); + lm_ggml_backend_sched_graph_compute(sched, gf); + lm_ggml_backend_sched_free(sched); +#endif + + lm_ggml_backend_buffer_free(lora_buf); + lm_ggml_backend_buffer_free(graph_buf); + lm_ggml_free(lora_ctx); + + n_tensors++; + if (n_tensors % 4 == 0) { + LLAMA_LOG_INFO("."); } } + lm_ggml_backend_free(backend_cpu); + const int64_t t_lora_us = lm_ggml_time_us() - t_start_lora_us; LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); @@ -9179,6 +9215,7 @@ static int llama_apply_lora_from_file_internal( struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.n_gpu_layers =*/ 0, + /*.split_mode =*/ LLAMA_SPLIT_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, @@ -9190,7 +9227,8 @@ struct llama_model_params llama_model_default_params() { }; #ifdef LM_GGML_USE_METAL - result.n_gpu_layers = 1; + // note: we usually have plenty of VRAM, so by default offload all layers to the GPU + result.n_gpu_layers = 999; #endif return result; @@ -9230,12 +9268,13 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.quantize_output_tensor =*/ true, /*.only_copy =*/ false, /*.pure =*/ false, + /*.imatrix =*/ nullptr, }; return result; } -int llama_max_devices(void) { +int32_t llama_max_devices(void) { return LLAMA_MAX_DEVICES; } @@ -9296,11 +9335,18 @@ struct llama_model * llama_load_model_from_file( LLAMA_LOG_INFO("\n"); } } + return true; }; } - if (!llama_model_load(path_model, *model, params)) { - LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + int status = llama_model_load(path_model, *model, params); + LM_GGML_ASSERT(status <= 0); + if (status < 0) { + if (status == -1) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + } else if (status == -2) { + LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); + } delete model; return nullptr; } @@ -9370,12 +9416,56 @@ struct llama_context * llama_new_context_with_model( const lm_ggml_type type_k = params.type_k; const lm_ggml_type type_v = params.type_v; - LM_GGML_ASSERT(hparams.n_embd_head() % lm_ggml_blck_size(type_k) == 0); - LM_GGML_ASSERT(hparams.n_embd_head() % lm_ggml_blck_size(type_v) == 0); + LM_GGML_ASSERT(hparams.n_embd_head_k % lm_ggml_blck_size(type_k) == 0); + LM_GGML_ASSERT(hparams.n_embd_head_v % lm_ggml_blck_size(type_v) == 0); - // reserve memory for context buffers if (!hparams.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { + // initialize backends +#ifdef LM_GGML_USE_METAL + if (model->n_gpu_layers > 0) { + ctx->backend_metal = lm_ggml_backend_metal_init(); + if (ctx->backend_metal == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(ctx->backend_metal); + } +#elif defined(LM_GGML_USE_CUBLAS) + if (model->n_gpu_layers > 0) { + // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) { + lm_ggml_backend_t backend = lm_ggml_backend_cuda_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_LAYER requires a backend for each GPU + for (int device = 0; device < lm_ggml_backend_cuda_get_device_count(); ++device) { + lm_ggml_backend_t backend = lm_ggml_backend_cuda_init(device); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } + } + } +#endif + ctx->backend_cpu = lm_ggml_backend_cpu_init(); + if (ctx->backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(ctx->backend_cpu); + + if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, + cparams.n_ctx, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -9399,24 +9489,30 @@ struct llama_context * llama_new_context_with_model( lm_ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } - // resized during inference - if (params.logits_all) { - ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab); - } else { - ctx->logits.reserve(hparams.n_vocab); - } + // resized during inference, reserve maximum + ctx->logits.reserve(hparams.n_vocab*cparams.n_batch); if (params.embedding){ ctx->embedding.resize(hparams.n_embd); } { - static const size_t tensor_alignment = 32; - // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data - ctx->buf_compute.resize(lm_ggml_tensor_overhead()*LLAMA_MAX_NODES + lm_ggml_graph_overhead()); + // buffer types used for the compute buffer of each backend + std::vector backend_buft; + for (auto * backend : ctx->backends) { + if (lm_ggml_backend_is_cpu(backend)) { + // use host buffers for the CPU backend compute buffer + backend_buft.push_back(llama_default_buffer_type_cpu(true)); + } else { + backend_buft.push_back(lm_ggml_backend_get_default_buffer_type(backend)); + } + } + + // buffer used to store the computation graph and the tensor meta data + ctx->buf_compute_meta.resize(lm_ggml_tensor_overhead()*LLAMA_MAX_NODES + lm_ggml_graph_overhead()); - // create measure allocator - ctx->alloc = lm_ggml_allocr_new_measure(tensor_alignment); + ctx->sched = lm_ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES); + ctx->alloc = lm_ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); // build worst-case graph int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); @@ -9424,98 +9520,20 @@ struct llama_context * llama_new_context_with_model( llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph lm_ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); -#ifdef LM_GGML_USE_METAL - if (model->n_gpu_layers > 0) { - ctx->ctx_metal = lm_ggml_metal_init(1); - if (!ctx->ctx_metal) { - LLAMA_LOG_ERROR("%s: lm_ggml_metal_init() failed\n", __func__); - llama_free(ctx); - return NULL; - } - //lm_ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); - //lm_ggml_allocr_set_parse_seq(ctx->alloc, lm_ggml_metal_get_concur_list(ctx->ctx_metal), lm_ggml_metal_if_optimized(ctx->ctx_metal)); - } -#endif - // measure memory requirements for the graph - size_t alloc_size = lm_ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment; - - LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0); - - // recreate allocator with exact memory requirements - lm_ggml_allocr_free(ctx->alloc); - - ctx->buf_alloc.resize(alloc_size); - ctx->alloc = lm_ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment); -#ifdef LM_GGML_USE_METAL - if (ctx->ctx_metal) { - //lm_ggml_allocr_set_parse_seq(ctx->alloc, lm_ggml_metal_get_concur_list(ctx->ctx_metal), lm_ggml_metal_if_optimized(ctx->ctx_metal)); - } -#endif -#ifdef LM_GGML_USE_CUBLAS - lm_ggml_cuda_set_scratch_size(alloc_size); - LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0); - - // calculate total VRAM usage - auto add_tensor = [](const lm_ggml_tensor * t, size_t & size) { - if (t->backend == LM_GGML_BACKEND_GPU || t->backend == LM_GGML_BACKEND_GPU_SPLIT) { - size += lm_ggml_nbytes(t); - } - }; - size_t model_vram_size = 0; - for (const auto & kv : model->tensors_by_name) { - add_tensor(kv.second, model_vram_size); - } - - size_t kv_vram_size = 0; - for (auto & k : ctx->kv_self.k_l) { - add_tensor(k, kv_vram_size); - } - for (auto & v : ctx->kv_self.v_l) { - add_tensor(v, kv_vram_size); - } - - size_t ctx_vram_size = alloc_size + kv_vram_size; - size_t total_vram_size = model_vram_size + ctx_vram_size; + // initialize scheduler with the worst-case graph + lm_ggml_backend_sched_init_measure(ctx->sched, gf); + // note: the number of splits during measure is higher than during inference due to the kv shift + int n_splits = lm_ggml_backend_sched_get_n_splits(ctx->sched); + LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits); + ctx->alloc = lm_ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); - LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__, - total_vram_size / 1024.0 / 1024.0, - model_vram_size / 1024.0 / 1024.0, - ctx_vram_size / 1024.0 / 1024.0); -#endif - } - -#ifdef LM_GGML_USE_METAL - if (model->n_gpu_layers > 0) { - // this allocates all Metal resources and memory buffers - - void * data_ptr = NULL; - size_t data_size = 0; - - if (ctx->model.mapping) { - data_ptr = ctx->model.mapping->addr; - data_size = ctx->model.mapping->size; - } else { - data_ptr = lm_ggml_get_mem_buffer(ctx->model.ctx); - data_size = lm_ggml_get_mem_size (ctx->model.ctx); + for (lm_ggml_backend_t backend : ctx->backends) { + lm_ggml_backend_buffer_t buf = lm_ggml_backend_sched_get_buffer(ctx->sched, backend); + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + lm_ggml_backend_buffer_name(buf), + lm_ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); } - - const size_t max_size = lm_ggml_get_max_tensor_size(ctx->model.ctx); - - LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0); - -#define LLAMA_METAL_CHECK_BUF(result) \ - if (!(result)) { \ - LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \ - llama_free(ctx); \ - return NULL; \ - } - - LLAMA_METAL_CHECK_BUF(lm_ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - LLAMA_METAL_CHECK_BUF(lm_ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); - LLAMA_METAL_CHECK_BUF(lm_ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0)); -#undef LLAMA_METAL_CHECK_BUF } -#endif } #ifdef LM_GGML_USE_MPI @@ -9543,23 +9561,27 @@ const llama_model * llama_get_model(const struct llama_context * ctx) { return &ctx->model; } -int llama_n_ctx(const struct llama_context * ctx) { +uint32_t llama_n_ctx(const struct llama_context * ctx) { return ctx->cparams.n_ctx; } +uint32_t llama_n_batch(const struct llama_context * ctx) { + return ctx->cparams.n_batch; +} + enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { return model->vocab.type; } -int llama_n_vocab(const struct llama_model * model) { +int32_t llama_n_vocab(const struct llama_model * model) { return model->vocab.id_to_token.size(); } -int llama_n_ctx_train(const struct llama_model * model) { +int32_t llama_n_ctx_train(const struct llama_model * model) { return model->hparams.n_ctx_train; } -int llama_n_embd(const struct llama_model * model) { +int32_t llama_n_embd(const struct llama_model * model) { return model->hparams.n_embd; } @@ -9567,7 +9589,7 @@ float llama_rope_freq_scale_train(const struct llama_model * model) { return model->hparams.rope_freq_scale_train; } -int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) { +int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) { const auto & it = model->lm_gguf_kv.find(key); if (it == model->lm_gguf_kv.end()) { if (buf_size > 0) { @@ -9578,11 +9600,11 @@ int llama_model_meta_val_str(const struct llama_model * model, const char * key, return snprintf(buf, buf_size, "%s", it->second.c_str()); } -int llama_model_meta_count(const struct llama_model * model) { +int32_t llama_model_meta_count(const struct llama_model * model) { return (int)model->lm_gguf_kv.size(); } -int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) { +int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) { if (i < 0 || i >= (int)model->lm_gguf_kv.size()) { if (buf_size > 0) { buf[0] = '\0'; @@ -9594,7 +9616,7 @@ int llama_model_meta_key_by_index(const struct llama_model * model, int i, char return snprintf(buf, buf_size, "%s", it->first.c_str()); } -int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) { +int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) { if (i < 0 || i >= (int)model->lm_gguf_kv.size()) { if (buf_size > 0) { buf[0] = '\0'; @@ -9606,7 +9628,7 @@ int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, c return snprintf(buf, buf_size, "%s", it->second.c_str()); } -int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { +int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { return snprintf(buf, buf_size, "%s %s %s", llama_model_arch_name(model->arch).c_str(), llama_model_type_name(model->type), @@ -9630,10 +9652,17 @@ uint64_t llama_model_n_params(const struct llama_model * model) { } struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) { - return lm_ggml_get_tensor(model->ctx, name); + auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(), + [name](const std::pair & it) { + return it.first == name; + }); + if (it == model->tensors_by_name.end()) { + return nullptr; + } + return it->second; } -int llama_model_quantize( +uint32_t llama_model_quantize( const char * fname_inp, const char * fname_out, const llama_model_quantize_params * params) { @@ -9646,7 +9675,7 @@ int llama_model_quantize( } } -int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) { +int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { try { return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads); } catch (const std::exception & err) { @@ -9655,7 +9684,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor } } -int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) { +int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { try { return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); } catch (const std::exception & err) { @@ -9753,7 +9782,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k } } -int llama_get_kv_cache_token_count(const struct llama_context * ctx) { +int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) { int result = 0; for (uint32_t i = 0; i < ctx->kv_self.size; i++) { @@ -9763,7 +9792,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) { return result; } -int llama_get_kv_cache_used_cells(const struct llama_context * ctx) { +int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) { return ctx->kv_self.used; } @@ -9787,28 +9816,39 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { } void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + if (delta == 0) { + return; + } + llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta); } +void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + if (d == 1) { + return; + } + + llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d); +} + // Returns the *maximum* size of the state size_t llama_get_state_size(const struct llama_context * ctx) { // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. // for reference, std::mt19937(1337) serializes to 6701 bytes. const size_t s_rng_size = sizeof(size_t); const size_t s_rng = LLAMA_MAX_RNG_STATE; - const size_t s_logits_capacity = sizeof(size_t); const size_t s_logits_size = sizeof(size_t); + // assume worst case for logits although only currently set ones are serialized const size_t s_logits = ctx->logits.capacity() * sizeof(float); const size_t s_embedding_size = sizeof(size_t); const size_t s_embedding = ctx->embedding.size() * sizeof(float); const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); - const size_t s_kv = ctx->kv_self.buf.size; + const size_t s_kv = ctx->kv_self.total_size(); const size_t s_total = ( + s_rng_size + s_rng - + s_logits_capacity + s_logits_size + s_logits + s_embedding_size @@ -9877,37 +9917,27 @@ struct llama_data_file_context : llama_data_context { static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { // copy rng { - std::stringstream rng_ss; + std::ostringstream rng_ss; rng_ss << ctx->rng; - const size_t rng_size = rng_ss.str().size(); - char rng_buf[LLAMA_MAX_RNG_STATE]; + const std::string & rng_str = rng_ss.str(); + const size_t rng_size = rng_str.size(); - memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE); - memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size()); + LM_GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); - data_ctx->write(&rng_size, sizeof(rng_size)); - data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE); + data_ctx->write(&rng_size, sizeof(rng_size)); + data_ctx->write(rng_str.data(), rng_size); } // copy logits { - const size_t logits_cap = ctx->logits.capacity(); const size_t logits_size = ctx->logits.size(); - data_ctx->write(&logits_cap, sizeof(logits_cap)); data_ctx->write(&logits_size, sizeof(logits_size)); if (logits_size) { data_ctx->write(ctx->logits.data(), logits_size * sizeof(float)); } - - // If there is a gap between the size and the capacity, write padding - size_t padding_size = (logits_cap - logits_size) * sizeof(float); - if (padding_size > 0) { - std::vector padding(padding_size, 0); // Create a buffer filled with zeros - data_ctx->write(padding.data(), padding_size); - } } // copy embeddings @@ -9927,11 +9957,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto & hparams = ctx->model.hparams; const auto & cparams = ctx->cparams; - const auto n_layer = hparams.n_layer; - const auto n_embd = hparams.n_embd_gqa(); - const auto n_ctx = cparams.n_ctx; + const auto n_layer = hparams.n_layer; + const auto n_embd_k_gqa = hparams.n_embd_k_gqa(); + const auto n_embd_v_gqa = hparams.n_embd_v_gqa(); + const auto n_ctx = cparams.n_ctx; - const size_t kv_buf_size = kv_self.buf.size; + const size_t kv_buf_size = kv_self.total_size(); const uint32_t kv_head = kv_self.head; const uint32_t kv_size = kv_self.size; const uint32_t kv_used = kv_self.used; @@ -9944,42 +9975,18 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat if (kv_buf_size) { const size_t elt_size = lm_ggml_element_size(kv_self.k_l[0]); - lm_ggml_context * cpy_ctx = lm_ggml_init({ 6*n_layer*lm_ggml_tensor_overhead() + lm_ggml_graph_overhead(), NULL, /* no_alloc */ true }); - lm_ggml_cgraph * gf = lm_ggml_new_graph(cpy_ctx); - - std::vector> kout2d_data(n_layer); - std::vector> vout2d_data(n_layer); - + std::vector tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { - lm_ggml_tensor * kout2d = lm_ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); - kout2d_data[il].resize(lm_ggml_nbytes(kout2d)); - kout2d->data = kout2d_data[il].data(); - - lm_ggml_tensor * vout2d = lm_ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); - vout2d_data[il].resize(lm_ggml_nbytes(vout2d)); - vout2d->data = vout2d_data[il].data(); - - lm_ggml_tensor * k2d = lm_ggml_view_2d(cpy_ctx, kv_self.k_l[il], - n_embd, kv_head, - elt_size*n_embd, 0); - - lm_ggml_tensor * v2d = lm_ggml_view_2d(cpy_ctx, kv_self.v_l[il], - kv_head, n_embd, - elt_size*n_ctx, 0); - - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(cpy_ctx, k2d, kout2d)); - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(cpy_ctx, v2d, vout2d)); - } - - lm_ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); - - lm_ggml_free(cpy_ctx); - - // our data is now in the kout2d_data and vout2d_data buffers - // write them to file - for (uint32_t il = 0; il < n_layer; ++il) { - data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size()); - data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size()); + tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head); + lm_ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); + data_ctx->write(tmp_buf.data(), tmp_buf.size()); + + // v is not contiguous, copy row by row + tmp_buf.resize(elt_size*kv_head); + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { + lm_ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size()); + data_ctx->write(tmp_buf.data(), tmp_buf.size()); + } } } @@ -10013,13 +10020,13 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { // set rng { size_t rng_size; - char rng_buf[LLAMA_MAX_RNG_STATE]; + memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size); - memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size); - memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE; + LM_GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); - std::stringstream rng_ss; - rng_ss.str(std::string(&rng_buf[0], rng_size)); + std::string rng_str((char *)inp, rng_size); inp += rng_size; + + std::istringstream rng_ss(rng_str); rng_ss >> ctx->rng; LM_GGML_ASSERT(!rng_ss.fail()); @@ -10027,20 +10034,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { // set logits { - size_t logits_cap; size_t logits_size; - memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap); memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size); - LM_GGML_ASSERT(ctx->logits.capacity() == logits_cap); + LM_GGML_ASSERT(ctx->logits.capacity() >= logits_size); if (logits_size) { ctx->logits.resize(logits_size); + memcpy(ctx->logits.data(), inp, logits_size * sizeof(float)); + inp += logits_size * sizeof(float); } - - inp += logits_cap * sizeof(float); } // set embeddings @@ -10063,9 +10068,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { const auto & hparams = ctx->model.hparams; const auto & cparams = ctx->cparams; - const int n_layer = hparams.n_layer; - const int n_embd = hparams.n_embd_gqa(); - const int n_ctx = cparams.n_ctx; + const int n_layer = hparams.n_layer; + const int n_embd_k_gqa = hparams.n_embd_k_gqa(); + const int n_embd_v_gqa = hparams.n_embd_v_gqa(); + const int n_ctx = cparams.n_ctx; size_t kv_buf_size; uint32_t kv_head; @@ -10078,37 +10084,22 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); if (kv_buf_size) { - LM_GGML_ASSERT(kv_self.buf.size == kv_buf_size); + LM_GGML_ASSERT(kv_self.total_size() == kv_buf_size); const size_t elt_size = lm_ggml_element_size(kv_self.k_l[0]); - lm_ggml_context * cpy_ctx = lm_ggml_init({ 6*n_layer*lm_ggml_tensor_overhead() + lm_ggml_graph_overhead(), NULL, /* no_alloc */ true }); - lm_ggml_cgraph * gf = lm_ggml_new_graph(cpy_ctx); - - for (int il = 0; il < n_layer; ++il) { - lm_ggml_tensor * kin2d = lm_ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head); - kin2d->data = (void *) inp; - inp += lm_ggml_nbytes(kin2d); - - lm_ggml_tensor * vin2d = lm_ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd); - vin2d->data = (void *) inp; - inp += lm_ggml_nbytes(vin2d); - - lm_ggml_tensor * k2d = lm_ggml_view_2d(cpy_ctx, kv_self.k_l[il], - n_embd, kv_head, - elt_size*n_embd, 0); - - lm_ggml_tensor * v2d = lm_ggml_view_2d(cpy_ctx, kv_self.v_l[il], - kv_head, n_embd, - elt_size*n_ctx, 0); - - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(cpy_ctx, kin2d, k2d)); - lm_ggml_build_forward_expand(gf, lm_ggml_cpy(cpy_ctx, vin2d, v2d)); + for (int il = 0; il < (int) n_layer; ++il) { + size_t k_size = elt_size*n_embd_k_gqa*kv_head; + lm_ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); + inp += k_size; + + // v is not contiguous, copy row by row + size_t v_row_size = elt_size*kv_head; + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { + lm_ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size); + inp += v_row_size; + } } - - lm_ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); - - lm_ggml_free(cpy_ctx); } ctx->kv_self.head = kv_head; @@ -10229,7 +10220,7 @@ int llama_eval( struct llama_context * ctx, llama_token * tokens, int32_t n_tokens, - int n_past) { + int32_t n_past) { llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)); @@ -10244,7 +10235,7 @@ int llama_eval_embd( struct llama_context * ctx, float * embd, int32_t n_tokens, - int n_past) { + int32_t n_past) { llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1); llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, }; @@ -10315,7 +10306,7 @@ void llama_batch_free(struct llama_batch batch) { if (batch.logits) free(batch.logits); } -int llama_decode( +int32_t llama_decode( struct llama_context * ctx, struct llama_batch batch) { const int ret = llama_decode_internal(*ctx, batch); @@ -10363,11 +10354,11 @@ llama_token llama_token_nl(const struct llama_model * model) { return model->vocab.linefeed_id; } -int llama_add_bos_token(const struct llama_model * model) { +int32_t llama_add_bos_token(const struct llama_model * model) { return model->vocab.special_add_bos; } -int llama_add_eos_token(const struct llama_model * model) { +int32_t llama_add_eos_token(const struct llama_model * model) { return model->vocab.special_add_eos; } @@ -10387,12 +10378,12 @@ llama_token llama_token_eot(const struct llama_model * model) { return model->vocab.special_eot_id; } -int llama_tokenize( +int32_t llama_tokenize( const struct llama_model * model, const char * text, - int text_len, + int32_t text_len, llama_token * tokens, - int n_max_tokens, + int32_t n_max_tokens, bool add_bos, bool special) { auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special); @@ -10420,13 +10411,22 @@ static std::string llama_decode_text(const std::string & text) { } // does not write null-terminator to buf -int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) { +int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) { if (0 <= token && token < llama_n_vocab(model)) { switch (llama_vocab_get_type(model->vocab)) { case LLAMA_VOCAB_TYPE_SPM: { + // NOTE: we accept all unsupported token types, + // suppressing them like CONTROL tokens. if (llama_is_normal_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; llama_unescape_whitespace(result); + if (length < (int) result.length()) { + return -(int) result.length(); + } + memcpy(buf, result.c_str(), result.length()); + return result.length(); + } else if (llama_is_user_defined_token(model->vocab, token)) { + std::string result = model->vocab.id_to_token[token].text; if (length < (int) result.length()) { return -result.length(); } @@ -10446,17 +10446,22 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch } buf[0] = llama_token_to_byte(model->vocab, token); return 1; - } else { - // TODO: for now we accept all unsupported token types, - // suppressing them like CONTROL tokens. - // LM_GGML_ASSERT(false); } break; } case LLAMA_VOCAB_TYPE_BPE: { + // NOTE: we accept all unsupported token types, + // suppressing them like CONTROL tokens. if (llama_is_normal_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; result = llama_decode_text(result); + if (length < (int) result.length()) { + return -(int) result.length(); + } + memcpy(buf, result.c_str(), result.length()); + return result.length(); + } else if (llama_is_user_defined_token(model->vocab, token)) { + std::string result = model->vocab.id_to_token[token].text; if (length < (int) result.length()) { return -result.length(); } @@ -10464,10 +10469,6 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch return result.length(); } else if (llama_is_control_token(model->vocab, token)) { ; - } else { - // TODO: for now we accept all unsupported token types, - // suppressing them like CONTROL tokens. - // LM_GGML_ASSERT(false); } break; } @@ -10506,7 +10507,7 @@ void llama_print_timings(struct llama_context * ctx) { __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval); LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval); - LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); + LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval)); } void llama_reset_timings(struct llama_context * ctx) { @@ -10521,6 +10522,7 @@ const char * llama_print_system_info(void) { s = ""; s += "AVX = " + std::to_string(lm_ggml_cpu_has_avx()) + " | "; + s += "AVX_VNNI = " + std::to_string(lm_ggml_cpu_has_avx_vnni()) + " | "; s += "AVX2 = " + std::to_string(lm_ggml_cpu_has_avx2()) + " | "; s += "AVX512 = " + std::to_string(lm_ggml_cpu_has_avx512()) + " | "; s += "AVX512_VBMI = " + std::to_string(lm_ggml_cpu_has_avx512_vbmi()) + " | "; @@ -10578,7 +10580,7 @@ void llama_log_set(lm_ggml_log_callback log_callback, void * user_data) { g_state.log_callback = log_callback ? log_callback : llama_log_callback_default; g_state.log_callback_user_data = user_data; #ifdef LM_GGML_USE_METAL - lm_ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); + lm_ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data); #endif } diff --git a/cpp/llama.h b/cpp/llama.h index d574f5d0..7a9dd0f7 100644 --- a/cpp/llama.h +++ b/cpp/llama.h @@ -43,7 +43,7 @@ #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 3 +#define LLAMA_SESSION_VERSION 4 #if defined(LM_GGML_USE_CUBLAS) || defined(LM_GGML_USE_CLBLAST) || defined(LM_GGML_USE_METAL) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. @@ -103,6 +103,9 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; @@ -115,6 +118,12 @@ extern "C" { LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, }; + enum llama_split_mode { + LLAMA_SPLIT_NONE = 0, // single GPU + LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_ROW = 2, // split rows across GPUs + }; + typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -127,7 +136,7 @@ extern "C" { bool sorted; } llama_token_data_array; - typedef void (*llama_progress_callback)(float progress, void *ctx); + typedef bool (*llama_progress_callback)(float progress, void *ctx); // Input data for llama_decode // A llama_batch object can contain input about one or many sequences @@ -177,10 +186,20 @@ extern "C" { struct llama_model_params { int32_t n_gpu_layers; // number of layers to store in VRAM - int32_t main_gpu; // the GPU that is used for scratch and small tensors - const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) + enum llama_split_mode split_mode; // how to split the model across multiple GPUs + + // main_gpu interpretation depends on split_mode: + // LLAMA_SPLIT_NONE: the GPU that is used for the entire model + // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results + // LLAMA_SPLIT_LAYER: ignored + int32_t main_gpu; - // called with a progress value between 0 and 1, pass NULL to disable + // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES + const float * tensor_split; + + // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. + // If the provided progress_callback returns true, model loading continues. + // If it returns false, model loading is immediately aborted. llama_progress_callback progress_callback; // context pointer passed to the progress callback @@ -224,12 +243,13 @@ extern "C" { // model quantization parameters typedef struct llama_model_quantize_params { - int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() + int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() enum llama_ftype ftype; // quantize to this llama_ftype bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored bool pure; // disable k-quant mixtures and quantize all tensors to the same type + void * imatrix; // pointer to importance matrix data } llama_model_quantize_params; // grammar types @@ -308,19 +328,20 @@ extern "C" { LLAMA_API int64_t llama_time_us(void); - LLAMA_API int llama_max_devices (void); + LLAMA_API int32_t llama_max_devices(void); LLAMA_API bool llama_mmap_supported (void); LLAMA_API bool llama_mlock_supported(void); LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); - LLAMA_API int llama_n_ctx (const struct llama_context * ctx); + LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); + LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); - LLAMA_API int llama_n_vocab (const struct llama_model * model); - LLAMA_API int llama_n_ctx_train(const struct llama_model * model); - LLAMA_API int llama_n_embd (const struct llama_model * model); + LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); + LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); + LLAMA_API int32_t llama_n_embd (const struct llama_model * model); // Get the model's RoPE frequency scaling factor LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); @@ -331,19 +352,19 @@ extern "C" { // - GGUF array values are not supported by these functions // Get metadata value as a string by key name - LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size); + LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size); // Get the number of metadata key/value pairs - LLAMA_API int llama_model_meta_count(const struct llama_model * model); + LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model); // Get metadata key name by index - LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size); + LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size); // Get metadata value as a string by index - LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size); + LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size); // Get a string describing the model type - LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); + LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); // Returns the total size of all the tensors in the model in bytes LLAMA_API uint64_t llama_model_size(const struct llama_model * model); @@ -355,7 +376,7 @@ extern "C" { LLAMA_API struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name); // Returns 0 on success - LLAMA_API int llama_model_quantize( + LLAMA_API uint32_t llama_model_quantize( const char * fname_inp, const char * fname_out, const llama_model_quantize_params * params); @@ -366,20 +387,20 @@ extern "C" { // The model needs to be reloaded before applying a new adapter, otherwise the adapter // will be applied on top of the previous one // Returns 0 on success - LLAMA_API DEPRECATED(int llama_apply_lora_from_file( + LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file( struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, - int n_threads), + int32_t n_threads), "use llama_model_apply_lora_from_file instead"); - LLAMA_API int llama_model_apply_lora_from_file( + LLAMA_API int32_t llama_model_apply_lora_from_file( const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, - int n_threads); + int32_t n_threads); // // KV cache @@ -435,10 +456,10 @@ extern "C" { // Returns the number of tokens in the KV cache (slow, use only for debug) // If a KV cell has multiple sequences assigned to it, it will be counted multiple times - LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); + LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx); // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) - LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx); + LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx); // Clear the KV cache LLAMA_API void llama_kv_cache_clear( @@ -481,6 +502,17 @@ extern "C" { llama_pos p1, llama_pos delta); + // Integer division of the positions by factor of `d > 1` + // If the KV cache is RoPEd, the KV data is updated accordingly + // p0 < 0 : [0, p1] + // p1 < 0 : [p0, inf) + LLAMA_API void llama_kv_cache_seq_div( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + int d); + // // State / sessions // @@ -529,7 +561,7 @@ extern "C" { struct llama_context * ctx, llama_token * tokens, int32_t n_tokens, - int n_past), + int32_t n_past), "use llama_decode() instead"); // Same as llama_eval, but use float matrix input directly. @@ -538,7 +570,7 @@ extern "C" { struct llama_context * ctx, float * embd, int32_t n_tokens, - int n_past), + int32_t n_past), "use llama_decode() instead"); // Return batch for single sequence of tokens starting at pos_0 @@ -570,7 +602,7 @@ extern "C" { // 0 - success // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) // < 0 - error - LLAMA_API int llama_decode( + LLAMA_API int32_t llama_decode( struct llama_context * ctx, struct llama_batch batch); @@ -610,10 +642,10 @@ extern "C" { LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line // Returns -1 if unknown, 1 for true or 0 for false. - LLAMA_API int llama_add_bos_token(const struct llama_model * model); + LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model); // Returns -1 if unknown, 1 for true or 0 for false. - LLAMA_API int llama_add_eos_token(const struct llama_model * model); + LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model); // codellama infill tokens LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix @@ -631,12 +663,12 @@ extern "C" { /// @return Returns a negative number on failure - the number of tokens that would have been returned /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. /// Does not insert a leading space. - LLAMA_API int llama_tokenize( + LLAMA_API int32_t llama_tokenize( const struct llama_model * model, const char * text, - int text_len, + int32_t text_len, llama_token * tokens, - int n_max_tokens, + int32_t n_max_tokens, bool add_bos, bool special); @@ -644,11 +676,11 @@ extern "C" { // Uses the vocabulary in the provided context. // Does not write null terminator to the buffer. // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. - LLAMA_API int llama_token_to_piece( + LLAMA_API int32_t llama_token_to_piece( const struct llama_model * model, llama_token token, char * buf, - int length); + int32_t length); // // Grammar @@ -700,7 +732,7 @@ extern "C" { LLAMA_API void llama_sample_top_k( struct llama_context * ctx, llama_token_data_array * candidates, - int k, + int32_t k, size_t min_keep); /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 @@ -759,7 +791,7 @@ extern "C" { llama_token_data_array * candidates, float tau, float eta, - int m, + int32_t m, float * mu); /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. @@ -832,8 +864,8 @@ extern "C" { llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, - int n_past, - int n_predict); + int32_t n_past, + int32_t n_predict); // Performance information LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); diff --git a/cpp/sampling.cpp b/cpp/sampling.cpp index f4e76df3..8e45909f 100644 --- a/cpp/sampling.cpp +++ b/cpp/sampling.cpp @@ -149,11 +149,12 @@ static void sampler_queue( } } -llama_token llama_sampling_sample( +static llama_token llama_sampling_sample_impl( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, struct llama_context * ctx_cfg, - const int idx) { + const int idx, + bool is_resampling) { // Add a parameter to indicate if we are resampling const llama_sampling_params & params = ctx_sampling->params; const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); @@ -173,8 +174,17 @@ llama_token llama_sampling_sample( llama_token id = 0; + // Get a pointer to the logits float * logits = llama_get_logits_ith(ctx_main, idx); + // Declare original_logits at the beginning of the function scope + std::vector original_logits; + + if (!is_resampling) { + // Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this. + original_logits = std::vector(logits, logits + llama_n_vocab(llama_get_model(ctx_main))); + } + // apply params.logit_bias map for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { logits[it->first] += it->second; @@ -193,12 +203,14 @@ llama_token llama_sampling_sample( } // apply penalties - if (!prev.empty()) { + const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev; + const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n); + if (penalty_tokens_used_size) { const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))]; llama_sample_repetition_penalties(ctx_main, &cur_p, - prev.data() + prev.size() - penalty_last_n, - penalty_last_n, penalty_repeat, penalty_freq, penalty_present); + penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size, + penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present); if (!penalize_nl) { for (size_t idx = 0; idx < cur_p.size; idx++) { @@ -210,7 +222,8 @@ llama_token llama_sampling_sample( } } - if (ctx_sampling->grammar != NULL) { + // If we are in the resampling phase, apply grammar checks before sampling logic + if (is_resampling && ctx_sampling->grammar != NULL) { llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); } @@ -252,9 +265,40 @@ llama_token llama_sampling_sample( } } + if (ctx_sampling->grammar != NULL && !is_resampling) { + // Create an array with a single token data element for the sampled id + llama_token_data single_token_data = {id, logits[id], 0.0f}; + llama_token_data_array single_token_data_array = { &single_token_data, 1, false }; + + // Apply grammar constraints to the single token + llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar); + + // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY + bool is_valid = single_token_data_array.data[0].logit != -INFINITY; + + // If the token is not valid according to the grammar, perform resampling + if (!is_valid) { + LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str()); + + // Restore logits from the copy + std::copy(original_logits.begin(), original_logits.end(), logits); + + return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling + } + } + return id; } +llama_token llama_sampling_sample( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + const int idx) { + // Call the implementation function with is_resampling set to false by default + return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false); +} + void llama_sampling_accept( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, diff --git a/cpp/sampling.h b/cpp/sampling.h index fdfa9eed..f16ef97e 100644 --- a/cpp/sampling.h +++ b/cpp/sampling.h @@ -36,6 +36,9 @@ typedef struct llama_sampling_params { float cfg_scale = 1.f; // how strong is guidance std::unordered_map logit_bias; // logit bias for specific tokens + + std::vector penalty_prompt_tokens; + bool use_penalty_prompt_tokens = false; } llama_sampling_params; // general sampler context diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index 21fcdb6c..a6164b73 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -8,7 +8,7 @@ PODS: - hermes-engine/Pre-built (= 0.72.3) - hermes-engine/Pre-built (0.72.3) - libevent (2.1.12) - - llama-rn (0.3.0-rc.8): + - llama-rn (0.3.0-rc.9): - RCT-Folly - RCTRequired - RCTTypeSafety @@ -1261,7 +1261,7 @@ SPEC CHECKSUMS: glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322 libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913 - llama-rn: a344d3b9c89f4a0e5667c94ee01e6cdb2d1dd630 + llama-rn: 961b8721218c324430b4cc26b36269fb2cadce60 RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1 RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18 RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3 @@ -1307,4 +1307,4 @@ SPEC CHECKSUMS: PODFILE CHECKSUM: e85f4f0de2f3382c406a75702d5f126a993a8dde -COCOAPODS: 1.11.3 +COCOAPODS: 1.14.3 diff --git a/llama.cpp b/llama.cpp index a7aee47b..a836c8f5 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit a7aee47b98e45539d491071b25778b833b77e387 +Subproject commit a836c8f534ab789b02da149fbdaf7735500bff74 diff --git a/scripts/ggml-metal.m.patch b/scripts/ggml-metal.m.patch index caeaa8ac..9a991a64 100644 --- a/scripts/ggml-metal.m.patch +++ b/scripts/ggml-metal.m.patch @@ -1,6 +1,6 @@ ---- ggml-metal.m.orig 2023-12-19 07:48:34 -+++ ggml-metal.m 2023-12-19 07:48:35 -@@ -265,7 +265,7 @@ +--- ggml-metal.m.orig 2024-01-15 12:36:32 ++++ ggml-metal.m 2024-01-15 12:36:33 +@@ -293,7 +293,7 @@ if (ggmlMetalPathResources) { sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal.metal"]; } else { diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch index 36d3c5d9..04f1e095 100644 --- a/scripts/llama.cpp.patch +++ b/scripts/llama.cpp.patch @@ -1,9 +1,9 @@ ---- llama.cpp.orig 2023-12-19 07:48:34 -+++ llama.cpp 2023-12-19 07:48:35 -@@ -106,6 +106,17 @@ +--- llama.cpp.orig 2024-01-15 12:27:16 ++++ llama.cpp 2024-01-15 12:26:12 +@@ -107,6 +107,17 @@ #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__) - + +#if defined(__ANDROID__) && defined(RNLLAMA_ANDROID_ENABLE_LOGGING) +#include +#define LLAMA_ANDROID_TAG "RNLLAMA_LOG_ANDROID" @@ -18,12 +18,12 @@ // // helpers // -@@ -895,16 +906,16 @@ - +@@ -876,16 +887,16 @@ + if (prefetch > 0) { - // Advise the kernel to preload the mapped memory + // advise the kernel to preload the mapped memory - if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) { -- fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", +- LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n", + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", strerror(errno)); @@ -33,7 +33,7 @@ // advise the kernel not to use readahead // (because the next page might not belong on the same node) - if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) { -- fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", +- LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n", + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", strerror(errno));