feat: sync llama.cpp #95

Merged: 2 commits, Nov 21, 2024
1 change: 1 addition & 0 deletions android/src/main/CMakeLists.txt
@@ -18,6 +18,7 @@ set(
${RNLLAMA_LIB_DIR}/ggml-cpu.cpp
${RNLLAMA_LIB_DIR}/ggml-cpu-aarch64.c
${RNLLAMA_LIB_DIR}/ggml-cpu-quants.c
${RNLLAMA_LIB_DIR}/ggml-opt.cpp
${RNLLAMA_LIB_DIR}/ggml-threading.cpp
${RNLLAMA_LIB_DIR}/ggml-quants.c
${RNLLAMA_LIB_DIR}/log.cpp
6 changes: 6 additions & 0 deletions cpp/common.cpp
@@ -875,6 +875,12 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}

if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)\n", __func__);
llama_free_model(model);
return iparams;
}

if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
14 changes: 6 additions & 8 deletions cpp/ggml-alloc.c
@@ -466,18 +466,12 @@ static bool lm_ggml_gallocr_is_own(lm_ggml_gallocr_t galloc, struct lm_ggml_tens
return lm_ggml_gallocr_hash_get(galloc, t)->allocated;
}

static void lm_ggml_gallocr_set_node_offset(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, int buffer_id, size_t offset) {
struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, node);
hn->buffer_id = buffer_id;
hn->offset = offset;
hn->allocated = true;
}

static bool lm_ggml_gallocr_is_allocated(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * t) {
return t->data != NULL || lm_ggml_gallocr_hash_get(galloc, t)->allocated;
}

static void lm_ggml_gallocr_allocate_node(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, int buffer_id) {
LM_GGML_ASSERT(buffer_id >= 0);
struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, node);

if (!lm_ggml_gallocr_is_allocated(galloc, node) && !lm_ggml_is_view(node)) {
@@ -816,7 +810,11 @@ static void lm_ggml_gallocr_init_tensor(lm_ggml_gallocr_t galloc, struct lm_ggml
}

static bool lm_ggml_gallocr_node_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = (node->data || node->view_src) ? 0 : lm_ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
size_t node_size = 0;
if (!node->data && !node->view_src) {
LM_GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
node_size = lm_ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
}

14 changes: 8 additions & 6 deletions cpp/ggml-backend.cpp
@@ -279,7 +279,7 @@ void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * dat
buf->iface.get_tensor(buf, tensor, data, offset, size);
}

LM_GGML_API void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

if (size == 0) {
@@ -689,7 +689,7 @@ static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_gg
}

static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
lm_ggml_backend_buffer_t buffer = tensor->buffer;
lm_ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
if (buffer == NULL) {
return -1;
}
@@ -722,8 +722,6 @@ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBU

// returns the backend that should be used for the node based on the current locations
static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
// TODO: use supports_op to check if the backend supports the op

// assign pre-allocated nodes to their backend
int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
if (cur_backend_id != -1) {
@@ -742,7 +740,7 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch

if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
// since the tensor is pre-allocated, it cannot be moved to another backend
LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
LM_GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
}

// graph input
@@ -886,6 +884,9 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
for (int i = 0; i < graph->n_nodes; i++) {
struct lm_ggml_tensor * node = graph->nodes[i];
int * node_backend_id = &tensor_backend_id(node);
if (lm_ggml_is_view_op(node->op)) {
continue;
}
// do not overwrite user assignments
if (*node_backend_id == -1) {
*node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
Expand Down Expand Up @@ -1538,12 +1539,13 @@ bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml

lm_ggml_backend_sched_split_graph(sched, measure_graph);

lm_ggml_backend_sched_synchronize(sched);

if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
return false;
}

lm_ggml_backend_sched_reset(sched);
lm_ggml_backend_sched_synchronize(sched);

return true;
}
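Note: the lm_ggml_backend_sched_backend_from_buffer change above adopts the buffer-resolution pattern already used by lm_ggml_backend_tensor_memset: a view tensor does not own memory of its own, so the buffer that actually backs it has to be looked up through view_src when that is set. A minimal sketch of the pattern (the helper name is illustrative only; the diff simply inlines the ternary):

// Illustrative helper, assumes "ggml-backend.h" is included.
static lm_ggml_backend_buffer_t owning_buffer(const struct lm_ggml_tensor * t) {
    // a view stores its data inside view_src, so that tensor's buffer is the owner
    return t->view_src ? t->view_src->buffer : t->buffer;
}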
26 changes: 17 additions & 9 deletions cpp/ggml-backend.h
@@ -86,7 +86,7 @@ extern "C" {
LM_GGML_API void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
LM_GGML_API void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);

// "offset" refers to the offset of the tensor data for setting/getting data
// "offset" refers to the offset in tensor->data for setting/getting data
LM_GGML_API void lm_ggml_backend_tensor_set( struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
LM_GGML_API void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
LM_GGML_API void lm_ggml_backend_tensor_memset( struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
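The reworded comment above makes the units explicit: offset is a byte offset into tensor->data. A hedged example of a partial write, assuming t is an allocated 2D f32 tensor with at least 4 rows of 16 elements (t and the row index are illustrative, not part of the diff):

// nb[1] is the byte stride between rows, so row*t->nb[1] addresses that row's data.
float row_data[16] = {0};
const int64_t row = 3;
lm_ggml_backend_tensor_set(t, row_data, (size_t)(row*t->nb[1]), sizeof(row_data));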
@@ -242,14 +242,20 @@ extern "C" {
lm_ggml_backend_sched_reserve(sched, reserve_graph);

// compute
graph = build_graph(sched);
lm_ggml_backend_sched_graph_compute(sched, graph);
graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
for (int i = 0; i < 10; ++i) {
lm_ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
}

// if there are graph inputs:
lm_ggml_backend_sched_reset(sched);
lm_ggml_backend_sched_alloc_graph(sched, graph);
lm_ggml_backend_tensor_set(input_tensor, ...);
lm_ggml_backend_sched_graph_compute(sched, graph);
graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once lm_ggml_free is called)
lm_ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
lm_ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
lm_ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
lm_ggml_backend_sched_graph_compute(sched, graph); // execute the graph

// as an alternative to the above it is also possible to assign the inputs to a dedicated context and
// allocate them statically via lm_ggml_backend_alloc_ctx_tensors
}
*/

@@ -264,7 +270,7 @@ extern "C" {
//
typedef bool (*lm_ggml_backend_sched_eval_callback)(struct lm_ggml_tensor * t, bool ask, void * user_data);

// Initialize a backend scheduler
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, lm_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
LM_GGML_API void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched);

@@ -289,7 +295,9 @@ extern "C" {
LM_GGML_API enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph);
LM_GGML_API void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched);

// Reset all assignments and allocators - must be called before changing the node backends
// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
// The correct way to use this API is to discard the deallocated tensors and create new ones.
LM_GGML_API void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched);

// Set a callback to be called for each resulting node during graph compute
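The expanded comments in this header pin down the scheduler lifecycle: lm_ggml_backend_sched_reset drops every allocation made for the previously scheduled graph, so tensors from that graph are left dangling and must be discarded rather than reused. A sketch of the per-iteration pattern the comments describe, where build_graph, input, data, n_bytes and n_steps stand in for application-side code (they are not part of the ggml API):

// Sketch only: assumes build_graph() creates a fresh graph (and fresh tensors)
// on every call, as required once the scheduler has been reset.
for (int step = 0; step < n_steps; ++step) {
    struct lm_ggml_cgraph * graph = build_graph(sched);   // new, not yet allocated
    lm_ggml_backend_sched_reset(sched);                   // discard the previous graph's allocation
    lm_ggml_backend_sched_alloc_graph(sched, graph);      // allocate the new graph, do not run it yet
    lm_ggml_backend_tensor_set(input, data, 0, n_bytes);  // write inputs only after allocation
    lm_ggml_backend_sched_graph_compute(sched, graph);    // execute
}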
43 changes: 19 additions & 24 deletions cpp/ggml-cpu.c
@@ -2369,7 +2369,7 @@ void lm_ggml_numa_init(enum lm_ggml_numa_strategy numa_flag) {
// figure out which node we're on
uint current_cpu;
int getcpu_ret = 0;
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 33) || defined(__COSMOPOLITAN__)
getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
#else
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
@@ -12216,11 +12216,16 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
const struct lm_ggml_compute_params * params,
struct lm_ggml_tensor * dst) {

const struct lm_ggml_tensor * src0 = dst->src[0];
const struct lm_ggml_tensor * src0_grad = dst->src[1];
const struct lm_ggml_tensor * src0_grad_m = dst->src[2];
const struct lm_ggml_tensor * src0_grad_v = dst->src[3];
const struct lm_ggml_tensor * src0 = dst->src[0];
const struct lm_ggml_tensor * src0_grad = dst->src[1];
const struct lm_ggml_tensor * src0_grad_m = dst->src[2];
const struct lm_ggml_tensor * src0_grad_v = dst->src[3];
const struct lm_ggml_tensor * adamw_params = dst->src[4];

LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad));
LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad_m));
LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad_v));
LM_GGML_ASSERT(lm_ggml_nelements(adamw_params) == 7);

const int ith = params->ith;
const int nth = params->nth;
@@ -12237,16 +12242,14 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);

/* const float gnorm = 1.0f; */
int64_t iter; memcpy(&iter, &dst->op_params[0], sizeof(int64_t));
const float alpha = lm_ggml_get_op_params_f32(dst, 2);
const float beta1 = lm_ggml_get_op_params_f32(dst, 3);
const float beta2 = lm_ggml_get_op_params_f32(dst, 4);
const float eps = lm_ggml_get_op_params_f32(dst, 5);
const float wd = lm_ggml_get_op_params_f32(dst, 6);

const float beta1h = alpha/(1.0f - powf(beta1, iter));
const float beta2h = 1.0f/(1.0f - powf(beta2, iter));
const float * adamw_params_ptr = lm_ggml_get_data_f32(adamw_params);
const float alpha = adamw_params_ptr[0];
const float beta1 = adamw_params_ptr[1];
const float beta2 = adamw_params_ptr[2];
const float eps = adamw_params_ptr[3];
const float wd = adamw_params_ptr[4];
const float beta1h = adamw_params_ptr[5];
const float beta2h = adamw_params_ptr[6];

for (int ir = ir0; ir < ir1; ++ir) {
const int64_t i03 = ir/(ne02*ne01);
@@ -12270,17 +12273,9 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
// The weight decay is applied independently of the Adam momenta m and v.
// This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
// See: https://arxiv.org/pdf/1711.05101v3.pdf
w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh;
w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
}
}

lm_ggml_barrier(params->threadpool);
if (ith != 0) {
return;
}

iter++;
memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
}

static void lm_ggml_compute_forward_opt_step_adamw(
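The opt_step_adamw rework above replaces the op_params/iteration bookkeeping with a 7-element f32 parameter tensor and fixes the update so the learning rate scales the bias-corrected step. A standalone sketch of the per-element rule, mirroring the parameter layout read at indices 0-6 above; the only assumption beyond the diff is that beta1h and beta2h are the usual bias-correction factors 1/(1 - beta^t) prepared once per step by the host (see cpp/ggml-opt.cpp for the authoritative producer):

// Sketch, not the ggml kernel: one AdamW step over n weights w with gradients g
// and moment buffers m, v. p[] follows {alpha, beta1, beta2, eps, wd, beta1h, beta2h}.
#include <math.h>

static void adamw_step(float * w, float * m, float * v, const float * g,
                       const float p[7], int n) {
    const float alpha = p[0], beta1 = p[1], beta2 = p[2];
    const float eps = p[3], wd = p[4], beta1h = p[5], beta2h = p[6];
    for (int i = 0; i < n; ++i) {
        m[i] = beta1*m[i] + (1.0f - beta1)*g[i];       // first moment
        v[i] = beta2*v[i] + (1.0f - beta2)*g[i]*g[i];  // second moment
        const float mh = beta1h*m[i];                  // bias-corrected m
        const float vh = sqrtf(beta2h*v[i]) + eps;     // bias-corrected sqrt(v)
        // decoupled weight decay, then the alpha-scaled step
        // ("- alpha*mh/vh" matches the corrected update line in the diff)
        w[i] = w[i]*(1.0f - alpha*wd) - alpha*mh/vh;
    }
}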
19 changes: 10 additions & 9 deletions cpp/ggml-impl.h
@@ -196,7 +196,7 @@ void lm_ggml_hash_set_reset(struct lm_ggml_hash_set * hash_set);
static bool lm_ggml_hash_contains(const struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor * key);

// returns LM_GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
static size_t lm_ggml_hash_find(const struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor * key);
static size_t lm_ggml_hash_find(const struct lm_ggml_hash_set * hash_set, const struct lm_ggml_tensor * key);

// returns LM_GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
static size_t lm_ggml_hash_insert(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor * key);
@@ -210,7 +210,7 @@ static inline size_t lm_ggml_hash(const struct lm_ggml_tensor * p) {
return (size_t)(uintptr_t)p >> 4;
}

static size_t lm_ggml_hash_find(const struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor * key) {
static size_t lm_ggml_hash_find(const struct lm_ggml_hash_set * hash_set, const struct lm_ggml_tensor * key) {
size_t h = lm_ggml_hash(key) % hash_set->size;

// linear probing
@@ -281,13 +281,14 @@ enum lm_ggml_cgraph_eval_order {
};

struct lm_ggml_cgraph {
int size;
int n_nodes;
int n_leafs;

struct lm_ggml_tensor ** nodes;
struct lm_ggml_tensor ** grads;
struct lm_ggml_tensor ** leafs;
int size; // maximum number of nodes/leafs/grads/grad_accs
int n_nodes; // number of nodes currently in use
int n_leafs; // number of leafs currently in use

struct lm_ggml_tensor ** nodes; // tensors with data that can change if the graph is evaluated
struct lm_ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes
struct lm_ggml_tensor ** grad_accs; // accumulators for node gradients
struct lm_ggml_tensor ** leafs; // tensors with constant data

struct lm_ggml_hash_set visited_hash_set;
