feat: sync llama.cpp (#95)
* feat: sync llama.cpp

* fix(ios): add missing ggml-metal-impl.h
jhen0409 authored Nov 21, 2024
1 parent 0d590a4 commit 276a90a
Showing 21 changed files with 2,600 additions and 1,768 deletions.
1 change: 1 addition & 0 deletions android/src/main/CMakeLists.txt
@@ -18,6 +18,7 @@ set(
${RNLLAMA_LIB_DIR}/ggml-cpu.cpp
${RNLLAMA_LIB_DIR}/ggml-cpu-aarch64.c
${RNLLAMA_LIB_DIR}/ggml-cpu-quants.c
${RNLLAMA_LIB_DIR}/ggml-opt.cpp
${RNLLAMA_LIB_DIR}/ggml-threading.cpp
${RNLLAMA_LIB_DIR}/ggml-quants.c
${RNLLAMA_LIB_DIR}/log.cpp
6 changes: 6 additions & 0 deletions cpp/common.cpp
@@ -875,6 +875,12 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}

if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
llama_free_model(model);
return iparams;
}

if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
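The added guard makes common_init_from_params() fail fast when context shifting is requested for a model whose KV cache cannot be shifted. As a hedged sketch (not part of the commit), a caller holding an already-created llama_context could instead degrade gracefully with the same probe; only llama_kv_cache_can_shift comes from the hunk above, the surrounding logic is illustrative:

```cpp
// Illustrative only: disable context shifting instead of failing, for models
// whose KV cache layout does not support shifting. `lctx` is an assumed,
// already-created llama_context; `params` is a common_params instance.
if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
    LOG_WRN("%s: model does not support KV cache shifting, disabling context shift\n", __func__);
    params.ctx_shift = false;
}
```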
14 changes: 6 additions & 8 deletions cpp/ggml-alloc.c
@@ -466,18 +466,12 @@ static bool lm_ggml_gallocr_is_own(lm_ggml_gallocr_t galloc, struct lm_ggml_tens
return lm_ggml_gallocr_hash_get(galloc, t)->allocated;
}

static void lm_ggml_gallocr_set_node_offset(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, int buffer_id, size_t offset) {
struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, node);
hn->buffer_id = buffer_id;
hn->offset = offset;
hn->allocated = true;
}

static bool lm_ggml_gallocr_is_allocated(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * t) {
return t->data != NULL || lm_ggml_gallocr_hash_get(galloc, t)->allocated;
}

static void lm_ggml_gallocr_allocate_node(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, int buffer_id) {
LM_GGML_ASSERT(buffer_id >= 0);
struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, node);

if (!lm_ggml_gallocr_is_allocated(galloc, node) && !lm_ggml_is_view(node)) {
@@ -816,7 +810,11 @@ static void lm_ggml_gallocr_init_tensor(lm_ggml_gallocr_t galloc, struct lm_ggml
}

static bool lm_ggml_gallocr_node_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = (node->data || node->view_src) ? 0 : lm_ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
size_t node_size = 0;
if (!node->data && !node->view_src) {
LM_GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
node_size = lm_ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
}

14 changes: 8 additions & 6 deletions cpp/ggml-backend.cpp
@@ -279,7 +279,7 @@ void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * dat
buf->iface.get_tensor(buf, tensor, data, offset, size);
}

LM_GGML_API void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

if (size == 0) {
@@ -689,7 +689,7 @@ static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_gg
}

static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
lm_ggml_backend_buffer_t buffer = tensor->buffer;
lm_ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
if (buffer == NULL) {
return -1;
}
@@ -722,8 +722,6 @@ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBU

// returns the backend that should be used for the node based on the current locations
static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
// TODO: use supports_op to check if the backend supports the op

// assign pre-allocated nodes to their backend
int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
if (cur_backend_id != -1) {
@@ -742,7 +740,7 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch

if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
// since the tensor is pre-allocated, it cannot be moved to another backend
LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
LM_GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
}

// graph input
@@ -886,6 +884,9 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
for (int i = 0; i < graph->n_nodes; i++) {
struct lm_ggml_tensor * node = graph->nodes[i];
int * node_backend_id = &tensor_backend_id(node);
if (lm_ggml_is_view_op(node->op)) {
continue;
}
// do not overwrite user assignments
if (*node_backend_id == -1) {
*node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
@@ -1538,12 +1539,13 @@ bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml

lm_ggml_backend_sched_split_graph(sched, measure_graph);

lm_ggml_backend_sched_synchronize(sched);

if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
return false;
}

lm_ggml_backend_sched_reset(sched);
lm_ggml_backend_sched_synchronize(sched);

return true;
}
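Two hunks in this file apply the same resolution rule: a view tensor lives wherever its view_src was allocated. Expressed as a tiny helper for clarity (hypothetical; the diff inlines the expression rather than defining a function):

```cpp
// Hypothetical helper, equivalent to the expression inlined in
// lm_ggml_backend_tensor_memset and lm_ggml_backend_sched_backend_from_buffer:
// a view owns no storage of its own, so its effective buffer is its view_src's buffer.
static lm_ggml_backend_buffer_t tensor_effective_buffer(const struct lm_ggml_tensor * t) {
    return t->view_src ? t->view_src->buffer : t->buffer;
}
```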
26 changes: 17 additions & 9 deletions cpp/ggml-backend.h
@@ -86,7 +86,7 @@ extern "C" {
LM_GGML_API void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
LM_GGML_API void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);

// "offset" refers to the offset of the tensor data for setting/getting data
// "offset" refers to the offset in tensor->data for setting/getting data
LM_GGML_API void lm_ggml_backend_tensor_set( struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
LM_GGML_API void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
LM_GGML_API void lm_ggml_backend_tensor_memset( struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
@@ -242,14 +242,20 @@ extern "C" {
lm_ggml_backend_sched_reserve(sched, reserve_graph);
// compute
graph = build_graph(sched);
lm_ggml_backend_sched_graph_compute(sched, graph);
graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
for (int i = 0; i < 10; ++i) {
lm_ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
}
// if there are graph inputs:
lm_ggml_backend_sched_reset(sched);
lm_ggml_backend_sched_alloc_graph(sched, graph);
lm_ggml_backend_tensor_set(input_tensor, ...);
lm_ggml_backend_sched_graph_compute(sched, graph);
graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once lm_ggml_free is called)
lm_ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
lm_ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
lm_ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
lm_ggml_backend_sched_graph_compute(sched, graph); // execute the graph
// as an alternative to the above it is also possible to assign the inputs to a dedicated context and
// allocate them statically via lm_ggml_backend_alloc_ctx_tensors
}
*/

@@ -264,7 +270,7 @@ extern "C" {
//
typedef bool (*lm_ggml_backend_sched_eval_callback)(struct lm_ggml_tensor * t, bool ask, void * user_data);

// Initialize a backend scheduler
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, lm_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
LM_GGML_API void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched);

@@ -289,7 +295,9 @@ extern "C" {
LM_GGML_API enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph);
LM_GGML_API void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched);

// Reset all assignments and allocators - must be called before changing the node backends
// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
// The correct way to use this API is to discard the deallocated tensors and create new ones.
LM_GGML_API void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched);

// Set a callback to be called for each resulting node during graph compute
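The last two lines of the updated usage comment mention an alternative for graph inputs: keep them in a dedicated context and allocate them once on the backend, so they survive lm_ggml_backend_sched_reset(). A minimal sketch under that assumption (the variable names, tensor shape, and metadata budget are made up; the calls follow the usual ggml allocation API, with `backend` and `host_data` assumed to exist):

```cpp
// Sketch only: inputs allocated statically, outside the scheduler's control.
struct lm_ggml_init_params ip = {
    /*.mem_size   =*/ lm_ggml_tensor_overhead() * 8, // metadata only, no tensor data
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ true,                          // data goes into the backend buffer below
};
struct lm_ggml_context * input_ctx = lm_ggml_init(ip);
struct lm_ggml_tensor  * inp = lm_ggml_new_tensor_1d(input_ctx, LM_GGML_TYPE_F32, 1024);
lm_ggml_backend_buffer_t input_buf = lm_ggml_backend_alloc_ctx_tensors(input_ctx, backend);

lm_ggml_backend_tensor_set(inp, host_data, 0, lm_ggml_nbytes(inp)); // upload once
// Graphs that reference `inp` can now be reset and reallocated by the scheduler
// without invalidating the input data.
```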
43 changes: 19 additions & 24 deletions cpp/ggml-cpu.c
@@ -2369,7 +2369,7 @@ void lm_ggml_numa_init(enum lm_ggml_numa_strategy numa_flag) {
// figure out which node we're on
uint current_cpu;
int getcpu_ret = 0;
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 33) || defined(__COSMOPOLITAN__)
getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
#else
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
@@ -12216,11 +12216,16 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
const struct lm_ggml_compute_params * params,
struct lm_ggml_tensor * dst) {

const struct lm_ggml_tensor * src0 = dst->src[0];
const struct lm_ggml_tensor * src0_grad = dst->src[1];
const struct lm_ggml_tensor * src0_grad_m = dst->src[2];
const struct lm_ggml_tensor * src0_grad_v = dst->src[3];
const struct lm_ggml_tensor * src0 = dst->src[0];
const struct lm_ggml_tensor * src0_grad = dst->src[1];
const struct lm_ggml_tensor * src0_grad_m = dst->src[2];
const struct lm_ggml_tensor * src0_grad_v = dst->src[3];
const struct lm_ggml_tensor * adamw_params = dst->src[4];

LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad));
LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad_m));
LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad_v));
LM_GGML_ASSERT(lm_ggml_nelements(adamw_params) == 7);

const int ith = params->ith;
const int nth = params->nth;
@@ -12237,16 +12242,14 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);

/* const float gnorm = 1.0f; */
int64_t iter; memcpy(&iter, &dst->op_params[0], sizeof(int64_t));
const float alpha = lm_ggml_get_op_params_f32(dst, 2);
const float beta1 = lm_ggml_get_op_params_f32(dst, 3);
const float beta2 = lm_ggml_get_op_params_f32(dst, 4);
const float eps = lm_ggml_get_op_params_f32(dst, 5);
const float wd = lm_ggml_get_op_params_f32(dst, 6);

const float beta1h = alpha/(1.0f - powf(beta1, iter));
const float beta2h = 1.0f/(1.0f - powf(beta2, iter));
const float * adamw_params_ptr = lm_ggml_get_data_f32(adamw_params);
const float alpha = adamw_params_ptr[0];
const float beta1 = adamw_params_ptr[1];
const float beta2 = adamw_params_ptr[2];
const float eps = adamw_params_ptr[3];
const float wd = adamw_params_ptr[4];
const float beta1h = adamw_params_ptr[5];
const float beta2h = adamw_params_ptr[6];

for (int ir = ir0; ir < ir1; ++ir) {
const int64_t i03 = ir/(ne02*ne01);
@@ -12270,17 +12273,9 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
// The weight decay is applied independently of the Adam momenta m and v.
// This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
// See: https://arxiv.org/pdf/1711.05101v3.pdf
w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh;
w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
}
}

lm_ggml_barrier(params->threadpool);
if (ith != 0) {
return;
}

iter++;
memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
}

static void lm_ggml_compute_forward_opt_step_adamw(
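For reference, this kernel implements the decoupled-weight-decay update from the paper cited in the comment above (Loshchilov & Hutter), with wd playing the role of the decay coefficient λ and mh, vh the bias-corrected first and second moments; the fix in the last hunk restores the learning-rate factor α on the momentum term:

$$
\begin{aligned}
m_t &= \beta_1 m_{t-1} + (1-\beta_1)\,g_t, & \hat m_t &= \frac{m_t}{1-\beta_1^{\,t}},\\
v_t &= \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2, & \hat v_t &= \frac{v_t}{1-\beta_2^{\,t}},\\
w_t &= w_{t-1}\,(1-\alpha\lambda) \;-\; \alpha\,\frac{\hat m_t}{\sqrt{\hat v_t}+\epsilon}.
\end{aligned}
$$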
19 changes: 10 additions & 9 deletions cpp/ggml-impl.h
@@ -196,7 +196,7 @@ void lm_ggml_hash_set_reset(struct lm_ggml_hash_set * hash_set);
static bool lm_ggml_hash_contains(const struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor * key);

// returns LM_GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
static size_t lm_ggml_hash_find(const struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor * key);
static size_t lm_ggml_hash_find(const struct lm_ggml_hash_set * hash_set, const struct lm_ggml_tensor * key);

// returns LM_GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
static size_t lm_ggml_hash_insert(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor * key);
@@ -210,7 +210,7 @@ static inline size_t lm_ggml_hash(const struct lm_ggml_tensor * p) {
return (size_t)(uintptr_t)p >> 4;
}

static size_t lm_ggml_hash_find(const struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor * key) {
static size_t lm_ggml_hash_find(const struct lm_ggml_hash_set * hash_set, const struct lm_ggml_tensor * key) {
size_t h = lm_ggml_hash(key) % hash_set->size;

// linear probing
@@ -281,13 +281,14 @@ enum lm_ggml_cgraph_eval_order {
};

struct lm_ggml_cgraph {
int size;
int n_nodes;
int n_leafs;

struct lm_ggml_tensor ** nodes;
struct lm_ggml_tensor ** grads;
struct lm_ggml_tensor ** leafs;
int size; // maximum number of nodes/leafs/grads/grad_accs
int n_nodes; // number of nodes currently in use
int n_leafs; // number of leafs currently in use

struct lm_ggml_tensor ** nodes; // tensors with data that can change if the graph is evaluated
struct lm_ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes
struct lm_ggml_tensor ** grad_accs; // accumulators for node gradients
struct lm_ggml_tensor ** leafs; // tensors with constant data

struct lm_ggml_hash_set visited_hash_set;

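A hypothetical debugging helper, shown only to illustrate the field comments added above: the arrays have capacity `size`, but only the first n_nodes / n_leafs entries are in use. It relies on the internal ggml-impl.h definition of lm_ggml_cgraph and the public lm_ggml_op_name helper; none of it is part of the commit.

```cpp
#include <stdio.h>
// Hypothetical: dump the active portion of a graph.
static void lm_ggml_graph_dump(const struct lm_ggml_cgraph * g) {
    printf("%d/%d nodes in use, %d leafs\n", g->n_nodes, g->size, g->n_leafs);
    for (int i = 0; i < g->n_nodes; ++i) {   // tensors whose data changes when the graph is evaluated
        printf("  node %3d: %-16s %s\n", i, lm_ggml_op_name(g->nodes[i]->op), g->nodes[i]->name);
    }
    for (int i = 0; i < g->n_leafs; ++i) {   // tensors with constant data (weights, constants)
        printf("  leaf %3d: %s\n", i, g->leafs[i]->name);
    }
}
```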
(diffs for the remaining changed files are not shown here)
