feat: sync llama.cpp #95

Merged: 2 commits, Nov 21, 2024
1 change: 1 addition & 0 deletions android/src/main/CMakeLists.txt
@@ -18,6 +18,7 @@ set(
${RNLLAMA_LIB_DIR}/ggml-cpu.cpp
${RNLLAMA_LIB_DIR}/ggml-cpu-aarch64.c
${RNLLAMA_LIB_DIR}/ggml-cpu-quants.c
${RNLLAMA_LIB_DIR}/ggml-opt.cpp
${RNLLAMA_LIB_DIR}/ggml-threading.cpp
${RNLLAMA_LIB_DIR}/ggml-quants.c
${RNLLAMA_LIB_DIR}/log.cpp
6 changes: 6 additions & 0 deletions cpp/common.cpp
@@ -875,6 +875,12 @@ struct common_init_result common_init_from_params(common_params & params) {
return iparams;
}

if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)\n", __func__);
llama_free_model(model);
return iparams;
}

if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
14 changes: 6 additions & 8 deletions cpp/ggml-alloc.c
@@ -466,18 +466,12 @@ static bool lm_ggml_gallocr_is_own(lm_ggml_gallocr_t galloc, struct lm_ggml_tens
return lm_ggml_gallocr_hash_get(galloc, t)->allocated;
}

static void lm_ggml_gallocr_set_node_offset(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, int buffer_id, size_t offset) {
struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, node);
hn->buffer_id = buffer_id;
hn->offset = offset;
hn->allocated = true;
}

static bool lm_ggml_gallocr_is_allocated(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * t) {
return t->data != NULL || lm_ggml_gallocr_hash_get(galloc, t)->allocated;
}

static void lm_ggml_gallocr_allocate_node(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, int buffer_id) {
LM_GGML_ASSERT(buffer_id >= 0);
struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, node);

if (!lm_ggml_gallocr_is_allocated(galloc, node) && !lm_ggml_is_view(node)) {
@@ -816,7 +810,11 @@ static void lm_ggml_gallocr_init_tensor(lm_ggml_gallocr_t galloc, struct lm_ggml
}

static bool lm_ggml_gallocr_node_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = (node->data || node->view_src) ? 0 : lm_ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
size_t node_size = 0;
if (!node->data && !node->view_src) {
LM_GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
node_size = lm_ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
}

14 changes: 8 additions & 6 deletions cpp/ggml-backend.cpp
@@ -279,7 +279,7 @@ void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * dat
buf->iface.get_tensor(buf, tensor, data, offset, size);
}

LM_GGML_API void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
void lm_ggml_backend_tensor_memset(struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
lm_ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

if (size == 0) {
@@ -689,7 +689,7 @@ static int lm_ggml_backend_sched_backend_id(lm_ggml_backend_sched_t sched, lm_gg
}

static int lm_ggml_backend_sched_backend_from_buffer(lm_ggml_backend_sched_t sched, const struct lm_ggml_tensor * tensor, const struct lm_ggml_tensor * op) {
lm_ggml_backend_buffer_t buffer = tensor->buffer;
lm_ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
if (buffer == NULL) {
return -1;
}
@@ -722,8 +722,6 @@ static char causes[LM_GGML_DEFAULT_GRAPH_SIZE*16 + LM_GGML_SCHED_MAX_SPLITS_DEBU

// returns the backend that should be used for the node based on the current locations
static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sched, struct lm_ggml_tensor * tensor) {
// TODO: use supports_op to check if the backend supports the op

// assign pre-allocated nodes to their backend
int cur_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
if (cur_backend_id != -1) {
@@ -742,7 +740,7 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch

if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
// since the tensor is pre-allocated, it cannot be moved to another backend
LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
LM_GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
}

// graph input
@@ -886,6 +884,9 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
for (int i = 0; i < graph->n_nodes; i++) {
struct lm_ggml_tensor * node = graph->nodes[i];
int * node_backend_id = &tensor_backend_id(node);
if (lm_ggml_is_view_op(node->op)) {
continue;
}
// do not overwrite user assignments
if (*node_backend_id == -1) {
*node_backend_id = lm_ggml_backend_sched_backend_id_from_cur(sched, node);
Expand Down Expand Up @@ -1538,12 +1539,13 @@ bool lm_ggml_backend_sched_reserve(lm_ggml_backend_sched_t sched, struct lm_ggml

lm_ggml_backend_sched_split_graph(sched, measure_graph);

lm_ggml_backend_sched_synchronize(sched);

if (!lm_ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
return false;
}

lm_ggml_backend_sched_reset(sched);
lm_ggml_backend_sched_synchronize(sched);

return true;
}
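Note: the lm_ggml_backend_sched_backend_from_buffer change above adopts the buffer-resolution pattern already used by lm_ggml_backend_tensor_memset: a view tensor does not own memory of its own, so the buffer that actually backs it has to be looked up through view_src when that is set. A minimal sketch of the pattern (the helper name is illustrative only; the diff simply inlines the ternary):

// Illustrative helper, assumes "ggml-backend.h" is included.
static lm_ggml_backend_buffer_t owning_buffer(const struct lm_ggml_tensor * t) {
    // a view stores its data inside view_src, so that tensor's buffer is the owner
    return t->view_src ? t->view_src->buffer : t->buffer;
}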
26 changes: 17 additions & 9 deletions cpp/ggml-backend.h
@@ -86,7 +86,7 @@ extern "C" {
LM_GGML_API void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
LM_GGML_API void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);

// "offset" refers to the offset of the tensor data for setting/getting data
// "offset" refers to the offset in tensor->data for setting/getting data
LM_GGML_API void lm_ggml_backend_tensor_set( struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
LM_GGML_API void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
LM_GGML_API void lm_ggml_backend_tensor_memset( struct lm_ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
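The reworded comment above makes the units explicit: offset is a byte offset into tensor->data. A hedged example of a partial write, assuming t is an allocated 2D f32 tensor with at least 4 rows of 16 elements (t and the row index are illustrative, not part of the diff):

// nb[1] is the byte stride between rows, so row*t->nb[1] addresses that row's data.
float row_data[16] = {0};
const int64_t row = 3;
lm_ggml_backend_tensor_set(t, row_data, (size_t)(row*t->nb[1]), sizeof(row_data));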
@@ -242,14 +242,20 @@ extern "C" {
lm_ggml_backend_sched_reserve(sched, reserve_graph);

// compute
graph = build_graph(sched);
lm_ggml_backend_sched_graph_compute(sched, graph);
graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
for (int i = 0; i < 10; ++i) {
lm_ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
}

// if there are graph inputs:
lm_ggml_backend_sched_reset(sched);
lm_ggml_backend_sched_alloc_graph(sched, graph);
lm_ggml_backend_tensor_set(input_tensor, ...);
lm_ggml_backend_sched_graph_compute(sched, graph);
graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once lm_ggml_free is called)
lm_ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
lm_ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
lm_ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
lm_ggml_backend_sched_graph_compute(sched, graph); // execute the graph

// as an alternative to the above it is also possible to assign the inputs to a dedicated context and
// allocate them statically via lm_ggml_backend_alloc_ctx_tensors
}
*/

@@ -264,7 +270,7 @@ extern "C" {
//
typedef bool (*lm_ggml_backend_sched_eval_callback)(struct lm_ggml_tensor * t, bool ask, void * user_data);

// Initialize a backend scheduler
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
LM_GGML_API lm_ggml_backend_sched_t lm_ggml_backend_sched_new(lm_ggml_backend_t * backends, lm_ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
LM_GGML_API void lm_ggml_backend_sched_free(lm_ggml_backend_sched_t sched);

@@ -289,7 +295,9 @@ extern "C" {
LM_GGML_API enum lm_ggml_status lm_ggml_backend_sched_graph_compute_async(lm_ggml_backend_sched_t sched, struct lm_ggml_cgraph * graph);
LM_GGML_API void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched);

// Reset all assignments and allocators - must be called before changing the node backends
// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
// The correct way to use this API is to discard the deallocated tensors and create new ones.
LM_GGML_API void lm_ggml_backend_sched_reset(lm_ggml_backend_sched_t sched);

// Set a callback to be called for each resulting node during graph compute
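The expanded comments in this header pin down the scheduler lifecycle: lm_ggml_backend_sched_reset drops every allocation made for the previously scheduled graph, so tensors from that graph are left dangling and must be discarded rather than reused. A sketch of the per-iteration pattern the comments describe, where build_graph, input, data, n_bytes and n_steps stand in for application-side code (they are not part of the ggml API):

// Sketch only: assumes build_graph() creates a fresh graph (and fresh tensors)
// on every call, as required once the scheduler has been reset.
for (int step = 0; step < n_steps; ++step) {
    struct lm_ggml_cgraph * graph = build_graph(sched);   // new, not yet allocated
    lm_ggml_backend_sched_reset(sched);                   // discard the previous graph's allocation
    lm_ggml_backend_sched_alloc_graph(sched, graph);      // allocate the new graph, do not run it yet
    lm_ggml_backend_tensor_set(input, data, 0, n_bytes);  // write inputs only after allocation
    lm_ggml_backend_sched_graph_compute(sched, graph);    // execute
}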
43 changes: 19 additions & 24 deletions cpp/ggml-cpu.c
@@ -2369,7 +2369,7 @@ void lm_ggml_numa_init(enum lm_ggml_numa_strategy numa_flag) {
// figure out which node we're on
uint current_cpu;
int getcpu_ret = 0;
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 33) || defined(__COSMOPOLITAN__)
getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
#else
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
@@ -12216,11 +12216,16 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
const struct lm_ggml_compute_params * params,
struct lm_ggml_tensor * dst) {

const struct lm_ggml_tensor * src0 = dst->src[0];
const struct lm_ggml_tensor * src0_grad = dst->src[1];
const struct lm_ggml_tensor * src0_grad_m = dst->src[2];
const struct lm_ggml_tensor * src0_grad_v = dst->src[3];
const struct lm_ggml_tensor * src0 = dst->src[0];
const struct lm_ggml_tensor * src0_grad = dst->src[1];
const struct lm_ggml_tensor * src0_grad_m = dst->src[2];
const struct lm_ggml_tensor * src0_grad_v = dst->src[3];
const struct lm_ggml_tensor * adamw_params = dst->src[4];

LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad));
LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad_m));
LM_GGML_ASSERT(lm_ggml_are_same_shape(src0, src0_grad_v));
LM_GGML_ASSERT(lm_ggml_nelements(adamw_params) == 7);

const int ith = params->ith;
const int nth = params->nth;
@@ -12237,16 +12242,14 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);

/* const float gnorm = 1.0f; */
int64_t iter; memcpy(&iter, &dst->op_params[0], sizeof(int64_t));
const float alpha = lm_ggml_get_op_params_f32(dst, 2);
const float beta1 = lm_ggml_get_op_params_f32(dst, 3);
const float beta2 = lm_ggml_get_op_params_f32(dst, 4);
const float eps = lm_ggml_get_op_params_f32(dst, 5);
const float wd = lm_ggml_get_op_params_f32(dst, 6);

const float beta1h = alpha/(1.0f - powf(beta1, iter));
const float beta2h = 1.0f/(1.0f - powf(beta2, iter));
const float * adamw_params_ptr = lm_ggml_get_data_f32(adamw_params);
const float alpha = adamw_params_ptr[0];
const float beta1 = adamw_params_ptr[1];
const float beta2 = adamw_params_ptr[2];
const float eps = adamw_params_ptr[3];
const float wd = adamw_params_ptr[4];
const float beta1h = adamw_params_ptr[5];
const float beta2h = adamw_params_ptr[6];

for (int ir = ir0; ir < ir1; ++ir) {
const int64_t i03 = ir/(ne02*ne01);
@@ -12270,17 +12273,9 @@ static void lm_ggml_compute_forward_opt_step_adamw_f32(
// The weight decay is applied independently of the Adam momenta m and v.
// This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
// See: https://arxiv.org/pdf/1711.05101v3.pdf
w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh;
w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
}
}

lm_ggml_barrier(params->threadpool);
if (ith != 0) {
return;
}

iter++;
memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
}

static void lm_ggml_compute_forward_opt_step_adamw(
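The opt_step_adamw rework above replaces the op_params/iteration bookkeeping with a 7-element f32 parameter tensor and fixes the update so the learning rate scales the bias-corrected step. A standalone sketch of the per-element rule, mirroring the parameter layout read at indices 0-6 above; the only assumption beyond the diff is that beta1h and beta2h are the usual bias-correction factors 1/(1 - beta^t) prepared once per step by the host (see cpp/ggml-opt.cpp for the authoritative producer):

// Sketch, not the ggml kernel: one AdamW step over n weights w with gradients g
// and moment buffers m, v. p[] follows {alpha, beta1, beta2, eps, wd, beta1h, beta2h}.
#include <math.h>

static void adamw_step(float * w, float * m, float * v, const float * g,
                       const float p[7], int n) {
    const float alpha = p[0], beta1 = p[1], beta2 = p[2];
    const float eps = p[3], wd = p[4], beta1h = p[5], beta2h = p[6];
    for (int i = 0; i < n; ++i) {
        m[i] = beta1*m[i] + (1.0f - beta1)*g[i];       // first moment
        v[i] = beta2*v[i] + (1.0f - beta2)*g[i]*g[i];  // second moment
        const float mh = beta1h*m[i];                  // bias-corrected m
        const float vh = sqrtf(beta2h*v[i]) + eps;     // bias-corrected sqrt(v)
        // decoupled weight decay, then the alpha-scaled step
        // ("- alpha*mh/vh" matches the corrected update line in the diff)
        w[i] = w[i]*(1.0f - alpha*wd) - alpha*mh/vh;
    }
}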
19 changes: 10 additions & 9 deletions cpp/ggml-impl.h
@@ -196,7 +196,7 @@ void lm_ggml_hash_set_reset(struct lm_ggml_hash_set * hash_set);
static bool lm_ggml_hash_contains(const struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor * key);

// returns LM_GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
static size_t lm_ggml_hash_find(const struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor * key);
static size_t lm_ggml_hash_find(const struct lm_ggml_hash_set * hash_set, const struct lm_ggml_tensor * key);

// returns LM_GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
static size_t lm_ggml_hash_insert(struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor * key);
@@ -210,7 +210,7 @@ static inline size_t lm_ggml_hash(const struct lm_ggml_tensor * p) {
return (size_t)(uintptr_t)p >> 4;
}

static size_t lm_ggml_hash_find(const struct lm_ggml_hash_set * hash_set, struct lm_ggml_tensor * key) {
static size_t lm_ggml_hash_find(const struct lm_ggml_hash_set * hash_set, const struct lm_ggml_tensor * key) {
size_t h = lm_ggml_hash(key) % hash_set->size;

// linear probing
@@ -281,13 +281,14 @@ enum lm_ggml_cgraph_eval_order {
};

struct lm_ggml_cgraph {
int size;
int n_nodes;
int n_leafs;

struct lm_ggml_tensor ** nodes;
struct lm_ggml_tensor ** grads;
struct lm_ggml_tensor ** leafs;
int size; // maximum number of nodes/leafs/grads/grad_accs
int n_nodes; // number of nodes currently in use
int n_leafs; // number of leafs currently in use

struct lm_ggml_tensor ** nodes; // tensors with data that can change if the graph is evaluated
struct lm_ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes
struct lm_ggml_tensor ** grad_accs; // accumulators for node gradients
struct lm_ggml_tensor ** leafs; // tensors with constant data

struct lm_ggml_hash_set visited_hash_set;
