diff --git a/include/llm.hpp b/include/llm.hpp
index 49f8f144..9ef2491e 100644
--- a/include/llm.hpp
+++ b/include/llm.hpp
@@ -80,14 +80,24 @@ class Llm {
     void warmup();
     std::string response(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr);
     std::string response_nohistory(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr);
+    void generate_init();
+    std::string generate(const std::vector<int>& input_ids, std::ostream* os, const char* end_with);
+    std::vector<int> generate(const std::vector<int>& input_ids);
+    int forward(const std::vector<int>& input_ids);
     float load_progress() { return load_progress_; }
     void reset();
     void print_speed();
     friend class Pipeline;
 public:
     std::vector<int> history_;
+    std::string model_name_ = "";
+    // config
+    int max_new_tokens_ = 1024;
+    int backend_type_ = 0;
+    int thread_num_ = 4;
+    bool low_precision_ = true;
+    bool chatml_ = true;
     // forward info
-    int max_seq_len_ = 1024;
     int prompt_len_ = 0;
     int gen_seq_len_ = 0;
     int all_seq_len_ = 0;
@@ -95,11 +105,8 @@ class Llm {
     int64_t prefill_us_ = 0;
     int64_t decode_us_ = 0;
 protected:
-    void response_init();
-    std::string response_impl(const std::vector<int>& input_ids, std::ostream* os, const char* end_with);
     VARP embedding(const std::vector<int>& input_ids);
     VARP txt_embedding(const std::vector<int>& input_ids);
-    int forward(const std::vector<int>& input_ids);
     std::vector<int> tokenizer_encode(const std::string& input_str);
     std::string decode(int id);
 protected:
@@ -111,7 +118,6 @@ class Llm {
     int layer_nums_ = 0;
     int hidden_size_ = 4096;
     std::vector<int> key_value_shape_ = {};
-    std::string model_name_ = "";
     std::string disk_embedding_file_ = "";
     // gen info
     float load_progress_ = 0.f;
diff --git a/python/mnnllm.cpp b/python/mnnllm.cpp
index 1b285ae4..da775340 100644
--- a/python/mnnllm.cpp
+++ b/python/mnnllm.cpp
@@ -11,6 +11,23 @@
 
 using namespace std;
 
+// macros
+#define def_attr(NAME) \
+static PyObject* PyLLM_get_##NAME(LLM *self, void *closure) {\
+    return PyLong_FromLong(self->llm->NAME##_);\
+}\
+static int PyLLM_set_##NAME(LLM *self, PyObject *value, void *closure) {\
+    if (self->llm) {\
+        self->llm->NAME##_ = PyLong_AsLong(value);\
+    }\
+    return 0;\
+}
+
+#define register_attr(NAME) \
+    {#NAME, (getter)PyLLM_get_##NAME, (setter)PyLLM_set_##NAME, "___"#NAME"__", NULL},
+// end
+
+// type convert start
 inline PyObject* string2Object(const std::string& str) {
 #if PY_MAJOR_VERSION == 2
     return PyString_FromString(str.c_str());
 #else
@@ -19,6 +36,114 @@ inline PyObject* string2Object(const std::string& str) {
 #endif
 }
 
+static inline PyObject* toPyObj(string val) {
+    return string2Object(val);
+}
+
+static inline PyObject* toPyObj(int val) {
+    return PyLong_FromLong(val);
+}
+
+template <typename T, PyObject*(*Func)(T)=toPyObj>
+static PyObject* toPyObj(vector<T> values) {
+    PyObject* obj = PyList_New(values.size());
+    for (int i = 0; i < values.size(); i++) {
+        PyList_SetItem(obj, i, Func(values[i]));
+    }
+    return obj;
+}
+
+/*
+static inline PyObject* toPyArray(MNN::Express::VARP var) {
+    auto info = var->getInfo();
+    auto shape = info->dim;
+    size_t total_length = info->size;
+    auto var_ptr = const_cast<void*>(var->readMap<void>());
+    std::vector<npy_intp> npy_dims;
+    for(const auto dim : shape) {
+        npy_dims.push_back(dim);
+    }
+    // auto data = PyArray_SimpleNewFromData(npy_dims.size(), npy_dims.data(), NPY_FLOAT, ptr);
+    auto ndarray = PyArray_SimpleNew(npy_dims.size(), npy_dims.data(), NPY_FLOAT);
+    void* npy_ptr = PyArray_DATA((PyArrayObject*)ndarray);
+    std::memcpy(npy_ptr, var_ptr, total_length * sizeof(float));
+    return (PyObject*)ndarray;
+}
+
+static inline PyObject* toPyArray(std::vector<int> vec) {
+    npy_intp dims[1] = { static_cast<npy_intp>(vec.size()) };
+    auto ndarray = PyArray_SimpleNew(1, dims, NPY_INT);
+    void* npy_ptr = PyArray_DATA((PyArrayObject*)ndarray);
+    std::memcpy(npy_ptr, vec.data(), vec.size() * sizeof(int));
+    return (PyObject*)ndarray;
+}
+*/
+
+static inline bool isInt(PyObject* obj) {
+    return PyLong_Check(obj)
+#if PY_MAJOR_VERSION < 3
+    || PyInt_Check(obj)
+#endif
+    ;
+}
+
+template <bool (*Func)(PyObject*)>
+static bool isVec(PyObject* obj) {
+    if (PyTuple_Check(obj)) {
+        if (PyTuple_Size(obj) > 0) {
+            return Func(PyTuple_GetItem(obj, 0));
+        } else return true;
+    } else if (PyList_Check(obj)) {
+        if (PyList_Size(obj) > 0) {
+            return Func(PyList_GetItem(obj, 0));
+        } else return true;
+    }
+    return false;
+}
+
+static inline bool isInts(PyObject* obj) {
+    return isInt(obj) || isVec<isInt>(obj);
+}
+
+inline int64_t unpackLong(PyObject* obj) {
+    int overflow;
+    long long value = PyLong_AsLongLongAndOverflow(obj, &overflow);
+    return (int64_t)value;
+}
+
+static inline int toInt(PyObject* obj) {
+    return static_cast<int>(unpackLong(obj));
+}
+
+template <typename T, T (*Func)(PyObject*)>
+static vector<T> toVec(PyObject* obj) {
+    vector<T> values;
+    if (PyTuple_Check(obj)) {
+        size_t size = PyTuple_Size(obj);
+        values.resize(size);
+        for (int i = 0; i < size; i++) {
+            values[i] = Func(PyTuple_GetItem(obj, i));
+        }
+        return values;
+    }
+    if (PyList_Check(obj)) {
+        size_t size = PyList_Size(obj);
+        values.resize(size);
+        for (int i = 0; i < size; i++) {
+            values[i] = Func(PyList_GetItem(obj, i));
+        }
+        return values;
+    }
+    values.push_back(Func(obj));
+    return values;
+}
+
+static inline std::vector<int> toInts(PyObject* obj) {
+    if (isInt(obj)) { return { toInt(obj) }; }
+    return toVec<int, toInt>(obj);
+}
+// type convert end
+
 typedef struct {
     PyObject_HEAD
     Llm* llm;
@@ -30,17 +155,36 @@ static PyObject* PyLLM_new(struct _typeobject *type, PyObject *args, PyObject *k
 }
 
 static PyObject* Py_str(PyObject *self) {
-    char str[50];
     LLM* llm = (LLM*)self;
-    sprintf(str, "Llm object: %p", llm->llm);
-    return Py_BuildValue("s", str);
+    if (!llm) {
+        Py_RETURN_NONE;
+    }
+    return toPyObj(llm->llm->model_name_);
+}
+
+static PyObject* PyLLM_load(LLM *self, PyObject *args) {
+    const char* model_dir = NULL;
+    if (!PyArg_ParseTuple(args, "s", &model_dir)) {
+        Py_RETURN_NONE;
+    }
+    self->llm->load(model_dir);
+    Py_RETURN_NONE;
+}
+
+static PyObject* PyLLM_generate(LLM *self, PyObject *args) {
+    PyObject *input_ids = nullptr;
+    if (!PyArg_ParseTuple(args, "O", &input_ids) || !isInts(input_ids)) {
+        Py_RETURN_NONE;
+    }
+    auto output_ids = self->llm->generate(toInts(input_ids));
+    return toPyObj(output_ids);
 }
 
 static PyObject* PyLLM_response(LLM *self, PyObject *args) {
     const char* query = NULL;
     int stream = 0;
     if (!PyArg_ParseTuple(args, "s|p", &query, &stream)) {
-        return NULL;
+        Py_RETURN_NONE;
     }
     LlmStreamBuffer buffer(nullptr);
     std::ostream null_os(&buffer);
@@ -49,24 +193,24 @@ static PyObject* PyLLM_response(LLM *self, PyObject *args) {
 }
 
 static PyMethodDef PyLLM_methods[] = {
-    {"response", (PyCFunction)PyLLM_response, METH_VARARGS, "response without hsitory."},
+    {"load", (PyCFunction)PyLLM_load, METH_VARARGS, "load model from `dir`."},
+    {"generate", (PyCFunction)PyLLM_generate, METH_VARARGS, "generate `output_ids` from `input_ids`."},
+    {"response", (PyCFunction)PyLLM_response, METH_VARARGS, "response to `query` without history."},
     {NULL} /* Sentinel */
 };
-
-static PyObject* PyLLM_get_mgl(LLM *self, void *closure) {
-    return PyLong_FromLong(self->llm->max_seq_len_);
-}
-
-static int PyLLM_set_mgl(LLM *self, PyObject *value, void *closure) {
-    if (self->llm) {
-        self->llm->max_seq_len_ = (int)PyLong_AsLong(value);
-    }
-    return 0;
-}
+def_attr(backend_type)
+def_attr(thread_num)
+def_attr(low_precision)
+def_attr(chatml)
+def_attr(max_new_tokens)
 
 static PyGetSetDef PyLLM_getsetters[] = {
-    {"max_gen_len", (getter)PyLLM_get_mgl, (setter)PyLLM_set_mgl, "___max_gen_len___", NULL},
+    register_attr(backend_type)
+    register_attr(thread_num)
+    register_attr(low_precision)
+    register_attr(chatml)
+    register_attr(max_new_tokens)
     {NULL} /* Sentinel */
 };
 
@@ -90,7 +234,7 @@ static PyTypeObject PyLLM = {
     0, /*tp_getattro*/
     0, /*tp_setattro*/
     0, /*tp_as_buffer*/
-    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE, /*tp_flags*/
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
     "LLM is mnn-llm's `Llm` python wrapper", /* tp_doc */
     0, /* tp_traverse */
     0, /* tp_clear */
@@ -111,11 +255,11 @@ static PyTypeObject PyLLM = {
     PyLLM_new, /* tp_new */
 };
 
-static PyObject *py_load(PyObject *self, PyObject *args) {
+static PyObject* py_create(PyObject *self, PyObject *args) {
     if (!PyTuple_Size(args)) {
         return NULL;
     }
-    const char *model_dir = NULL;
+    const char* model_dir = NULL;
     const char* model_type = "auto";
     if (!PyArg_ParseTuple(args, "s|s", &model_dir, &model_type)) {
         return NULL;
@@ -125,19 +269,19 @@ static PyObject *py_load(PyObject *self, PyObject *args) {
         return NULL;
     }
     llm->llm = Llm::createLLM(model_dir, model_type);
-    llm->llm->load(model_dir);
+    // llm->llm->load(model_dir);
     return (PyObject*)llm;
 }
 
 static PyMethodDef Methods[] = {
-    {"load", py_load, METH_VARARGS},
-    {NULL, NULL}
+    {"create", py_create, METH_VARARGS},
+    {NULL, NULL}
 };
 
 static struct PyModuleDef mnnllmModule = {
     PyModuleDef_HEAD_INIT,
     "cmnnllm", /*module name*/
-    "", /* module documentation, may be NULL */
+    "mnnllm cpython module.", /* module documentation, may be NULL */
     -1, /* size of per-interpreter state of the module, or -1 if the module keeps state in global variables. */
     Methods
 };
 
@@ -147,10 +291,12 @@ static void def(PyObject* m, PyMethodDef* method) {
 }
 
 PyMODINIT_FUNC PyInit_cmnnllm(void) {
-    PyObject *m = PyModule_Create(&mnnllmModule);
     if (PyType_Ready(&PyLLM) < 0) {
-        PyErr_SetString(PyExc_Exception, "init LLM: PyType_Ready PyLLM failed");
+        PyErr_SetString(PyExc_Exception, "init LLM: PyType_Ready PyLLM failed.");
+        return NULL;
     }
+    PyObject *m = PyModule_Create(&mnnllmModule);
+    // _import_array();
     PyModule_AddObject(m, "LLM", (PyObject *)&PyLLM);
     def(m, &Methods[0]);
     return m;
diff --git a/python/mnnllm/__init__.py b/python/mnnllm/__init__.py
index 9b00fe14..a82c1546 100644
--- a/python/mnnllm/__init__.py
+++ b/python/mnnllm/__init__.py
@@ -1 +1,77 @@
-from cmnnllm import *
+import cmnnllm as __cmnnllm
+
+class LLM(__cmnnllm.LLM):
+    def load(self, model_dir: str):
+        '''
+        load model from model_dir
+
+        Parameters
+        ----------
+        model_dir : model path (split) or model name (single)
+
+        Returns
+        -------
+        None
+
+        Example:
+        -------
+        >>> llm.load('../qwen-1.8b-int4')
+        '''
+        super().load(model_dir)
+
+    def generate(self, input_ids: list):
+        '''
+        generate output ids from input_ids
+
+        Parameters
+        ----------
+        input_ids : input token ids, list of int
+
+        Returns
+        -------
+        output_ids : output token ids, list of int
+
+        Example:
+        -------
+        >>> input_ids = [151644, 872, 198, 108386, 151645, 198, 151644, 77091]
+        >>> output_ids = qwen.generate(input_ids)
+        '''
+        return super().generate(input_ids)
+
+    def response(self, prompt: str, stream: bool = False):
+        '''
+        generate a response to prompt
+
+        Parameters
+        ----------
+        prompt : input prompt
+        stream : stream the output to stdout, default is False
+
+        Returns
+        -------
+        res : output string
+
+        Example:
+        -------
+        >>> res = qwen.response('Hello', True)
+        '''
+        return super().response(prompt, stream)
+
+def create(model_dir: str, model_type: str = 'auto'):
+    '''
+    create an LLM instance; the type is decided by `model_dir` or `model_type`
+
+    Parameters
+    ----------
+    model_dir : model path or model name containing the model type
+    model_type : model type, default is `auto`
+
+    Returns
+    -------
+    llm : LLM instance
+
+    Example:
+    -------
+    >>> qwen = mnnllm.create('../qwen-1.8b-int4.mnn')
+    '''
+    return __cmnnllm.create(model_dir, model_type)
\ No newline at end of file
diff --git a/python/setup.py b/python/setup.py
index 8fc67cfc..d691ca3d 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -32,9 +32,10 @@ def make_relative_rpath(path):
 setup(name='mnnllm',
       version='0.1',
+      language='c++',
       description='mnn-llm python',
       ext_modules=[module],
       packages=packages,
       data_files=lib_files,
       author='wangzhaode',
-      author_email='hi@zhaode.wang')
\ No newline at end of file
+      author_email='hi@zhaode.wang')
diff --git a/src/llm.cpp b/src/llm.cpp
index fb32219e..48acfc05 100644
--- a/src/llm.cpp
+++ b/src/llm.cpp
@@ -119,7 +119,7 @@ void Llm::chat() {
     reset();
 }
 
-void Llm::response_init() {
+void Llm::generate_init() {
     // init status
     gen_seq_len_ = 0;
     all_seq_len_ = 0;
@@ -135,7 +135,25 @@ void Llm::response_init() {
     }
 }
 
-std::string Llm::response_impl(const std::vector<int>& input_ids, std::ostream* os, const char* end_with) {
+std::vector<int> Llm::generate(const std::vector<int>& input_ids) {
+    generate_init();
+    std::vector<int> output_ids;
+    prompt_len_ = static_cast<int>(input_ids.size());
+    // prefill
+    int token = forward(input_ids);
+    output_ids.push_back(token);
+    // decode
+    while (gen_seq_len_ < max_new_tokens_) {
+        token = forward({token});
+        if (is_stop(token)) {
+            break;
+        }
+        output_ids.push_back(token);
+    }
+    return output_ids;
+}
+
+std::string Llm::generate(const std::vector<int>& input_ids, std::ostream* os, const char* end_with) {
     prompt_len_ = static_cast<int>(input_ids.size());
     auto st = std::chrono::system_clock::now();
     int token = forward(input_ids);
@@ -144,7 +162,7 @@ std::string Llm::response_impl(const std::vector<int>& input_ids, std::ostream*
     std::string output_str = decode(token);
     prefill_us_ = std::chrono::duration_cast<std::chrono::microseconds>(et - st).count();
     *os << output_str << std::flush;
-    while (gen_seq_len_ < max_seq_len_) {
+    while (gen_seq_len_ < max_new_tokens_) {
         st = std::chrono::system_clock::now();
         token = forward({token});
         et = std::chrono::system_clock::now();
@@ -165,29 +183,30 @@ std::string Llm::response_impl(const std::vector<int>& input_ids, std::ostream*
 }
 
 std::string Llm::response(const std::string& query, std::ostream* os, const char* end_with) {
-    response_init();
+    generate_init();
     if (!end_with) { end_with = "\n"; }
     // response
-    auto input_ids = tokenizer(query);
+    auto input_ids = chatml_ ? tokenizer(query) : tokenizer_encode(query);
+    // printf("ids = "); for (int id : input_ids) printf("%d, ", id); printf("\n");
     if (!history_.empty()) {
         std::copy(input_ids.begin(), input_ids.end(), std::back_inserter(history_));
         input_ids = history_;
     } else {
         history_ = input_ids;
     }
-    return response_impl(input_ids, os, end_with);
+    return generate(input_ids, os, end_with);
 }
 
 std::string Llm::response_nohistory(const std::string& query, std::ostream* os, const char* end_with) {
-    response_init();
+    generate_init();
     if (!end_with) { end_with = "\n"; }
     // response
-    auto input_ids = tokenizer(query);
-    return response_impl(input_ids, os, end_with);
+    auto input_ids = chatml_ ? tokenizer(query) : tokenizer_encode(query);
+    return generate(input_ids, os, end_with);
 }
 
 void Llm::print_speed() {
@@ -217,10 +236,11 @@ void Llm::load(const std::string& model_dir) {
     // init
     ScheduleConfig config;
     BackendConfig cpuBackendConfig;
-    config.type = MNN_FORWARD_CPU;
-    // config.type = MNN_FORWARD_OPENCL;
-    config.numThread = 4;
-    cpuBackendConfig.precision = BackendConfig::Precision_Low;
+    config.type = static_cast<MNNForwardType>(backend_type_);
+    config.numThread = thread_num_;
+    if (low_precision_) {
+        cpuBackendConfig.precision = BackendConfig::Precision_Low;
+    }
     cpuBackendConfig.memory = BackendConfig::Memory_Low;
     config.backendConfig = &cpuBackendConfig;
     runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config));
@@ -385,6 +405,7 @@ VARP Llm::txt_embedding(const std::vector<int>& input_ids) {
     if (needNewVar(inputs_embeds_, 0, seq_len)) {
        inputs_embeds_ = _Input({seq_len, 1, hidden_size_}, NCHW);
     }
+    size_t size = hidden_size_ * sizeof(int16_t);
     FILE* file = fopen(disk_embedding_file_.c_str(), "rb");
     std::unique_ptr<int16_t[]> buffer(new int16_t[hidden_size_]);
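
For reference, a rough usage sketch of the python API added by this patch. The model path and the generation settings below are placeholders, and note that `create()` no longer calls `load()`, so loading the weights is now an explicit step:

    # hypothetical example, assuming a converted qwen-1.8b-int4 model
    import mnnllm

    qwen = mnnllm.create('../qwen-1.8b-int4.mnn')  # picks the Llm subclass from the dir/type
    qwen.thread_num = 4                            # config attributes exposed via def_attr/register_attr
    qwen.max_new_tokens = 256
    qwen.load('../qwen-1.8b-int4.mnn')             # explicit load, no longer done inside create()

    print(qwen.response('Hello'))                  # chat-style response
    input_ids = [151644, 872, 198, 108386, 151645, 198, 151644, 77091]
    print(qwen.generate(input_ids))                # raw token-id generation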