diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c165b281..ae4f8b8d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,7 +35,7 @@ jobs: - name: Run unit tests run: yarn test --maxWorkers=2 --coverage - build-ios: + build-ios-from-source: runs-on: macos-latest steps: - name: Checkout @@ -69,7 +69,55 @@ jobs: run: | yarn build:ios - build-android: + build-ios-frameworks: + runs-on: macos-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Setup + uses: ./.github/actions/setup + + - name: Cache build & pods + uses: actions/cache@v3 + with: + path: | + example/ios/Pods + example/ios/build + key: ${{ runner.os }}-pods-${{ hashFiles('example/ios/Podfile.lock') }} + restore-keys: | + ${{ runner.os }}-pods- + + - name: Upgrade CocoaPods to version 1.15.2 + run: | + gem uninstall cocoapods --ignore-dependencies + gem install cocoapods -v 1.15.2 + + - name: Install cocoapods + run: | + yarn example pods + env: + NO_FLIPPER: 1 + RNLLAMA_BUILD_FROM_SOURCE: 0 + + - name: Build example for iOS + run: | + yarn build:ios + + build-android-from-source: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Setup + uses: ./.github/actions/setup + + - name: Build example for Android + run: | + yarn build:android + + build-android-libs: runs-on: ubuntu-latest steps: - name: Checkout @@ -80,4 +128,5 @@ jobs: - name: Build example for Android run: | + sed -i 's/rnllamaBuildFromSource=true/rnllamaBuildFromSource=false/g' example/android/gradle.properties yarn build:android diff --git a/.gitignore b/.gitignore index c5eda256..37724765 100644 --- a/.gitignore +++ b/.gitignore @@ -73,3 +73,11 @@ docs/API/.nojekyll *.metallib .xocde.env.local + +build-arm64/ +build-x86_64/ +jniLibs/ +build-ios/ +build-tvos/ + +ios/rnllama.xcframework/**/*.framework diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7716e3ac..bbdfb61d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -17,22 +17,6 @@ yarn bootstrap While developing, you can run the [example app](/example/) to test your changes. Any changes you make in your library's JavaScript code will be reflected in the example app without a rebuild. If you change any native code, then you'll need to rebuild the example app. -To start the packager: - -```sh -yarn example start -``` - -To run the example app on iOS: - -```sh -yarn example pods -yarn example ios -``` - -For test better performance on completion, you can run the app in Release mode: -- iOS: `yarn example ios --mode Release` - Make sure your code passes TypeScript and ESLint. Run the following to verify: ```sh diff --git a/README.md b/README.md index 2d293206..d0f1f100 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,8 @@ npm install llama.rn Please re-run `npx pod-install` again. +By default, `llama.rn` will use pre-built `rnllama.xcframework` for iOS. If you want to build from source, please set `RNLLAMA_BUILD_FROM_SOURCE` to `1` in your Podfile. + #### Android Add proguard rule if it's enabled in project (android/app/proguard-rules.pro): @@ -27,6 +29,8 @@ Add proguard rule if it's enabled in project (android/app/proguard-rules.pro): -keep class com.rnllama.** { *; } ``` +By default, `llama.rn` will use pre-built libraries for Android. If you want to build from source, please set `rnllamaBuildFromSource` to `true` in `android/gradle.properties`. + ## Obtain the model You can search HuggingFace for available models (Keyword: [`GGUF`](https://huggingface.co/search/full-text?q=GGUF&type=model)). 
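The CI jobs above exercise both build paths; as a rough local equivalent (a sketch assuming the example app layout used in this repo, not an exact CI reproduction), the same switches can be flipped like this:

```bash
# iOS: consume the pre-built rnllama.xcframework instead of compiling from source
# (the example Podfile reads RNLLAMA_BUILD_FROM_SOURCE; see its change further below)
NO_FLIPPER=1 RNLLAMA_BUILD_FROM_SOURCE=0 yarn example pods
yarn build:ios

# Android: flip the Gradle property checked in android/build.gradle
# (the same sed invocation the build-android-libs CI job uses)
sed -i 's/rnllamaBuildFromSource=true/rnllamaBuildFromSource=false/g' \
  example/android/gradle.properties
yarn build:android
```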
@@ -118,29 +122,6 @@ Please visit the [Documentation](docs/API) for more details. You can also visit the [example](example) to see how to use it. -Run the example: - -```bash -yarn && yarn bootstrap - -# iOS -yarn example ios -# Use device -yarn example ios --device "" -# With release mode -yarn example ios --mode Release - -# Android -yarn example android -# With release mode -yarn example android --mode release -``` - -This example used [react-native-document-picker](https://github.com/rnmods/react-native-document-picker) for select model. - -- iOS: You can move the model to iOS Simulator, or iCloud for real device. -- Android: Selected file will be copied or downloaded to cache directory so it may be slow. - ## Grammar Sampling GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. diff --git a/android/build.gradle b/android/build.gradle index 7cb93616..fae5d670 100644 --- a/android/build.gradle +++ b/android/build.gradle @@ -54,9 +54,18 @@ android { } } } - externalNativeBuild { - cmake { - path = file('src/main/CMakeLists.txt') + def rnllamaBuildFromSource = project.properties["rnllamaBuildFromSource"] + if (rnllamaBuildFromSource == "true") { + externalNativeBuild { + cmake { + path = file('src/main/CMakeLists.txt') + } + } + // Exclude jniLibs + sourceSets { + main { + jniLibs.srcDirs = [] + } } } buildTypes { diff --git a/android/src/main/CMakeLists.txt b/android/src/main/CMakeLists.txt index f8739d68..2d1fc782 100644 --- a/android/src/main/CMakeLists.txt +++ b/android/src/main/CMakeLists.txt @@ -45,9 +45,9 @@ set( ${RNLLAMA_LIB_DIR}/unicode.cpp ${RNLLAMA_LIB_DIR}/sgemm.cpp ${RNLLAMA_LIB_DIR}/common.cpp - ${RNLLAMA_LIB_DIR}/rn-llama.hpp ${RNLLAMA_LIB_DIR}/amx/amx.cpp ${RNLLAMA_LIB_DIR}/amx/mmq.cpp + ${RNLLAMA_LIB_DIR}/rn-llama.cpp ${CMAKE_SOURCE_DIR}/jni-utils.h ${CMAKE_SOURCE_DIR}/jni.cpp ) diff --git a/android/src/main/jni.cpp b/android/src/main/jni.cpp index ddcc1f66..72a008ea 100644 --- a/android/src/main/jni.cpp +++ b/android/src/main/jni.cpp @@ -11,7 +11,7 @@ #include "llama.h" #include "llama-impl.h" #include "ggml.h" -#include "rn-llama.hpp" +#include "rn-llama.h" #include "jni-utils.h" #define UNUSED(x) (void)(x) diff --git a/cpp/README.md b/cpp/README.md index 454426b0..84098c08 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -1,4 +1,4 @@ # Note -- Only `rn-llama.hpp` is the specific file for this project, others are sync from [llama.cpp](https://github.com/ggerganov/llama.cpp). +- Only `rn-llama.h` and `rn-llama.cpp` are the specific files for this folder, others are sync from [llama.cpp](https://github.com/ggerganov/llama.cpp). - We can update the native source by using the [bootstrap](../scripts/bootstrap.sh) script. 
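Before the new `rn-llama.h`/`rn-llama.cpp` pair introduced below, here is a short sketch of the sync flow the note above refers to, assuming the existing `yarn bootstrap` wiring; everything in `cpp/` other than these two files is pulled in from llama.cpp:

```bash
# Install JS dependencies and re-sync the llama.cpp sources into cpp/
# (on macOS the bootstrap script also rebuilds cpp/ggml-llama.metallib;
#  see the scripts/bootstrap.sh change further down)
yarn && yarn bootstrap

# Only these two files are maintained by hand in this repository
ls cpp/rn-llama.h cpp/rn-llama.cpp
```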
diff --git a/cpp/rn-llama.cpp b/cpp/rn-llama.cpp new file mode 100644 index 00000000..24da3f17 --- /dev/null +++ b/cpp/rn-llama.cpp @@ -0,0 +1,658 @@ +#include "rn-llama.h" + +namespace rnllama { + +const std::vector kv_cache_types = { + LM_GGML_TYPE_F32, + LM_GGML_TYPE_F16, + LM_GGML_TYPE_BF16, + LM_GGML_TYPE_Q8_0, + LM_GGML_TYPE_Q4_0, + LM_GGML_TYPE_Q4_1, + LM_GGML_TYPE_IQ4_NL, + LM_GGML_TYPE_Q5_0, + LM_GGML_TYPE_Q5_1, +}; + +lm_ggml_type kv_cache_type_from_str(const std::string & s) { + for (const auto & type : kv_cache_types) { + if (lm_ggml_type_name(type) == s) { + return type; + } + } + throw std::runtime_error("Unsupported cache type: " + s); +} + +static void llama_batch_clear(llama_batch *batch) { + batch->n_tokens = 0; +} + +static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, std::vector seq_ids, bool logits) { + batch->token [batch->n_tokens] = id; + batch->pos [batch->n_tokens] = pos; + batch->n_seq_id[batch->n_tokens] = seq_ids.size(); + for (size_t i = 0; i < seq_ids.size(); i++) { + batch->seq_id[batch->n_tokens][i] = seq_ids[i]; + } + batch->logits [batch->n_tokens] = logits ? 1 : 0; + batch->n_tokens += 1; +} + +// NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp + +static void log(const char *level, const char *function, int line, + const char *format, ...) +{ + va_list args; + #if defined(__ANDROID__) + char prefix[256]; + snprintf(prefix, sizeof(prefix), "%s:%d %s", function, line, format); + + va_start(args, format); + android_LogPriority priority; + if (strcmp(level, "ERROR") == 0) { + priority = ANDROID_LOG_ERROR; + } else if (strcmp(level, "WARNING") == 0) { + priority = ANDROID_LOG_WARN; + } else if (strcmp(level, "INFO") == 0) { + priority = ANDROID_LOG_INFO; + } else { + priority = ANDROID_LOG_DEBUG; + } + __android_log_vprint(priority, "RNLlama", prefix, args); + va_end(args); + #else + printf("[%s] %s:%d ", level, function, line); + va_start(args, format); + vprintf(format, args); + va_end(args); + printf("\n"); + #endif +} + +#if RNLLAMA_VERBOSE != 1 +#define LOG_VERBOSE(MSG, ...) +#else +#define LOG_VERBOSE(MSG, ...) \ + do \ + { \ + if (rnllama_verbose) \ + { \ + log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \ + } \ + } while (0) +#endif + +#define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__) +#define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__) +#define LOG_INFO(MSG, ...) 
log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__) + +static size_t common_part(const std::vector &a, const std::vector &b) +{ + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) + { + } + return i; +} + +static bool ends_with(const std::string &str, const std::string &suffix) +{ + return str.size() >= suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); +} + +static size_t find_partial_stop_string(const std::string &stop, + const std::string &text) +{ + if (!text.empty() && !stop.empty()) + { + const char text_last_char = text.back(); + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) + { + if (stop[char_index] == text_last_char) + { + const std::string current_partial = stop.substr(0, char_index + 1); + if (ends_with(text, current_partial)) + { + return text.size() - char_index - 1; + } + } + } + } + return std::string::npos; +} + +// format incomplete utf-8 multibyte character for output +std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) +{ + std::string out = token == -1 ? "" : common_token_to_piece(ctx, token); + // if the size is 1 and first bit is 1, meaning it's a partial character + // (size > 1 meaning it's already a known token) + if (out.size() == 1 && (out[0] & 0x80) == 0x80) + { + std::stringstream ss; + ss << std::hex << (out[0] & 0xff); + std::string res(ss.str()); + out = "byte: \\x" + res; + } + return out; +} + +std::string tokens_to_str(llama_context *ctx, const std::vector::const_iterator begin, const std::vector::const_iterator end) +{ + std::string ret; + for (auto it = begin; it != end; ++it) + { + ret += common_token_to_piece(ctx, *it); + } + return ret; +} + +llama_rn_context::~llama_rn_context() { + if (ctx_sampling != nullptr) { + common_sampler_free(ctx_sampling); + } +} + +void llama_rn_context::rewind() { + is_interrupted = false; + params.antiprompt.clear(); + params.sampling.grammar.clear(); + num_prompt_tokens = 0; + num_tokens_predicted = 0; + generated_text = ""; + generated_text.reserve(params.n_ctx); + generated_token_probs.clear(); + truncated = false; + stopped_eos = false; + stopped_word = false; + stopped_limit = false; + stopping_word = ""; + incomplete = false; + n_remain = 0; + n_past = 0; + params.sampling.n_prev = n_ctx; +} + +bool llama_rn_context::initSampling() { + if (ctx_sampling != nullptr) { + common_sampler_free(ctx_sampling); + } + ctx_sampling = common_sampler_init(model, params.sampling); + return ctx_sampling != nullptr; +} + +bool llama_rn_context::loadModel(common_params ¶ms_) +{ + params = params_; + llama_init = common_init_from_params(params); + model = llama_init.model.get(); + ctx = llama_init.context.get(); + if (model == nullptr) + { + LOG_ERROR("unable to load model: %s", params_.model.c_str()); + return false; + } + n_ctx = llama_n_ctx(ctx); + + // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101 + // LOG_INFO("%s\n", common_params_get_system_info(params).c_str()); + + return true; +} + +bool llama_rn_context::validateModelChatTemplate() const { + const char * tmpl = llama_model_chat_template(model); + llama_chat_message chat[] = {{"user", "test"}}; + int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0); + return chat_res > 0; +} + +void llama_rn_context::truncatePrompt(std::vector &prompt_tokens) { + const int n_left = n_ctx - params.n_keep; + const int n_block_size = n_left / 2; + const int erased_blocks = 
(prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size; + + // Keep n_keep tokens at start of prompt (at most n_ctx - 4) + std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); + + new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); + + LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s, num_prompt_tokens: %d", + n_ctx, + params.n_keep, + n_left, + tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()).c_str(), + new_tokens.size() + ); + + truncated = true; + prompt_tokens = new_tokens; +} + +void llama_rn_context::loadPrompt() { + std::vector prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true); + num_prompt_tokens = prompt_tokens.size(); + + // LOG tokens + std::stringstream ss; + ss << "\n" << __func__ << ": prompt_tokens = "; + for (auto& token : prompt_tokens) { + ss << token << " "; + } + LOG_INFO("%s\n", ss.str().c_str()); + + if (params.n_keep < 0) + { + params.n_keep = (int)num_prompt_tokens; + } + params.n_keep = std::min(n_ctx - 4, params.n_keep); + + // if input prompt is too big, truncate like normal + if (num_prompt_tokens >= (size_t) n_ctx) + { + truncatePrompt(prompt_tokens); + num_prompt_tokens = prompt_tokens.size(); + + LM_GGML_ASSERT(num_prompt_tokens < (size_t) n_ctx); + } + // push the prompt into the sampling context (do not apply grammar) + for (auto & token : prompt_tokens) + { + common_sampler_accept(ctx_sampling, token, false); + } + + // compare the evaluated prompt with the new prompt + n_past = common_part(embd, prompt_tokens); + + embd = prompt_tokens; + if (n_past == num_prompt_tokens) + { + // we have to evaluate at least 1 token to generate logits. + n_past--; + } + + // since #3228 we now have to manually manage the KV cache + llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + + LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s", + n_past, + tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past).c_str(), + tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str() + ); + + has_next_token = true; +} + +void llama_rn_context::beginCompletion() { + // number of tokens to keep when resetting context + n_remain = params.n_predict; + llama_perf_context_reset(ctx); + is_predicting = true; +} + +completion_token_output llama_rn_context::nextToken() +{ + completion_token_output result; + result.tok = -1; + + if (embd.size() >= (size_t)params.n_ctx) + { + // Shift context + + const int n_left = n_past - params.n_keep - 1; + const int n_discard = n_left/2; + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + + for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++) + { + embd[i - n_discard] = embd[i]; + } + embd.resize(embd.size() - n_discard); + + n_past -= n_discard; + + LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s", + params.n_ctx, + params.n_keep, + n_left + ); + } + + bool tg = true; + while (n_past < embd.size()) + { + int n_eval = (int)embd.size() - n_past; + tg = n_eval == 1; + if (n_eval > params.n_batch) + { + n_eval = params.n_batch; + } + if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval))) + { + LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s", + n_eval, + n_past, + params.cpuparams.n_threads, + tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str() + ); + 
has_next_token = false; + return result; + } + n_past += n_eval; + + if(is_interrupted) { + LOG_INFO("Decoding Interrupted"); + embd.resize(n_past); + has_next_token = false; + return result; + } + } + + const llama_vocab* vocab = llama_model_get_vocab(model); + + if (params.n_predict == 0) + { + has_next_token = false; + result.tok = llama_vocab_eos(vocab); + return result; + } + + { + // out of user input, sample next token + std::vector candidates; + candidates.reserve(llama_vocab_n_tokens(vocab)); + + result.tok = common_sampler_sample(ctx_sampling, ctx, -1); + + llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling); + + const int32_t n_probs = params.sampling.n_probs; + + // deprecated + /*if (params.sampling.temp <= 0 && n_probs > 0) + { + // For llama_sample_token_greedy we need to sort candidates + llama_sampler_init_softmax(); + + }*/ + + + for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) + { + result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); + } + + common_sampler_accept(ctx_sampling, result.tok, true); + if (tg) { + num_tokens_predicted++; + } + } + + // add it to the context + embd.push_back(result.tok); + // decrement remaining sampling budget + --n_remain; + + if (!embd.empty() && embd.back() == llama_vocab_eos(vocab)) + { + // stopping_word = llama_token_to_piece(ctx, embd.back()); + has_next_token = false; + stopped_eos = true; + LOG_VERBOSE("eos token found", ""); + return result; + } + + has_next_token = params.n_predict == -1 || n_remain != 0; + return result; +} + +size_t llama_rn_context::findStoppingStrings(const std::string &text, const size_t last_token_size, + const stop_type type) +{ + size_t stop_pos = std::string::npos; + for (const std::string &word : params.antiprompt) + { + size_t pos; + if (type == STOP_FULL) + { + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; + pos = text.find(word, from_pos); + } + else + { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) + { + if (type == STOP_FULL) + { + stopping_word = word; + stopped_word = true; + has_next_token = false; + } + stop_pos = pos; + } + } + return stop_pos; +} + +completion_token_output llama_rn_context::doCompletion() +{ + const completion_token_output token_with_probs = nextToken(); + + const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok); + generated_text += token_text; + + if (params.sampling.n_probs > 0) + { + generated_token_probs.push_back(token_with_probs); + } + + // check if there is incomplete UTF-8 character at the end + for (unsigned i = 1; i < 5 && i <= generated_text.size(); ++i) { + unsigned char c = generated_text[generated_text.size() - i]; + if ((c & 0xC0) == 0x80) { + // continuation byte: 10xxxxxx + continue; + } + if ((c & 0xE0) == 0xC0) { + // 2-byte character: 110xxxxx ... + incomplete = i < 2; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character: 1110xxxx ... + incomplete = i < 3; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character: 11110xxx ... 
+ incomplete = i < 4; + } + // else 1-byte character or invalid byte + break; + } + + if (incomplete && !has_next_token) + { + has_next_token = true; + n_remain++; + } + + if (!has_next_token && n_remain == 0) + { + stopped_limit = true; + } + + LOG_VERBOSE("next token, token: %s, token_text: %s, has_next_token: %d, n_remain: %d, num_tokens_predicted: %d, stopped_eos: %d, stopped_word: %d, stopped_limit: %d, stopping_word: %s", + common_token_to_piece(ctx, token_with_probs.tok), + tokens_to_output_formatted_string(ctx, token_with_probs.tok).c_str(), + has_next_token, + n_remain, + num_tokens_predicted, + stopped_eos, + stopped_word, + stopped_limit, + stopping_word.c_str() + ); + return token_with_probs; +} + +std::vector llama_rn_context::getEmbedding(common_params &embd_params) +{ + static const int n_embd = llama_model_n_embd(llama_get_model(ctx)); + if (!embd_params.embedding) + { + LOG_WARNING("embedding disabled, embedding: %s", embd_params.embedding); + return std::vector(n_embd, 0.0f); + } + float *data; + + const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + printf("pooling_type: %d\n", pooling_type); + if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + data = llama_get_embeddings(ctx); + } else { + data = llama_get_embeddings_seq(ctx, 0); + } + + if (!data) { + return std::vector(n_embd, 0.0f); + } + std::vector embedding(data, data + n_embd), out(data, data + n_embd); + common_embd_normalize(embedding.data(), out.data(), n_embd, embd_params.embd_normalize); + return out; +} + +std::string llama_rn_context::bench(int pp, int tg, int pl, int nr) +{ + if (is_predicting) { + LOG_ERROR("cannot benchmark while predicting", ""); + return std::string("[]"); + } + + is_predicting = true; + + double pp_avg = 0; + double tg_avg = 0; + + double pp_std = 0; + double tg_std = 0; + + // TODO: move batch into llama_rn_context (related https://github.com/mybigday/llama.rn/issues/30) + llama_batch batch = llama_batch_init( + std::min(pp, params.n_ubatch), // max n_tokens is limited by n_ubatch + 0, // No embeddings + 1 // Single sequence + ); + + for (int i = 0; i < nr; i++) + { + llama_batch_clear(&batch); + + const int n_tokens = pp; + + for (int i = 0; i < n_tokens; i++) + { + llama_batch_add(&batch, 0, i, {0}, false); + } + batch.logits[batch.n_tokens - 1] = 1; // true + + llama_kv_cache_clear(ctx); + + const int64_t t_pp_start = llama_time_us(); + if (llama_decode(ctx, batch) != 0) + { + LOG_ERROR("llama_decode() failed during prompt", ""); + } + const int64_t t_pp_end = llama_time_us(); + llama_kv_cache_clear(ctx); + + if (is_interrupted) break; + + const int64_t t_tg_start = llama_time_us(); + + for (int i = 0; i < tg; i++) + { + llama_batch_clear(&batch); + + for (int j = 0; j < pl; j++) + { + llama_batch_add(&batch, 0, i, {j}, true); + } + + if (llama_decode(ctx, batch) != 0) + { + LOG_ERROR("llama_decode() failed during text generation", ""); + } + if (is_interrupted) break; + } + + const int64_t t_tg_end = llama_time_us(); + + llama_kv_cache_clear(ctx); + + const double t_pp = (t_pp_end - t_pp_start) / 1000000.0; + const double t_tg = (t_tg_end - t_tg_start) / 1000000.0; + + const double speed_pp = pp / t_pp; + const double speed_tg = (pl * tg) / t_tg; + + pp_avg += speed_pp; + tg_avg += speed_tg; + + pp_std += speed_pp * speed_pp; + tg_std += speed_tg * speed_tg; + } + + pp_avg /= nr; + tg_avg /= nr; + + if (nr > 1) { + pp_std = sqrt(pp_std / (nr - 1) - pp_avg * pp_avg * nr / (nr - 1)); + tg_std = sqrt(tg_std / (nr - 1) - tg_avg * tg_avg * nr / (nr - 1)); + } else { 
+ pp_std = 0; + tg_std = 0; + } + + if (is_interrupted) llama_kv_cache_clear(ctx); + is_predicting = false; + + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + return std::string("[\"") + model_desc + std::string("\",") + + std::to_string(llama_model_size(model)) + std::string(",") + + std::to_string(llama_model_n_params(model)) + std::string(",") + + std::to_string(pp_avg) + std::string(",") + + std::to_string(pp_std) + std::string(",") + + std::to_string(tg_avg) + std::string(",") + + std::to_string(tg_std) + + std::string("]"); +} + +int llama_rn_context::applyLoraAdapters(std::vector lora) { + for (auto &la : lora) { + la.ptr = llama_adapter_lora_init(model, la.path.c_str()); + if (la.ptr == nullptr) { + LOG_ERROR("failed to apply lora adapter '%s'\n", la.path.c_str()); + return -1; + } + } + this->lora = lora; + common_set_adapter_lora(ctx, lora); + return 0; +} + +void llama_rn_context::removeLoraAdapters() { + this->lora.clear(); + common_set_adapter_lora(ctx, this->lora); // apply empty list +} + +std::vector llama_rn_context::getLoadedLoraAdapters() { + return this->lora; +} + +} diff --git a/cpp/rn-llama.h b/cpp/rn-llama.h new file mode 100644 index 00000000..0e98998a --- /dev/null +++ b/cpp/rn-llama.h @@ -0,0 +1,119 @@ +#ifndef RNLLAMA_H +#define RNLLAMA_H + +#include +#include +#include "common.h" +#include "ggml.h" +#include "gguf.h" +#include "llama.h" +#include "llama-impl.h" +#include "sampling.h" +#if defined(__ANDROID__) +#include +#endif + +namespace rnllama { + +std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token); + +std::string tokens_to_str(llama_context *ctx, const std::vector::const_iterator begin, const std::vector::const_iterator end); + +lm_ggml_type kv_cache_type_from_str(const std::string & s); + +enum stop_type +{ + STOP_FULL, + STOP_PARTIAL, +}; + +// completion token output with probabilities +struct completion_token_output +{ + struct token_prob + { + llama_token tok; + float prob; + }; + + std::vector probs; + llama_token tok; +}; + +// Main context class +struct llama_rn_context { + bool is_predicting = false; + bool is_interrupted = false; + bool has_next_token = false; + std::string generated_text; + std::vector generated_token_probs; + + size_t num_prompt_tokens = 0; + size_t num_tokens_predicted = 0; + size_t n_past = 0; + size_t n_remain = 0; + + std::vector embd; + common_params params; + common_init_result llama_init; + + llama_model *model = nullptr; + float loading_progress = 0; + bool is_load_interrupted = false; + + llama_context *ctx = nullptr; + common_sampler *ctx_sampling = nullptr; + + int n_ctx; + + bool truncated = false; + bool stopped_eos = false; + bool stopped_word = false; + bool stopped_limit = false; + std::string stopping_word; + bool incomplete = false; + + std::vector lora; + + ~llama_rn_context(); + + void rewind(); + bool initSampling(); + bool loadModel(common_params ¶ms_); + bool validateModelChatTemplate() const; + void truncatePrompt(std::vector &prompt_tokens); + void loadPrompt(); + void beginCompletion(); + completion_token_output nextToken(); + size_t findStoppingStrings(const std::string &text, const size_t last_token_size, const stop_type type); + completion_token_output doCompletion(); + std::vector getEmbedding(common_params &embd_params); + std::string bench(int pp, int tg, int pl, int nr); + int applyLoraAdapters(std::vector lora); + void removeLoraAdapters(); + std::vector getLoadedLoraAdapters(); +};\ + +// Logging macros +extern 
bool rnllama_verbose; + +#if RNLLAMA_VERBOSE != 1 +#define LOG_VERBOSE(MSG, ...) +#else +#define LOG_VERBOSE(MSG, ...) \ + do \ + { \ + if (rnllama_verbose) \ + { \ + log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \ + } \ + } while (0) +#endif + +#define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__) +#define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__) +#define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__) + +} // namespace rnllama + +#endif /* RNLLAMA_H */ diff --git a/cpp/rn-llama.hpp b/cpp/rn-llama.hpp deleted file mode 100644 index a386cf1e..00000000 --- a/cpp/rn-llama.hpp +++ /dev/null @@ -1,737 +0,0 @@ -#ifndef RNLLAMA_H -#define RNLLAMA_H - -#include -#include -#include "common.h" -#include "ggml.h" -#include "gguf.h" -#include "llama.h" -#include "llama-impl.h" -#include "sampling.h" -#if defined(__ANDROID__) -#include -#endif - -namespace rnllama { - -const std::vector kv_cache_types = { - LM_GGML_TYPE_F32, - LM_GGML_TYPE_F16, - LM_GGML_TYPE_BF16, - LM_GGML_TYPE_Q8_0, - LM_GGML_TYPE_Q4_0, - LM_GGML_TYPE_Q4_1, - LM_GGML_TYPE_IQ4_NL, - LM_GGML_TYPE_Q5_0, - LM_GGML_TYPE_Q5_1, -}; - -static lm_ggml_type kv_cache_type_from_str(const std::string & s) { - for (const auto & type : kv_cache_types) { - if (lm_ggml_type_name(type) == s) { - return type; - } - } - throw std::runtime_error("Unsupported cache type: " + s); -} - -static void llama_batch_clear(llama_batch *batch) { - batch->n_tokens = 0; -} - -static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, std::vector seq_ids, bool logits) { - batch->token [batch->n_tokens] = id; - batch->pos [batch->n_tokens] = pos; - batch->n_seq_id[batch->n_tokens] = seq_ids.size(); - for (size_t i = 0; i < seq_ids.size(); i++) { - batch->seq_id[batch->n_tokens][i] = seq_ids[i]; - } - batch->logits [batch->n_tokens] = logits ? 1 : 0; - batch->n_tokens += 1; -} - -// NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp - -static void log(const char *level, const char *function, int line, - const char *format, ...) -{ - va_list args; - #if defined(__ANDROID__) - char prefix[256]; - snprintf(prefix, sizeof(prefix), "%s:%d %s", function, line, format); - - va_start(args, format); - android_LogPriority priority; - if (strcmp(level, "ERROR") == 0) { - priority = ANDROID_LOG_ERROR; - } else if (strcmp(level, "WARNING") == 0) { - priority = ANDROID_LOG_WARN; - } else if (strcmp(level, "INFO") == 0) { - priority = ANDROID_LOG_INFO; - } else { - priority = ANDROID_LOG_DEBUG; - } - __android_log_vprint(priority, "RNLlama", prefix, args); - va_end(args); - #else - printf("[%s] %s:%d ", level, function, line); - va_start(args, format); - vprintf(format, args); - va_end(args); - printf("\n"); - #endif -} -static bool rnllama_verbose = false; - -#if RNLLAMA_VERBOSE != 1 -#define LOG_VERBOSE(MSG, ...) -#else -#define LOG_VERBOSE(MSG, ...) \ - do \ - { \ - if (rnllama_verbose) \ - { \ - log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \ - } \ - } while (0) -#endif - -#define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__) -#define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__) -#define LOG_INFO(MSG, ...) 
log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__) - -enum stop_type -{ - STOP_FULL, - STOP_PARTIAL, -}; - -// completion token output with probabilities -struct completion_token_output -{ - struct token_prob - { - llama_token tok; - float prob; - }; - - std::vector probs; - llama_token tok; -}; - -static size_t common_part(const std::vector &a, const std::vector &b) -{ - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) - { - } - return i; -} - -static bool ends_with(const std::string &str, const std::string &suffix) -{ - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} - -static size_t find_partial_stop_string(const std::string &stop, - const std::string &text) -{ - if (!text.empty() && !stop.empty()) - { - const char text_last_char = text.back(); - for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) - { - if (stop[char_index] == text_last_char) - { - const std::string current_partial = stop.substr(0, char_index + 1); - if (ends_with(text, current_partial)) - { - return text.size() - char_index - 1; - } - } - } - } - return std::string::npos; -} - -// format incomplete utf-8 multibyte character for output -static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) -{ - std::string out = token == -1 ? "" : common_token_to_piece(ctx, token); - // if the size is 1 and first bit is 1, meaning it's a partial character - // (size > 1 meaning it's already a known token) - if (out.size() == 1 && (out[0] & 0x80) == 0x80) - { - std::stringstream ss; - ss << std::hex << (out[0] & 0xff); - std::string res(ss.str()); - out = "byte: \\x" + res; - } - return out; -} - -template -static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) -{ - std::string ret; - for (; begin != end; ++begin) - { - ret += common_token_to_piece(ctx, *begin); - } - return ret; -} - -struct llama_rn_context -{ - bool is_predicting = false; - bool is_interrupted = false; - bool has_next_token = false; - std::string generated_text; - std::vector generated_token_probs; - - size_t num_prompt_tokens = 0; - size_t num_tokens_predicted = 0; - size_t n_past = 0; - size_t n_remain = 0; - - std::vector embd; - - common_params params; - - common_init_result llama_init; - - llama_model *model = nullptr; - float loading_progress = 0; - bool is_load_interrupted = false; - - llama_context *ctx = nullptr; - common_sampler *ctx_sampling = nullptr; - - int n_ctx; - - bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; - std::string stopping_word; - bool incomplete = false; - - std::vector lora; - - ~llama_rn_context() - { - if (ctx_sampling != nullptr) - { - common_sampler_free(ctx_sampling); - } - } - - void rewind() - { - is_interrupted = false; - params.antiprompt.clear(); - params.sampling.grammar.clear(); - num_prompt_tokens = 0; - num_tokens_predicted = 0; - generated_text = ""; - generated_text.reserve(params.n_ctx); - generated_token_probs.clear(); - truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; - stopping_word = ""; - incomplete = false; - n_remain = 0; - n_past = 0; - params.sampling.n_prev = n_ctx; - } - - bool initSampling() { - if (ctx_sampling != nullptr) { - common_sampler_free(ctx_sampling); - } - ctx_sampling = common_sampler_init(model, params.sampling); - return ctx_sampling != nullptr; - } - - bool loadModel(common_params ¶ms_) - { - params = params_; 
- llama_init = common_init_from_params(params); - model = llama_init.model.get(); - ctx = llama_init.context.get(); - if (model == nullptr) - { - LOG_ERROR("unable to load model: %s", params_.model.c_str()); - return false; - } - n_ctx = llama_n_ctx(ctx); - - // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101 - // LOG_INFO("%s\n", common_params_get_system_info(params).c_str()); - - return true; - } - - bool validateModelChatTemplate() const { - const char * tmpl = llama_model_chat_template(model); - llama_chat_message chat[] = {{"user", "test"}}; - int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0); - return chat_res > 0; - } - - void truncatePrompt(std::vector &prompt_tokens) { - const int n_left = n_ctx - params.n_keep; - const int n_block_size = n_left / 2; - const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size; - - // Keep n_keep tokens at start of prompt (at most n_ctx - 4) - std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - - new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); - - LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s, num_prompt_tokens: %d", - n_ctx, - params.n_keep, - n_left, - tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()).c_str(), - new_tokens.size() - ); - - truncated = true; - prompt_tokens = new_tokens; - } - - void loadPrompt() - { - std::vector prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true); - num_prompt_tokens = prompt_tokens.size(); - - // LOG tokens - std::stringstream ss; - ss << "\n" << __func__ << ": prompt_tokens = "; - for (auto& token : prompt_tokens) { - ss << token << " "; - } - LOG_INFO("%s\n", ss.str().c_str()); - - if (params.n_keep < 0) - { - params.n_keep = (int)num_prompt_tokens; - } - params.n_keep = std::min(n_ctx - 4, params.n_keep); - - // if input prompt is too big, truncate like normal - if (num_prompt_tokens >= (size_t) n_ctx) - { - truncatePrompt(prompt_tokens); - num_prompt_tokens = prompt_tokens.size(); - - LM_GGML_ASSERT(num_prompt_tokens < (size_t) n_ctx); - } - // push the prompt into the sampling context (do not apply grammar) - for (auto & token : prompt_tokens) - { - common_sampler_accept(ctx_sampling, token, false); - } - - // compare the evaluated prompt with the new prompt - n_past = common_part(embd, prompt_tokens); - - embd = prompt_tokens; - if (n_past == num_prompt_tokens) - { - // we have to evaluate at least 1 token to generate logits. 
- n_past--; - } - - // since #3228 we now have to manually manage the KV cache - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); - - LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s", - n_past, - tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past).c_str(), - tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str() - ); - - has_next_token = true; - } - - void beginCompletion() - { - // number of tokens to keep when resetting context - n_remain = params.n_predict; - llama_perf_context_reset(ctx); - is_predicting = true; - } - - completion_token_output nextToken() - { - completion_token_output result; - result.tok = -1; - - if (embd.size() >= (size_t)params.n_ctx) - { - // Shift context - - const int n_left = n_past - params.n_keep - 1; - const int n_discard = n_left/2; - - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); - - for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++) - { - embd[i - n_discard] = embd[i]; - } - embd.resize(embd.size() - n_discard); - - n_past -= n_discard; - - LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s", - params.n_ctx, - params.n_keep, - n_left - ); - } - - bool tg = true; - while (n_past < embd.size()) - { - int n_eval = (int)embd.size() - n_past; - tg = n_eval == 1; - if (n_eval > params.n_batch) - { - n_eval = params.n_batch; - } - if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval))) - { - LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s", - n_eval, - n_past, - params.cpuparams.n_threads, - tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str() - ); - has_next_token = false; - return result; - } - n_past += n_eval; - - if(is_interrupted) { - LOG_INFO("Decoding Interrupted"); - embd.resize(n_past); - has_next_token = false; - return result; - } - } - - const llama_vocab* vocab = llama_model_get_vocab(model); - - if (params.n_predict == 0) - { - has_next_token = false; - result.tok = llama_vocab_eos(vocab); - return result; - } - - { - // out of user input, sample next token - std::vector candidates; - candidates.reserve(llama_vocab_n_tokens(vocab)); - - result.tok = common_sampler_sample(ctx_sampling, ctx, -1); - - llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling); - - const int32_t n_probs = params.sampling.n_probs; - - // deprecated - /*if (params.sampling.temp <= 0 && n_probs > 0) - { - // For llama_sample_token_greedy we need to sort candidates - llama_sampler_init_softmax(); - - }*/ - - - for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) - { - result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); - } - - common_sampler_accept(ctx_sampling, result.tok, true); - if (tg) { - num_tokens_predicted++; - } - } - - // add it to the context - embd.push_back(result.tok); - // decrement remaining sampling budget - --n_remain; - - if (!embd.empty() && embd.back() == llama_vocab_eos(vocab)) - { - // stopping_word = llama_token_to_piece(ctx, embd.back()); - has_next_token = false; - stopped_eos = true; - LOG_VERBOSE("eos token found", ""); - return result; - } - - has_next_token = params.n_predict == -1 || n_remain != 0; - return result; - } - - size_t findStoppingStrings(const std::string &text, const size_t last_token_size, - const stop_type type) - { - size_t stop_pos = std::string::npos; - for (const std::string &word : params.antiprompt) - { - size_t pos; - if (type == 
STOP_FULL) - { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; - pos = text.find(word, from_pos); - } - else - { - pos = find_partial_stop_string(word, text); - } - if (pos != std::string::npos && - (stop_pos == std::string::npos || pos < stop_pos)) - { - if (type == STOP_FULL) - { - stopping_word = word; - stopped_word = true; - has_next_token = false; - } - stop_pos = pos; - } - } - return stop_pos; - } - - completion_token_output doCompletion() - { - const completion_token_output token_with_probs = nextToken(); - - const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok); - generated_text += token_text; - - if (params.sampling.n_probs > 0) - { - generated_token_probs.push_back(token_with_probs); - } - - // check if there is incomplete UTF-8 character at the end - for (unsigned i = 1; i < 5 && i <= generated_text.size(); ++i) { - unsigned char c = generated_text[generated_text.size() - i]; - if ((c & 0xC0) == 0x80) { - // continuation byte: 10xxxxxx - continue; - } - if ((c & 0xE0) == 0xC0) { - // 2-byte character: 110xxxxx ... - incomplete = i < 2; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character: 1110xxxx ... - incomplete = i < 3; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character: 11110xxx ... - incomplete = i < 4; - } - // else 1-byte character or invalid byte - break; - } - - if (incomplete && !has_next_token) - { - has_next_token = true; - n_remain++; - } - - if (!has_next_token && n_remain == 0) - { - stopped_limit = true; - } - - LOG_VERBOSE("next token, token: %s, token_text: %s, has_next_token: %d, n_remain: %d, num_tokens_predicted: %d, stopped_eos: %d, stopped_word: %d, stopped_limit: %d, stopping_word: %s", - common_token_to_piece(ctx, token_with_probs.tok), - tokens_to_output_formatted_string(ctx, token_with_probs.tok).c_str(), - has_next_token, - n_remain, - num_tokens_predicted, - stopped_eos, - stopped_word, - stopped_limit, - stopping_word.c_str() - ); - return token_with_probs; - } - - std::vector getEmbedding(common_params &embd_params) - { - static const int n_embd = llama_model_n_embd(llama_get_model(ctx)); - if (!embd_params.embedding) - { - LOG_WARNING("embedding disabled, embedding: %s", embd_params.embedding); - return std::vector(n_embd, 0.0f); - } - float *data; - - const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - printf("pooling_type: %d\n", pooling_type); - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - data = llama_get_embeddings(ctx); - } else { - data = llama_get_embeddings_seq(ctx, 0); - } - - if (!data) { - return std::vector(n_embd, 0.0f); - } - std::vector embedding(data, data + n_embd), out(data, data + n_embd); - common_embd_normalize(embedding.data(), out.data(), n_embd, embd_params.embd_normalize); - return out; - } - - std::string bench(int pp, int tg, int pl, int nr) - { - if (is_predicting) { - LOG_ERROR("cannot benchmark while predicting", ""); - return std::string("[]"); - } - - is_predicting = true; - - double pp_avg = 0; - double tg_avg = 0; - - double pp_std = 0; - double tg_std = 0; - - // TODO: move batch into llama_rn_context (related https://github.com/mybigday/llama.rn/issues/30) - llama_batch batch = llama_batch_init( - std::min(pp, params.n_ubatch), // max n_tokens is limited by n_ubatch - 0, // No embeddings - 1 // Single sequence - ); - - for (int i = 0; i < nr; i++) - { - llama_batch_clear(&batch); - - const int n_tokens = pp; - - for (int i = 0; i < n_tokens; 
i++) - { - llama_batch_add(&batch, 0, i, {0}, false); - } - batch.logits[batch.n_tokens - 1] = 1; // true - - llama_kv_cache_clear(ctx); - - const int64_t t_pp_start = llama_time_us(); - if (llama_decode(ctx, batch) != 0) - { - LOG_ERROR("llama_decode() failed during prompt", ""); - } - const int64_t t_pp_end = llama_time_us(); - llama_kv_cache_clear(ctx); - - if (is_interrupted) break; - - const int64_t t_tg_start = llama_time_us(); - - for (int i = 0; i < tg; i++) - { - llama_batch_clear(&batch); - - for (int j = 0; j < pl; j++) - { - llama_batch_add(&batch, 0, i, {j}, true); - } - - if (llama_decode(ctx, batch) != 0) - { - LOG_ERROR("llama_decode() failed during text generation", ""); - } - if (is_interrupted) break; - } - - const int64_t t_tg_end = llama_time_us(); - - llama_kv_cache_clear(ctx); - - const double t_pp = (t_pp_end - t_pp_start) / 1000000.0; - const double t_tg = (t_tg_end - t_tg_start) / 1000000.0; - - const double speed_pp = pp / t_pp; - const double speed_tg = (pl * tg) / t_tg; - - pp_avg += speed_pp; - tg_avg += speed_tg; - - pp_std += speed_pp * speed_pp; - tg_std += speed_tg * speed_tg; - } - - pp_avg /= nr; - tg_avg /= nr; - - if (nr > 1) { - pp_std = sqrt(pp_std / (nr - 1) - pp_avg * pp_avg * nr / (nr - 1)); - tg_std = sqrt(tg_std / (nr - 1) - tg_avg * tg_avg * nr / (nr - 1)); - } else { - pp_std = 0; - tg_std = 0; - } - - if (is_interrupted) llama_kv_cache_clear(ctx); - is_predicting = false; - - char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); - return std::string("[\"") + model_desc + std::string("\",") + - std::to_string(llama_model_size(model)) + std::string(",") + - std::to_string(llama_model_n_params(model)) + std::string(",") + - std::to_string(pp_avg) + std::string(",") + - std::to_string(pp_std) + std::string(",") + - std::to_string(tg_avg) + std::string(",") + - std::to_string(tg_std) + - std::string("]"); - } - - int applyLoraAdapters(std::vector lora) { - for (auto &la : lora) { - la.ptr = llama_adapter_lora_init(model, la.path.c_str()); - if (la.ptr == nullptr) { - LOG_ERROR("failed to apply lora adapter '%s'\n", la.path.c_str()); - return -1; - } - } - this->lora = lora; - common_set_adapter_lora(ctx, lora); - return 0; - } - - void removeLoraAdapters() { - this->lora.clear(); - common_set_adapter_lora(ctx, this->lora); // apply empty list - } - - std::vector getLoadedLoraAdapters() { - return this->lora; - } -}; - -} - -#endif /* LLAMA_H */ diff --git a/example/README.md b/example/README.md new file mode 100644 index 00000000..0d9d6e10 --- /dev/null +++ b/example/README.md @@ -0,0 +1,63 @@ +# llama.rn example + +This is an example of how to use the llama.rn library. + +This example used [react-native-document-picker](https://github.com/rnmods/react-native-document-picker) for select model. + +- iOS: You can move the model to iOS Simulator, or iCloud for real device. +- Android: Selected file will be copied or downloaded to cache directory so it may be slow. + +## Requirements + +Please back to the root directory and run the following command: + +```bash +yarn && yarn bootstrap +``` + +## iOS + +1. Install pods + +```bash +yarn pods +``` + +2. Run the example + +```bash +yarn ios +# Use device +yarn ios --device "" +# With release mode +yarn ios --mode Release +``` + +## Android + +Run the example: +```bash +yarn android +# With release mode +yarn android --mode release +``` + +## Build with frameworks/libs + +This example is build llama.rn from source code by default, you can also build with frameworks/libs. 
+ +```bash +# Build iOS frameworks +yarn build:ios-frameworks +# Build Android libs +yarn build:android-libs +``` + +Then you can setup the environment variable / properties in your project: + +iOS: +```bash +RNLLAMA_BUILD_FROM_SOURCE=0 yarn pods +``` + +Android: Edit `android/gradle.properties` and set `rnllamaBuildFromSource` to `false`. diff --git a/example/android/gradle.properties b/example/android/gradle.properties index e1ddc51c..6947ba4b 100644 --- a/example/android/gradle.properties +++ b/example/android/gradle.properties @@ -42,3 +42,7 @@ newArchEnabled=true # Use this property to enable or disable the Hermes JS engine. # If set to false, you will be using JSC instead. hermesEnabled=true + +# Use this property to enable or disable the RNLLAMA build from source. +# If set to true, the RNLLAMA library will be built from source. +rnllamaBuildFromSource=true diff --git a/example/ios/.xcode.env.local b/example/ios/.xcode.env.local index 7793e02a..ed82726d 100644 --- a/example/ios/.xcode.env.local +++ b/example/ios/.xcode.env.local @@ -1 +1 @@ -export NODE_BINARY=/var/folders/4z/1d45cfts3936kdm7v9jl349r0000gn/T/yarn--1737601800519-0.4653948355776887/node +export NODE_BINARY=/var/folders/4z/1d45cfts3936kdm7v9jl349r0000gn/T/yarn--1737683137807-0.2592527908357527/node diff --git a/example/ios/Podfile b/example/ios/Podfile index 011b69e6..657296c3 100644 --- a/example/ios/Podfile +++ b/example/ios/Podfile @@ -18,6 +18,8 @@ end ENV['RCT_NEW_ARCH_ENABLED'] = '1' +ENV['RNLLAMA_BUILD_FROM_SOURCE'] = ENV['RNLLAMA_BUILD_FROM_SOURCE'] || '1' + target 'RNLlamaExample' do config = use_native_modules! diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index 78d05732..7efe8517 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -1270,7 +1270,7 @@ SPEC CHECKSUMS: glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322 libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913 - llama-rn: 1510d60298c2a0dcec8ff2922b1689daea663d9e + llama-rn: c90e797ab8ba372d7c8c47b2d7703b0a6c45d048 RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1 RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18 RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3 @@ -1314,6 +1314,6 @@ SPEC CHECKSUMS: SocketRocket: f32cd54efbe0f095c4d7594881e52619cfe80b17 Yoga: 8796b55dba14d7004f980b54bcc9833ee45b28ce -PODFILE CHECKSUM: cf6cc1e14840d7bc13c75e90df0f55023f1d4ad2 +PODFILE CHECKSUM: 65536592e23bc5ecf357f53e2deda450e33f1c90 COCOAPODS: 1.15.2 diff --git a/ios/CMakeLists.txt b/ios/CMakeLists.txt new file mode 100644 index 00000000..eb52f746 --- /dev/null +++ b/ios/CMakeLists.txt @@ -0,0 +1,99 @@ +cmake_minimum_required(VERSION 3.16) +project(rnllama VERSION 1.0.0 LANGUAGES CXX C) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# iOS specific settings +set(CMAKE_OSX_DEPLOYMENT_TARGET 13.0) +set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE NO) + +# Dependencies and compile options +add_definitions( + -DNDEBUG + -DO3 + -DLM_GGML_USE_CPU + -DLM_GGML_USE_ACCELERATE + -DLM_GGML_USE_METAL +) + +set(SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../cpp) + +# Define public headers +set(PUBLIC_HEADERS + ${SOURCE_DIR}/rn-llama.h + ${SOURCE_DIR}/llama.h + ${SOURCE_DIR}/llama-impl.h + ${SOURCE_DIR}/ggml.h +) + +# Create library target +add_library(rnllama SHARED + ${SOURCE_DIR}/ggml.c + ${SOURCE_DIR}/ggml-alloc.c + ${SOURCE_DIR}/ggml-backend.cpp + ${SOURCE_DIR}/ggml-backend-reg.cpp + ${SOURCE_DIR}/ggml-cpu.c + ${SOURCE_DIR}/ggml-cpu.cpp + 
${SOURCE_DIR}/ggml-cpu-aarch64.cpp + ${SOURCE_DIR}/ggml-cpu-quants.c + ${SOURCE_DIR}/ggml-cpu-traits.cpp + ${SOURCE_DIR}/ggml-metal.m + ${SOURCE_DIR}/ggml-opt.cpp + ${SOURCE_DIR}/ggml-threading.cpp + ${SOURCE_DIR}/ggml-quants.c + ${SOURCE_DIR}/gguf.cpp + ${SOURCE_DIR}/log.cpp + ${SOURCE_DIR}/llama-impl.cpp + ${SOURCE_DIR}/llama-grammar.cpp + ${SOURCE_DIR}/llama-sampling.cpp + ${SOURCE_DIR}/llama-vocab.cpp + ${SOURCE_DIR}/llama-adapter.cpp + ${SOURCE_DIR}/llama-chat.cpp + ${SOURCE_DIR}/llama-context.cpp + ${SOURCE_DIR}/llama-kv-cache.cpp + ${SOURCE_DIR}/llama-arch.cpp + ${SOURCE_DIR}/llama-batch.cpp + ${SOURCE_DIR}/llama-cparams.cpp + ${SOURCE_DIR}/llama-hparams.cpp + ${SOURCE_DIR}/llama.cpp + ${SOURCE_DIR}/llama-model.cpp + ${SOURCE_DIR}/llama-model-loader.cpp + ${SOURCE_DIR}/llama-mmap.cpp + ${SOURCE_DIR}/llama-vocab.cpp + ${SOURCE_DIR}/sampling.cpp + ${SOURCE_DIR}/unicode-data.cpp + ${SOURCE_DIR}/unicode.cpp + ${SOURCE_DIR}/sgemm.cpp + ${SOURCE_DIR}/common.cpp + ${SOURCE_DIR}/amx/amx.cpp + ${SOURCE_DIR}/amx/mmq.cpp + ${SOURCE_DIR}/rn-llama.cpp +) + +# Setup include directories +target_include_directories(rnllama + PUBLIC + $ + $ +) + +# Link required frameworks +target_link_libraries(rnllama PRIVATE + "-framework Accelerate" + "-framework Foundation" + "-framework Metal" + "-framework MetalKit" +) + +# Set properties for framework +set_target_properties(rnllama PROPERTIES + MACOSX_FRAMEWORK_IDENTIFIER "com.rnllama" + MACOSX_FRAMEWORK_BUNDLE_VERSION 1.0.0 + MACOSX_FRAMEWORK_SHORT_VERSION_STRING 1.0.0 + FRAMEWORK TRUE + FRAMEWORK_VERSION 1.0.0 + VERSION 1.0.0 + PUBLIC_HEADER "${PUBLIC_HEADERS}" + XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC NO +) diff --git a/ios/RNLlama.h b/ios/RNLlama.h index f52638ef..e58cf614 100644 --- a/ios/RNLlama.h +++ b/ios/RNLlama.h @@ -1,5 +1,9 @@ #ifdef __cplusplus -#import "rn-llama.hpp" +#if RNLLAMA_BUILD_FROM_SOURCE +#import "rn-llama.h" +#else +#import +#endif #endif #import diff --git a/ios/RNLlamaContext.h b/ios/RNLlamaContext.h index 82bcccda..153715e4 100644 --- a/ios/RNLlamaContext.h +++ b/ios/RNLlamaContext.h @@ -1,8 +1,15 @@ #ifdef __cplusplus +#if RNLLAMA_BUILD_FROM_SOURCE #import "llama.h" #import "llama-impl.h" #import "ggml.h" -#import "rn-llama.hpp" +#import "rn-llama.h" +#else +#import +#import +#import +#import +#endif #endif diff --git a/ios/rnllama.xcframework/Info.plist b/ios/rnllama.xcframework/Info.plist new file mode 100644 index 00000000..91b8b440 --- /dev/null +++ b/ios/rnllama.xcframework/Info.plist @@ -0,0 +1,74 @@ + + + + + AvailableLibraries + + + LibraryIdentifier + ios-arm64 + LibraryPath + rnllama.framework + SupportedArchitectures + + arm64 + + SupportedPlatform + ios + + + LibraryIdentifier + ios-arm64_x86_64-simulator + LibraryPath + rnllama.framework + SupportedArchitectures + + arm64 + x86_64 + + SupportedPlatform + ios + SupportedPlatformVariant + simulator + + + LibraryIdentifier + tvos-arm64 + LibraryPath + rnllama.framework + SupportedArchitectures + + arm64 + + SupportedPlatform + tvos + + + LibraryIdentifier + tvos-arm64_x86_64-simulator + LibraryPath + rnllama.framework + SupportedArchitectures + + arm64 + x86_64 + + SupportedPlatform + tvos + SupportedPlatformVariant + simulator + + + + CFBundlePackageType + XFWK + XCFrameworkFormatVersion + 1.0 + CFBundleVersion + 1.0.0 + CFBundleShortVersionString + 1.0.0 + CFBundleIdentifier + com.rnllama + + diff --git a/llama-rn.podspec b/llama-rn.podspec index 596a6e08..b97a7dfa 100644 --- a/llama-rn.podspec +++ b/llama-rn.podspec @@ -23,8 +23,14 @@ Pod::Spec.new do |s| s.platforms = 
{ :ios => "13.0", :tvos => "13.0" } s.source = { :git => "https://github.com/mybigday/llama.rn.git", :tag => "#{s.version}" } - s.source_files = "ios/**/*.{h,m,mm}", "cpp/**/*.{h,cpp,hpp,c,m,mm}" - s.resources = "cpp/**/*.{metallib}" + if ENV["RNLLAMA_BUILD_FROM_SOURCE"] == "1" + s.source_files = "ios/**/*.{h,m,mm}", "cpp/**/*.{h,cpp,hpp,c,m,mm}" + s.resources = "cpp/**/*.{metallib}" + base_compiler_flags += " -DRNLLAMA_BUILD_FROM_SOURCE" + else + s.source_files = "ios/**/*.{h,m,mm}" + s.vendored_frameworks = "ios/rnllama.xcframework" + end s.dependency "React-Core" diff --git a/package.json b/package.json index f86e3a7c..ac1a80c2 100644 --- a/package.json +++ b/package.json @@ -33,10 +33,12 @@ "test": "jest", "typecheck": "tsc --noEmit", "lint": "eslint \"**/*.{js,ts,tsx}\"", - "prepack": "bob build", + "prepack": "./scripts/build-ios.sh && ./scripts/build-android.sh && bob build", "release": "release-it", "example": "yarn --cwd example", + "build:ios-frameworks": "./scripts/build-ios.sh", "build:ios": "cd example/ios && xcodebuild -workspace RNLlamaExample.xcworkspace -scheme RNLlamaExample -configuration Debug -sdk iphonesimulator CC=clang CPLUSPLUS=clang++ LD=clang LDPLUSPLUS=clang++ GCC_OPTIMIZATION_LEVEL=0 GCC_PRECOMPILE_PREFIX_HEADER=YES ASSETCATALOG_COMPILER_OPTIMIZATION=time DEBUG_INFORMATION_FORMAT=dwarf COMPILER_INDEX_STORE_ENABLE=NO", + "build:android-libs": "./scripts/build-android.sh", "build:android": "cd example/android && ./gradlew assembleDebug", "clean": "del-cli example/ios/build" }, diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index ca53a128..b6a983c4 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -227,12 +227,12 @@ patch -p0 -d ./cpp < ./scripts/ggml-quants.c.patch patch -p0 -d ./cpp < ./scripts/llama-mmap.cpp.patch if [ "$OS" = "Darwin" ]; then - # Build metallib (~1.4MB) + # Build metallib (~2.6MB) cd llama.cpp/ggml/src/ggml-metal xcrun --sdk iphoneos metal -c ggml-metal.metal -o ggml-metal.air xcrun --sdk iphoneos metallib ggml-metal.air -o ggml-llama.metallib rm ggml-metal.air - cp ./ggml-llama.metallib ../../../../cpp/ggml-llama.metallib + mv ./ggml-llama.metallib ../../../../cpp/ggml-llama.metallib cd - diff --git a/scripts/build-android.sh b/scripts/build-android.sh new file mode 100755 index 00000000..74048039 --- /dev/null +++ b/scripts/build-android.sh @@ -0,0 +1,40 @@ +#! 
/bin/bash + +NDK_VERSION=26.1.10909125 +CMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk/$NDK_VERSION/build/cmake/android.toolchain.cmake +ANDROID_PLATFORM=android-21 +CMAKE_BUILD_TYPE=Release + +cd android/src/main + +# Build the Android library (arm64-v8a) +cmake -DCMAKE_TOOLCHAIN_FILE=$CMAKE_TOOLCHAIN_FILE \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=$ANDROID_PLATFORM \ + -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE \ + -B build-arm64 + +cmake --build build-arm64 --config Release + +mkdir -p jniLibs/arm64-v8a + +# Copy the library to the example app +cp build-arm64/*.so jniLibs/arm64-v8a/ + +rm -rf build-arm64 + +# Build the Android library (x86_64) +cmake -DCMAKE_TOOLCHAIN_FILE=$CMAKE_TOOLCHAIN_FILE \ + -DANDROID_ABI=x86_64 \ + -DANDROID_PLATFORM=$ANDROID_PLATFORM \ + -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE \ + -B build-x86_64 + +cmake --build build-x86_64 --config Release + +mkdir -p jniLibs/x86_64 + +# Copy the library to the example app +cp build-x86_64/*.so jniLibs/x86_64/ + +rm -rf build-x86_64 diff --git a/scripts/build-ios.sh b/scripts/build-ios.sh new file mode 100755 index 00000000..acdffa82 --- /dev/null +++ b/scripts/build-ios.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +function cp_headers() { + mkdir -p ../ios/rnllama.xcframework/$1/rnllama.framework/Headers + cp ../cpp/*.h ../ios/rnllama.xcframework/$1/rnllama.framework/Headers/ +} + +function build_framework() { + # Parameters: + # $1: system_name (iOS/tvOS) + # $2: architectures + # $3: sysroot + # $4: output_path + # $5: build_dir + + cd $5 + + # Configure CMake + cmake ../ios \ + -GXcode \ + -DCMAKE_SYSTEM_NAME=$1 \ + -DCMAKE_OSX_ARCHITECTURES="$2" \ + -DCMAKE_OSX_SYSROOT=$3 \ + -DCMAKE_INSTALL_PREFIX=`pwd`/install \ + -DCMAKE_XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH=NO \ + -DCMAKE_IOS_INSTALL_COMBINED=YES + + # Build + cmake --build . --config Release + + # Setup framework directory + rm -rf ../ios/rnllama.xcframework/$4 + mkdir -p ../ios/rnllama.xcframework/$4 + mv Release-$3/rnllama.framework ../ios/rnllama.xcframework/$4/rnllama.framework + mkdir -p ../ios/rnllama.xcframework/$4/rnllama.framework/Headers + + # Copy headers and metallib + cp_headers $4 + + # TODO: May need to re-build metallib for tvOS + cp ../cpp/ggml-llama.metallib ../ios/rnllama.xcframework/$4/rnllama.framework/ggml-llama.metallib + + rm -rf ./* +} + +rm -rf build-ios +mkdir -p build-ios + +# Build iOS frameworks +build_framework "iOS" "arm64;x86_64" "iphonesimulator" "ios-arm64_x86_64-simulator" "build-ios" +build_framework "iOS" "arm64" "iphoneos" "ios-arm64" "build-ios" + +cd .. +rm -rf build-ios + +rm -rf build-tvos +mkdir -p build-tvos + +# Build tvOS frameworks +build_framework "tvOS" "arm64;x86_64" "appletvsimulator" "tvos-arm64_x86_64-simulator" "build-tvos" +build_framework "tvOS" "arm64" "appletvos" "tvos-arm64" "build-tvos" + +cd .. +rm -rf build-tvos
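Taken together, the two scripts above produce the pre-built artifacts that `prepack` now bundles. A rough usage sketch, assuming a macOS host with Xcode and CMake available and an Android SDK at `$ANDROID_HOME` with NDK 26.1.10909125 installed (the version hard-coded in `build-android.sh`):

```bash
# Build the iOS/tvOS frameworks into ios/rnllama.xcframework/<slice>/rnllama.framework
yarn build:ios-frameworks

# Build the Android shared libraries into android/src/main/jniLibs/{arm64-v8a,x86_64}
yarn build:android-libs
```

Consumers then opt into the pre-built binaries with `RNLLAMA_BUILD_FROM_SOURCE=0` (iOS) or `rnllamaBuildFromSource=false` (Android), as described in the README and example/README changes above.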