diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c165b281..ae4f8b8d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,7 +35,7 @@ jobs: - name: Run unit tests run: yarn test --maxWorkers=2 --coverage - build-ios: + build-ios-from-source: runs-on: macos-latest steps: - name: Checkout @@ -69,7 +69,55 @@ jobs: run: | yarn build:ios - build-android: + build-ios-frameworks: + runs-on: macos-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Setup + uses: ./.github/actions/setup + + - name: Cache build & pods + uses: actions/cache@v3 + with: + path: | + example/ios/Pods + example/ios/build + key: ${{ runner.os }}-pods-${{ hashFiles('example/ios/Podfile.lock') }} + restore-keys: | + ${{ runner.os }}-pods- + + - name: Upgrade CocoaPods to version 1.15.2 + run: | + gem uninstall cocoapods --ignore-dependencies + gem install cocoapods -v 1.15.2 + + - name: Install cocoapods + run: | + yarn example pods + env: + NO_FLIPPER: 1 + RNLLAMA_BUILD_FROM_SOURCE: 0 + + - name: Build example for iOS + run: | + yarn build:ios + + build-android-from-source: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Setup + uses: ./.github/actions/setup + + - name: Build example for Android + run: | + yarn build:android + + build-android-libs: runs-on: ubuntu-latest steps: - name: Checkout @@ -80,4 +128,5 @@ jobs: - name: Build example for Android run: | + sed -i 's/rnllamaBuildFromSource=true/rnllamaBuildFromSource=false/g' example/android/gradle.properties yarn build:android diff --git a/.gitignore b/.gitignore index c5eda256..37724765 100644 --- a/.gitignore +++ b/.gitignore @@ -73,3 +73,11 @@ docs/API/.nojekyll *.metallib .xocde.env.local + +build-arm64/ +build-x86_64/ +jniLibs/ +build-ios/ +build-tvos/ + +ios/rnllama.xcframework/**/*.framework diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7716e3ac..bbdfb61d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -17,22 +17,6 @@ yarn bootstrap While developing, you can run the [example app](/example/) to test your changes. Any changes you make in your library's JavaScript code will be reflected in the example app without a rebuild. If you change any native code, then you'll need to rebuild the example app. -To start the packager: - -```sh -yarn example start -``` - -To run the example app on iOS: - -```sh -yarn example pods -yarn example ios -``` - -For test better performance on completion, you can run the app in Release mode: -- iOS: `yarn example ios --mode Release` - Make sure your code passes TypeScript and ESLint. Run the following to verify: ```sh diff --git a/README.md b/README.md index 2d293206..d0f1f100 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,8 @@ npm install llama.rn Please re-run `npx pod-install` again. +By default, `llama.rn` will use pre-built `rnllama.xcframework` for iOS. If you want to build from source, please set `RNLLAMA_BUILD_FROM_SOURCE` to `1` in your Podfile. + #### Android Add proguard rule if it's enabled in project (android/app/proguard-rules.pro): @@ -27,6 +29,8 @@ Add proguard rule if it's enabled in project (android/app/proguard-rules.pro): -keep class com.rnllama.** { *; } ``` +By default, `llama.rn` will use pre-built libraries for Android. If you want to build from source, please set `rnllamaBuildFromSource` to `true` in `android/gradle.properties`. + ## Obtain the model You can search HuggingFace for available models (Keyword: [`GGUF`](https://huggingface.co/search/full-text?q=GGUF&type=model)). 
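The CI jobs above exercise both build paths; as a rough local equivalent (a sketch assuming the example app layout used in this repo, not an exact CI reproduction), the same switches can be flipped like this:

```bash
# iOS: consume the pre-built rnllama.xcframework instead of compiling from source
# (the example Podfile reads RNLLAMA_BUILD_FROM_SOURCE; see its change further below)
NO_FLIPPER=1 RNLLAMA_BUILD_FROM_SOURCE=0 yarn example pods
yarn build:ios

# Android: flip the Gradle property checked in android/build.gradle
# (the same sed invocation the build-android-libs CI job uses)
sed -i 's/rnllamaBuildFromSource=true/rnllamaBuildFromSource=false/g' \
  example/android/gradle.properties
yarn build:android
```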
@@ -118,29 +122,6 @@ Please visit the [Documentation](docs/API) for more details. You can also visit the [example](example) to see how to use it. -Run the example: - -```bash -yarn && yarn bootstrap - -# iOS -yarn example ios -# Use device -yarn example ios --device "" -# With release mode -yarn example ios --mode Release - -# Android -yarn example android -# With release mode -yarn example android --mode release -``` - -This example used [react-native-document-picker](https://github.com/rnmods/react-native-document-picker) for select model. - -- iOS: You can move the model to iOS Simulator, or iCloud for real device. -- Android: Selected file will be copied or downloaded to cache directory so it may be slow. - ## Grammar Sampling GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. diff --git a/android/build.gradle b/android/build.gradle index 7cb93616..fae5d670 100644 --- a/android/build.gradle +++ b/android/build.gradle @@ -54,9 +54,18 @@ android { } } } - externalNativeBuild { - cmake { - path = file('src/main/CMakeLists.txt') + def rnllamaBuildFromSource = project.properties["rnllamaBuildFromSource"] + if (rnllamaBuildFromSource == "true") { + externalNativeBuild { + cmake { + path = file('src/main/CMakeLists.txt') + } + } + // Exclude jniLibs + sourceSets { + main { + jniLibs.srcDirs = [] + } } } buildTypes { diff --git a/android/src/main/CMakeLists.txt b/android/src/main/CMakeLists.txt index f8739d68..2d1fc782 100644 --- a/android/src/main/CMakeLists.txt +++ b/android/src/main/CMakeLists.txt @@ -45,9 +45,9 @@ set( ${RNLLAMA_LIB_DIR}/unicode.cpp ${RNLLAMA_LIB_DIR}/sgemm.cpp ${RNLLAMA_LIB_DIR}/common.cpp - ${RNLLAMA_LIB_DIR}/rn-llama.hpp ${RNLLAMA_LIB_DIR}/amx/amx.cpp ${RNLLAMA_LIB_DIR}/amx/mmq.cpp + ${RNLLAMA_LIB_DIR}/rn-llama.cpp ${CMAKE_SOURCE_DIR}/jni-utils.h ${CMAKE_SOURCE_DIR}/jni.cpp ) diff --git a/android/src/main/jni.cpp b/android/src/main/jni.cpp index ddcc1f66..72a008ea 100644 --- a/android/src/main/jni.cpp +++ b/android/src/main/jni.cpp @@ -11,7 +11,7 @@ #include "llama.h" #include "llama-impl.h" #include "ggml.h" -#include "rn-llama.hpp" +#include "rn-llama.h" #include "jni-utils.h" #define UNUSED(x) (void)(x) diff --git a/cpp/README.md b/cpp/README.md index 454426b0..84098c08 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -1,4 +1,4 @@ # Note -- Only `rn-llama.hpp` is the specific file for this project, others are sync from [llama.cpp](https://github.com/ggerganov/llama.cpp). +- Only `rn-llama.h` and `rn-llama.cpp` are the specific files for this folder, others are sync from [llama.cpp](https://github.com/ggerganov/llama.cpp). - We can update the native source by using the [bootstrap](../scripts/bootstrap.sh) script. 
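Before the new `rn-llama.h`/`rn-llama.cpp` pair introduced below, here is a short sketch of the sync flow the note above refers to, assuming the existing `yarn bootstrap` wiring; everything in `cpp/` other than these two files is pulled in from llama.cpp:

```bash
# Install JS dependencies and re-sync the llama.cpp sources into cpp/
# (on macOS the bootstrap script also rebuilds cpp/ggml-llama.metallib;
#  see the scripts/bootstrap.sh change further down)
yarn && yarn bootstrap

# Only these two files are maintained by hand in this repository
ls cpp/rn-llama.h cpp/rn-llama.cpp
```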
diff --git a/cpp/rn-llama.cpp b/cpp/rn-llama.cpp new file mode 100644 index 00000000..24da3f17 --- /dev/null +++ b/cpp/rn-llama.cpp @@ -0,0 +1,658 @@ +#include "rn-llama.h" + +namespace rnllama { + +const std::vector kv_cache_types = { + LM_GGML_TYPE_F32, + LM_GGML_TYPE_F16, + LM_GGML_TYPE_BF16, + LM_GGML_TYPE_Q8_0, + LM_GGML_TYPE_Q4_0, + LM_GGML_TYPE_Q4_1, + LM_GGML_TYPE_IQ4_NL, + LM_GGML_TYPE_Q5_0, + LM_GGML_TYPE_Q5_1, +}; + +lm_ggml_type kv_cache_type_from_str(const std::string & s) { + for (const auto & type : kv_cache_types) { + if (lm_ggml_type_name(type) == s) { + return type; + } + } + throw std::runtime_error("Unsupported cache type: " + s); +} + +static void llama_batch_clear(llama_batch *batch) { + batch->n_tokens = 0; +} + +static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, std::vector seq_ids, bool logits) { + batch->token [batch->n_tokens] = id; + batch->pos [batch->n_tokens] = pos; + batch->n_seq_id[batch->n_tokens] = seq_ids.size(); + for (size_t i = 0; i < seq_ids.size(); i++) { + batch->seq_id[batch->n_tokens][i] = seq_ids[i]; + } + batch->logits [batch->n_tokens] = logits ? 1 : 0; + batch->n_tokens += 1; +} + +// NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp + +static void log(const char *level, const char *function, int line, + const char *format, ...) +{ + va_list args; + #if defined(__ANDROID__) + char prefix[256]; + snprintf(prefix, sizeof(prefix), "%s:%d %s", function, line, format); + + va_start(args, format); + android_LogPriority priority; + if (strcmp(level, "ERROR") == 0) { + priority = ANDROID_LOG_ERROR; + } else if (strcmp(level, "WARNING") == 0) { + priority = ANDROID_LOG_WARN; + } else if (strcmp(level, "INFO") == 0) { + priority = ANDROID_LOG_INFO; + } else { + priority = ANDROID_LOG_DEBUG; + } + __android_log_vprint(priority, "RNLlama", prefix, args); + va_end(args); + #else + printf("[%s] %s:%d ", level, function, line); + va_start(args, format); + vprintf(format, args); + va_end(args); + printf("\n"); + #endif +} + +#if RNLLAMA_VERBOSE != 1 +#define LOG_VERBOSE(MSG, ...) +#else +#define LOG_VERBOSE(MSG, ...) \ + do \ + { \ + if (rnllama_verbose) \ + { \ + log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \ + } \ + } while (0) +#endif + +#define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__) +#define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__) +#define LOG_INFO(MSG, ...) 
log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__) + +static size_t common_part(const std::vector &a, const std::vector &b) +{ + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) + { + } + return i; +} + +static bool ends_with(const std::string &str, const std::string &suffix) +{ + return str.size() >= suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); +} + +static size_t find_partial_stop_string(const std::string &stop, + const std::string &text) +{ + if (!text.empty() && !stop.empty()) + { + const char text_last_char = text.back(); + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) + { + if (stop[char_index] == text_last_char) + { + const std::string current_partial = stop.substr(0, char_index + 1); + if (ends_with(text, current_partial)) + { + return text.size() - char_index - 1; + } + } + } + } + return std::string::npos; +} + +// format incomplete utf-8 multibyte character for output +std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) +{ + std::string out = token == -1 ? "" : common_token_to_piece(ctx, token); + // if the size is 1 and first bit is 1, meaning it's a partial character + // (size > 1 meaning it's already a known token) + if (out.size() == 1 && (out[0] & 0x80) == 0x80) + { + std::stringstream ss; + ss << std::hex << (out[0] & 0xff); + std::string res(ss.str()); + out = "byte: \\x" + res; + } + return out; +} + +std::string tokens_to_str(llama_context *ctx, const std::vector::const_iterator begin, const std::vector::const_iterator end) +{ + std::string ret; + for (auto it = begin; it != end; ++it) + { + ret += common_token_to_piece(ctx, *it); + } + return ret; +} + +llama_rn_context::~llama_rn_context() { + if (ctx_sampling != nullptr) { + common_sampler_free(ctx_sampling); + } +} + +void llama_rn_context::rewind() { + is_interrupted = false; + params.antiprompt.clear(); + params.sampling.grammar.clear(); + num_prompt_tokens = 0; + num_tokens_predicted = 0; + generated_text = ""; + generated_text.reserve(params.n_ctx); + generated_token_probs.clear(); + truncated = false; + stopped_eos = false; + stopped_word = false; + stopped_limit = false; + stopping_word = ""; + incomplete = false; + n_remain = 0; + n_past = 0; + params.sampling.n_prev = n_ctx; +} + +bool llama_rn_context::initSampling() { + if (ctx_sampling != nullptr) { + common_sampler_free(ctx_sampling); + } + ctx_sampling = common_sampler_init(model, params.sampling); + return ctx_sampling != nullptr; +} + +bool llama_rn_context::loadModel(common_params ¶ms_) +{ + params = params_; + llama_init = common_init_from_params(params); + model = llama_init.model.get(); + ctx = llama_init.context.get(); + if (model == nullptr) + { + LOG_ERROR("unable to load model: %s", params_.model.c_str()); + return false; + } + n_ctx = llama_n_ctx(ctx); + + // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101 + // LOG_INFO("%s\n", common_params_get_system_info(params).c_str()); + + return true; +} + +bool llama_rn_context::validateModelChatTemplate() const { + const char * tmpl = llama_model_chat_template(model); + llama_chat_message chat[] = {{"user", "test"}}; + int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0); + return chat_res > 0; +} + +void llama_rn_context::truncatePrompt(std::vector &prompt_tokens) { + const int n_left = n_ctx - params.n_keep; + const int n_block_size = n_left / 2; + const int erased_blocks = 
(prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size; + + // Keep n_keep tokens at start of prompt (at most n_ctx - 4) + std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); + + new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); + + LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s, num_prompt_tokens: %d", + n_ctx, + params.n_keep, + n_left, + tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()).c_str(), + new_tokens.size() + ); + + truncated = true; + prompt_tokens = new_tokens; +} + +void llama_rn_context::loadPrompt() { + std::vector prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true); + num_prompt_tokens = prompt_tokens.size(); + + // LOG tokens + std::stringstream ss; + ss << "\n" << __func__ << ": prompt_tokens = "; + for (auto& token : prompt_tokens) { + ss << token << " "; + } + LOG_INFO("%s\n", ss.str().c_str()); + + if (params.n_keep < 0) + { + params.n_keep = (int)num_prompt_tokens; + } + params.n_keep = std::min(n_ctx - 4, params.n_keep); + + // if input prompt is too big, truncate like normal + if (num_prompt_tokens >= (size_t) n_ctx) + { + truncatePrompt(prompt_tokens); + num_prompt_tokens = prompt_tokens.size(); + + LM_GGML_ASSERT(num_prompt_tokens < (size_t) n_ctx); + } + // push the prompt into the sampling context (do not apply grammar) + for (auto & token : prompt_tokens) + { + common_sampler_accept(ctx_sampling, token, false); + } + + // compare the evaluated prompt with the new prompt + n_past = common_part(embd, prompt_tokens); + + embd = prompt_tokens; + if (n_past == num_prompt_tokens) + { + // we have to evaluate at least 1 token to generate logits. + n_past--; + } + + // since #3228 we now have to manually manage the KV cache + llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + + LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s", + n_past, + tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past).c_str(), + tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str() + ); + + has_next_token = true; +} + +void llama_rn_context::beginCompletion() { + // number of tokens to keep when resetting context + n_remain = params.n_predict; + llama_perf_context_reset(ctx); + is_predicting = true; +} + +completion_token_output llama_rn_context::nextToken() +{ + completion_token_output result; + result.tok = -1; + + if (embd.size() >= (size_t)params.n_ctx) + { + // Shift context + + const int n_left = n_past - params.n_keep - 1; + const int n_discard = n_left/2; + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + + for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++) + { + embd[i - n_discard] = embd[i]; + } + embd.resize(embd.size() - n_discard); + + n_past -= n_discard; + + LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s", + params.n_ctx, + params.n_keep, + n_left + ); + } + + bool tg = true; + while (n_past < embd.size()) + { + int n_eval = (int)embd.size() - n_past; + tg = n_eval == 1; + if (n_eval > params.n_batch) + { + n_eval = params.n_batch; + } + if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval))) + { + LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s", + n_eval, + n_past, + params.cpuparams.n_threads, + tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str() + ); + 
has_next_token = false; + return result; + } + n_past += n_eval; + + if(is_interrupted) { + LOG_INFO("Decoding Interrupted"); + embd.resize(n_past); + has_next_token = false; + return result; + } + } + + const llama_vocab* vocab = llama_model_get_vocab(model); + + if (params.n_predict == 0) + { + has_next_token = false; + result.tok = llama_vocab_eos(vocab); + return result; + } + + { + // out of user input, sample next token + std::vector candidates; + candidates.reserve(llama_vocab_n_tokens(vocab)); + + result.tok = common_sampler_sample(ctx_sampling, ctx, -1); + + llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling); + + const int32_t n_probs = params.sampling.n_probs; + + // deprecated + /*if (params.sampling.temp <= 0 && n_probs > 0) + { + // For llama_sample_token_greedy we need to sort candidates + llama_sampler_init_softmax(); + + }*/ + + + for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) + { + result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); + } + + common_sampler_accept(ctx_sampling, result.tok, true); + if (tg) { + num_tokens_predicted++; + } + } + + // add it to the context + embd.push_back(result.tok); + // decrement remaining sampling budget + --n_remain; + + if (!embd.empty() && embd.back() == llama_vocab_eos(vocab)) + { + // stopping_word = llama_token_to_piece(ctx, embd.back()); + has_next_token = false; + stopped_eos = true; + LOG_VERBOSE("eos token found", ""); + return result; + } + + has_next_token = params.n_predict == -1 || n_remain != 0; + return result; +} + +size_t llama_rn_context::findStoppingStrings(const std::string &text, const size_t last_token_size, + const stop_type type) +{ + size_t stop_pos = std::string::npos; + for (const std::string &word : params.antiprompt) + { + size_t pos; + if (type == STOP_FULL) + { + const size_t tmp = word.size() + last_token_size; + const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; + pos = text.find(word, from_pos); + } + else + { + pos = find_partial_stop_string(word, text); + } + if (pos != std::string::npos && + (stop_pos == std::string::npos || pos < stop_pos)) + { + if (type == STOP_FULL) + { + stopping_word = word; + stopped_word = true; + has_next_token = false; + } + stop_pos = pos; + } + } + return stop_pos; +} + +completion_token_output llama_rn_context::doCompletion() +{ + const completion_token_output token_with_probs = nextToken(); + + const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok); + generated_text += token_text; + + if (params.sampling.n_probs > 0) + { + generated_token_probs.push_back(token_with_probs); + } + + // check if there is incomplete UTF-8 character at the end + for (unsigned i = 1; i < 5 && i <= generated_text.size(); ++i) { + unsigned char c = generated_text[generated_text.size() - i]; + if ((c & 0xC0) == 0x80) { + // continuation byte: 10xxxxxx + continue; + } + if ((c & 0xE0) == 0xC0) { + // 2-byte character: 110xxxxx ... + incomplete = i < 2; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character: 1110xxxx ... + incomplete = i < 3; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character: 11110xxx ... 
+ incomplete = i < 4; + } + // else 1-byte character or invalid byte + break; + } + + if (incomplete && !has_next_token) + { + has_next_token = true; + n_remain++; + } + + if (!has_next_token && n_remain == 0) + { + stopped_limit = true; + } + + LOG_VERBOSE("next token, token: %s, token_text: %s, has_next_token: %d, n_remain: %d, num_tokens_predicted: %d, stopped_eos: %d, stopped_word: %d, stopped_limit: %d, stopping_word: %s", + common_token_to_piece(ctx, token_with_probs.tok), + tokens_to_output_formatted_string(ctx, token_with_probs.tok).c_str(), + has_next_token, + n_remain, + num_tokens_predicted, + stopped_eos, + stopped_word, + stopped_limit, + stopping_word.c_str() + ); + return token_with_probs; +} + +std::vector llama_rn_context::getEmbedding(common_params &embd_params) +{ + static const int n_embd = llama_model_n_embd(llama_get_model(ctx)); + if (!embd_params.embedding) + { + LOG_WARNING("embedding disabled, embedding: %s", embd_params.embedding); + return std::vector(n_embd, 0.0f); + } + float *data; + + const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); + printf("pooling_type: %d\n", pooling_type); + if (pooling_type == LLAMA_POOLING_TYPE_NONE) { + data = llama_get_embeddings(ctx); + } else { + data = llama_get_embeddings_seq(ctx, 0); + } + + if (!data) { + return std::vector(n_embd, 0.0f); + } + std::vector embedding(data, data + n_embd), out(data, data + n_embd); + common_embd_normalize(embedding.data(), out.data(), n_embd, embd_params.embd_normalize); + return out; +} + +std::string llama_rn_context::bench(int pp, int tg, int pl, int nr) +{ + if (is_predicting) { + LOG_ERROR("cannot benchmark while predicting", ""); + return std::string("[]"); + } + + is_predicting = true; + + double pp_avg = 0; + double tg_avg = 0; + + double pp_std = 0; + double tg_std = 0; + + // TODO: move batch into llama_rn_context (related https://github.com/mybigday/llama.rn/issues/30) + llama_batch batch = llama_batch_init( + std::min(pp, params.n_ubatch), // max n_tokens is limited by n_ubatch + 0, // No embeddings + 1 // Single sequence + ); + + for (int i = 0; i < nr; i++) + { + llama_batch_clear(&batch); + + const int n_tokens = pp; + + for (int i = 0; i < n_tokens; i++) + { + llama_batch_add(&batch, 0, i, {0}, false); + } + batch.logits[batch.n_tokens - 1] = 1; // true + + llama_kv_cache_clear(ctx); + + const int64_t t_pp_start = llama_time_us(); + if (llama_decode(ctx, batch) != 0) + { + LOG_ERROR("llama_decode() failed during prompt", ""); + } + const int64_t t_pp_end = llama_time_us(); + llama_kv_cache_clear(ctx); + + if (is_interrupted) break; + + const int64_t t_tg_start = llama_time_us(); + + for (int i = 0; i < tg; i++) + { + llama_batch_clear(&batch); + + for (int j = 0; j < pl; j++) + { + llama_batch_add(&batch, 0, i, {j}, true); + } + + if (llama_decode(ctx, batch) != 0) + { + LOG_ERROR("llama_decode() failed during text generation", ""); + } + if (is_interrupted) break; + } + + const int64_t t_tg_end = llama_time_us(); + + llama_kv_cache_clear(ctx); + + const double t_pp = (t_pp_end - t_pp_start) / 1000000.0; + const double t_tg = (t_tg_end - t_tg_start) / 1000000.0; + + const double speed_pp = pp / t_pp; + const double speed_tg = (pl * tg) / t_tg; + + pp_avg += speed_pp; + tg_avg += speed_tg; + + pp_std += speed_pp * speed_pp; + tg_std += speed_tg * speed_tg; + } + + pp_avg /= nr; + tg_avg /= nr; + + if (nr > 1) { + pp_std = sqrt(pp_std / (nr - 1) - pp_avg * pp_avg * nr / (nr - 1)); + tg_std = sqrt(tg_std / (nr - 1) - tg_avg * tg_avg * nr / (nr - 1)); + } else { 
+ pp_std = 0; + tg_std = 0; + } + + if (is_interrupted) llama_kv_cache_clear(ctx); + is_predicting = false; + + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + return std::string("[\"") + model_desc + std::string("\",") + + std::to_string(llama_model_size(model)) + std::string(",") + + std::to_string(llama_model_n_params(model)) + std::string(",") + + std::to_string(pp_avg) + std::string(",") + + std::to_string(pp_std) + std::string(",") + + std::to_string(tg_avg) + std::string(",") + + std::to_string(tg_std) + + std::string("]"); +} + +int llama_rn_context::applyLoraAdapters(std::vector lora) { + for (auto &la : lora) { + la.ptr = llama_adapter_lora_init(model, la.path.c_str()); + if (la.ptr == nullptr) { + LOG_ERROR("failed to apply lora adapter '%s'\n", la.path.c_str()); + return -1; + } + } + this->lora = lora; + common_set_adapter_lora(ctx, lora); + return 0; +} + +void llama_rn_context::removeLoraAdapters() { + this->lora.clear(); + common_set_adapter_lora(ctx, this->lora); // apply empty list +} + +std::vector llama_rn_context::getLoadedLoraAdapters() { + return this->lora; +} + +} diff --git a/cpp/rn-llama.h b/cpp/rn-llama.h new file mode 100644 index 00000000..0e98998a --- /dev/null +++ b/cpp/rn-llama.h @@ -0,0 +1,119 @@ +#ifndef RNLLAMA_H +#define RNLLAMA_H + +#include +#include +#include "common.h" +#include "ggml.h" +#include "gguf.h" +#include "llama.h" +#include "llama-impl.h" +#include "sampling.h" +#if defined(__ANDROID__) +#include +#endif + +namespace rnllama { + +std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token); + +std::string tokens_to_str(llama_context *ctx, const std::vector::const_iterator begin, const std::vector::const_iterator end); + +lm_ggml_type kv_cache_type_from_str(const std::string & s); + +enum stop_type +{ + STOP_FULL, + STOP_PARTIAL, +}; + +// completion token output with probabilities +struct completion_token_output +{ + struct token_prob + { + llama_token tok; + float prob; + }; + + std::vector probs; + llama_token tok; +}; + +// Main context class +struct llama_rn_context { + bool is_predicting = false; + bool is_interrupted = false; + bool has_next_token = false; + std::string generated_text; + std::vector generated_token_probs; + + size_t num_prompt_tokens = 0; + size_t num_tokens_predicted = 0; + size_t n_past = 0; + size_t n_remain = 0; + + std::vector embd; + common_params params; + common_init_result llama_init; + + llama_model *model = nullptr; + float loading_progress = 0; + bool is_load_interrupted = false; + + llama_context *ctx = nullptr; + common_sampler *ctx_sampling = nullptr; + + int n_ctx; + + bool truncated = false; + bool stopped_eos = false; + bool stopped_word = false; + bool stopped_limit = false; + std::string stopping_word; + bool incomplete = false; + + std::vector lora; + + ~llama_rn_context(); + + void rewind(); + bool initSampling(); + bool loadModel(common_params ¶ms_); + bool validateModelChatTemplate() const; + void truncatePrompt(std::vector &prompt_tokens); + void loadPrompt(); + void beginCompletion(); + completion_token_output nextToken(); + size_t findStoppingStrings(const std::string &text, const size_t last_token_size, const stop_type type); + completion_token_output doCompletion(); + std::vector getEmbedding(common_params &embd_params); + std::string bench(int pp, int tg, int pl, int nr); + int applyLoraAdapters(std::vector lora); + void removeLoraAdapters(); + std::vector getLoadedLoraAdapters(); +};\ + +// Logging macros +extern 
bool rnllama_verbose; + +#if RNLLAMA_VERBOSE != 1 +#define LOG_VERBOSE(MSG, ...) +#else +#define LOG_VERBOSE(MSG, ...) \ + do \ + { \ + if (rnllama_verbose) \ + { \ + log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \ + } \ + } while (0) +#endif + +#define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__) +#define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__) +#define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__) + +} // namespace rnllama + +#endif /* RNLLAMA_H */ diff --git a/cpp/rn-llama.hpp b/cpp/rn-llama.hpp deleted file mode 100644 index a386cf1e..00000000 --- a/cpp/rn-llama.hpp +++ /dev/null @@ -1,737 +0,0 @@ -#ifndef RNLLAMA_H -#define RNLLAMA_H - -#include -#include -#include "common.h" -#include "ggml.h" -#include "gguf.h" -#include "llama.h" -#include "llama-impl.h" -#include "sampling.h" -#if defined(__ANDROID__) -#include -#endif - -namespace rnllama { - -const std::vector kv_cache_types = { - LM_GGML_TYPE_F32, - LM_GGML_TYPE_F16, - LM_GGML_TYPE_BF16, - LM_GGML_TYPE_Q8_0, - LM_GGML_TYPE_Q4_0, - LM_GGML_TYPE_Q4_1, - LM_GGML_TYPE_IQ4_NL, - LM_GGML_TYPE_Q5_0, - LM_GGML_TYPE_Q5_1, -}; - -static lm_ggml_type kv_cache_type_from_str(const std::string & s) { - for (const auto & type : kv_cache_types) { - if (lm_ggml_type_name(type) == s) { - return type; - } - } - throw std::runtime_error("Unsupported cache type: " + s); -} - -static void llama_batch_clear(llama_batch *batch) { - batch->n_tokens = 0; -} - -static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, std::vector seq_ids, bool logits) { - batch->token [batch->n_tokens] = id; - batch->pos [batch->n_tokens] = pos; - batch->n_seq_id[batch->n_tokens] = seq_ids.size(); - for (size_t i = 0; i < seq_ids.size(); i++) { - batch->seq_id[batch->n_tokens][i] = seq_ids[i]; - } - batch->logits [batch->n_tokens] = logits ? 1 : 0; - batch->n_tokens += 1; -} - -// NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp - -static void log(const char *level, const char *function, int line, - const char *format, ...) -{ - va_list args; - #if defined(__ANDROID__) - char prefix[256]; - snprintf(prefix, sizeof(prefix), "%s:%d %s", function, line, format); - - va_start(args, format); - android_LogPriority priority; - if (strcmp(level, "ERROR") == 0) { - priority = ANDROID_LOG_ERROR; - } else if (strcmp(level, "WARNING") == 0) { - priority = ANDROID_LOG_WARN; - } else if (strcmp(level, "INFO") == 0) { - priority = ANDROID_LOG_INFO; - } else { - priority = ANDROID_LOG_DEBUG; - } - __android_log_vprint(priority, "RNLlama", prefix, args); - va_end(args); - #else - printf("[%s] %s:%d ", level, function, line); - va_start(args, format); - vprintf(format, args); - va_end(args); - printf("\n"); - #endif -} -static bool rnllama_verbose = false; - -#if RNLLAMA_VERBOSE != 1 -#define LOG_VERBOSE(MSG, ...) -#else -#define LOG_VERBOSE(MSG, ...) \ - do \ - { \ - if (rnllama_verbose) \ - { \ - log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \ - } \ - } while (0) -#endif - -#define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__) -#define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__) -#define LOG_INFO(MSG, ...) 
log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__) - -enum stop_type -{ - STOP_FULL, - STOP_PARTIAL, -}; - -// completion token output with probabilities -struct completion_token_output -{ - struct token_prob - { - llama_token tok; - float prob; - }; - - std::vector probs; - llama_token tok; -}; - -static size_t common_part(const std::vector &a, const std::vector &b) -{ - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) - { - } - return i; -} - -static bool ends_with(const std::string &str, const std::string &suffix) -{ - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} - -static size_t find_partial_stop_string(const std::string &stop, - const std::string &text) -{ - if (!text.empty() && !stop.empty()) - { - const char text_last_char = text.back(); - for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) - { - if (stop[char_index] == text_last_char) - { - const std::string current_partial = stop.substr(0, char_index + 1); - if (ends_with(text, current_partial)) - { - return text.size() - char_index - 1; - } - } - } - } - return std::string::npos; -} - -// format incomplete utf-8 multibyte character for output -static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) -{ - std::string out = token == -1 ? "" : common_token_to_piece(ctx, token); - // if the size is 1 and first bit is 1, meaning it's a partial character - // (size > 1 meaning it's already a known token) - if (out.size() == 1 && (out[0] & 0x80) == 0x80) - { - std::stringstream ss; - ss << std::hex << (out[0] & 0xff); - std::string res(ss.str()); - out = "byte: \\x" + res; - } - return out; -} - -template -static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) -{ - std::string ret; - for (; begin != end; ++begin) - { - ret += common_token_to_piece(ctx, *begin); - } - return ret; -} - -struct llama_rn_context -{ - bool is_predicting = false; - bool is_interrupted = false; - bool has_next_token = false; - std::string generated_text; - std::vector generated_token_probs; - - size_t num_prompt_tokens = 0; - size_t num_tokens_predicted = 0; - size_t n_past = 0; - size_t n_remain = 0; - - std::vector embd; - - common_params params; - - common_init_result llama_init; - - llama_model *model = nullptr; - float loading_progress = 0; - bool is_load_interrupted = false; - - llama_context *ctx = nullptr; - common_sampler *ctx_sampling = nullptr; - - int n_ctx; - - bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; - std::string stopping_word; - bool incomplete = false; - - std::vector lora; - - ~llama_rn_context() - { - if (ctx_sampling != nullptr) - { - common_sampler_free(ctx_sampling); - } - } - - void rewind() - { - is_interrupted = false; - params.antiprompt.clear(); - params.sampling.grammar.clear(); - num_prompt_tokens = 0; - num_tokens_predicted = 0; - generated_text = ""; - generated_text.reserve(params.n_ctx); - generated_token_probs.clear(); - truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; - stopping_word = ""; - incomplete = false; - n_remain = 0; - n_past = 0; - params.sampling.n_prev = n_ctx; - } - - bool initSampling() { - if (ctx_sampling != nullptr) { - common_sampler_free(ctx_sampling); - } - ctx_sampling = common_sampler_init(model, params.sampling); - return ctx_sampling != nullptr; - } - - bool loadModel(common_params ¶ms_) - { - params = params_; 
- llama_init = common_init_from_params(params); - model = llama_init.model.get(); - ctx = llama_init.context.get(); - if (model == nullptr) - { - LOG_ERROR("unable to load model: %s", params_.model.c_str()); - return false; - } - n_ctx = llama_n_ctx(ctx); - - // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101 - // LOG_INFO("%s\n", common_params_get_system_info(params).c_str()); - - return true; - } - - bool validateModelChatTemplate() const { - const char * tmpl = llama_model_chat_template(model); - llama_chat_message chat[] = {{"user", "test"}}; - int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0); - return chat_res > 0; - } - - void truncatePrompt(std::vector &prompt_tokens) { - const int n_left = n_ctx - params.n_keep; - const int n_block_size = n_left / 2; - const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size; - - // Keep n_keep tokens at start of prompt (at most n_ctx - 4) - std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep); - - new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); - - LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s, num_prompt_tokens: %d", - n_ctx, - params.n_keep, - n_left, - tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()).c_str(), - new_tokens.size() - ); - - truncated = true; - prompt_tokens = new_tokens; - } - - void loadPrompt() - { - std::vector prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true); - num_prompt_tokens = prompt_tokens.size(); - - // LOG tokens - std::stringstream ss; - ss << "\n" << __func__ << ": prompt_tokens = "; - for (auto& token : prompt_tokens) { - ss << token << " "; - } - LOG_INFO("%s\n", ss.str().c_str()); - - if (params.n_keep < 0) - { - params.n_keep = (int)num_prompt_tokens; - } - params.n_keep = std::min(n_ctx - 4, params.n_keep); - - // if input prompt is too big, truncate like normal - if (num_prompt_tokens >= (size_t) n_ctx) - { - truncatePrompt(prompt_tokens); - num_prompt_tokens = prompt_tokens.size(); - - LM_GGML_ASSERT(num_prompt_tokens < (size_t) n_ctx); - } - // push the prompt into the sampling context (do not apply grammar) - for (auto & token : prompt_tokens) - { - common_sampler_accept(ctx_sampling, token, false); - } - - // compare the evaluated prompt with the new prompt - n_past = common_part(embd, prompt_tokens); - - embd = prompt_tokens; - if (n_past == num_prompt_tokens) - { - // we have to evaluate at least 1 token to generate logits. 
- n_past--; - } - - // since #3228 we now have to manually manage the KV cache - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); - - LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s", - n_past, - tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past).c_str(), - tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str() - ); - - has_next_token = true; - } - - void beginCompletion() - { - // number of tokens to keep when resetting context - n_remain = params.n_predict; - llama_perf_context_reset(ctx); - is_predicting = true; - } - - completion_token_output nextToken() - { - completion_token_output result; - result.tok = -1; - - if (embd.size() >= (size_t)params.n_ctx) - { - // Shift context - - const int n_left = n_past - params.n_keep - 1; - const int n_discard = n_left/2; - - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); - - for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++) - { - embd[i - n_discard] = embd[i]; - } - embd.resize(embd.size() - n_discard); - - n_past -= n_discard; - - LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s", - params.n_ctx, - params.n_keep, - n_left - ); - } - - bool tg = true; - while (n_past < embd.size()) - { - int n_eval = (int)embd.size() - n_past; - tg = n_eval == 1; - if (n_eval > params.n_batch) - { - n_eval = params.n_batch; - } - if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval))) - { - LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s", - n_eval, - n_past, - params.cpuparams.n_threads, - tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str() - ); - has_next_token = false; - return result; - } - n_past += n_eval; - - if(is_interrupted) { - LOG_INFO("Decoding Interrupted"); - embd.resize(n_past); - has_next_token = false; - return result; - } - } - - const llama_vocab* vocab = llama_model_get_vocab(model); - - if (params.n_predict == 0) - { - has_next_token = false; - result.tok = llama_vocab_eos(vocab); - return result; - } - - { - // out of user input, sample next token - std::vector candidates; - candidates.reserve(llama_vocab_n_tokens(vocab)); - - result.tok = common_sampler_sample(ctx_sampling, ctx, -1); - - llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling); - - const int32_t n_probs = params.sampling.n_probs; - - // deprecated - /*if (params.sampling.temp <= 0 && n_probs > 0) - { - // For llama_sample_token_greedy we need to sort candidates - llama_sampler_init_softmax(); - - }*/ - - - for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) - { - result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); - } - - common_sampler_accept(ctx_sampling, result.tok, true); - if (tg) { - num_tokens_predicted++; - } - } - - // add it to the context - embd.push_back(result.tok); - // decrement remaining sampling budget - --n_remain; - - if (!embd.empty() && embd.back() == llama_vocab_eos(vocab)) - { - // stopping_word = llama_token_to_piece(ctx, embd.back()); - has_next_token = false; - stopped_eos = true; - LOG_VERBOSE("eos token found", ""); - return result; - } - - has_next_token = params.n_predict == -1 || n_remain != 0; - return result; - } - - size_t findStoppingStrings(const std::string &text, const size_t last_token_size, - const stop_type type) - { - size_t stop_pos = std::string::npos; - for (const std::string &word : params.antiprompt) - { - size_t pos; - if (type == 
STOP_FULL) - { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; - pos = text.find(word, from_pos); - } - else - { - pos = find_partial_stop_string(word, text); - } - if (pos != std::string::npos && - (stop_pos == std::string::npos || pos < stop_pos)) - { - if (type == STOP_FULL) - { - stopping_word = word; - stopped_word = true; - has_next_token = false; - } - stop_pos = pos; - } - } - return stop_pos; - } - - completion_token_output doCompletion() - { - const completion_token_output token_with_probs = nextToken(); - - const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok); - generated_text += token_text; - - if (params.sampling.n_probs > 0) - { - generated_token_probs.push_back(token_with_probs); - } - - // check if there is incomplete UTF-8 character at the end - for (unsigned i = 1; i < 5 && i <= generated_text.size(); ++i) { - unsigned char c = generated_text[generated_text.size() - i]; - if ((c & 0xC0) == 0x80) { - // continuation byte: 10xxxxxx - continue; - } - if ((c & 0xE0) == 0xC0) { - // 2-byte character: 110xxxxx ... - incomplete = i < 2; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character: 1110xxxx ... - incomplete = i < 3; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character: 11110xxx ... - incomplete = i < 4; - } - // else 1-byte character or invalid byte - break; - } - - if (incomplete && !has_next_token) - { - has_next_token = true; - n_remain++; - } - - if (!has_next_token && n_remain == 0) - { - stopped_limit = true; - } - - LOG_VERBOSE("next token, token: %s, token_text: %s, has_next_token: %d, n_remain: %d, num_tokens_predicted: %d, stopped_eos: %d, stopped_word: %d, stopped_limit: %d, stopping_word: %s", - common_token_to_piece(ctx, token_with_probs.tok), - tokens_to_output_formatted_string(ctx, token_with_probs.tok).c_str(), - has_next_token, - n_remain, - num_tokens_predicted, - stopped_eos, - stopped_word, - stopped_limit, - stopping_word.c_str() - ); - return token_with_probs; - } - - std::vector getEmbedding(common_params &embd_params) - { - static const int n_embd = llama_model_n_embd(llama_get_model(ctx)); - if (!embd_params.embedding) - { - LOG_WARNING("embedding disabled, embedding: %s", embd_params.embedding); - return std::vector(n_embd, 0.0f); - } - float *data; - - const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - printf("pooling_type: %d\n", pooling_type); - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - data = llama_get_embeddings(ctx); - } else { - data = llama_get_embeddings_seq(ctx, 0); - } - - if (!data) { - return std::vector(n_embd, 0.0f); - } - std::vector embedding(data, data + n_embd), out(data, data + n_embd); - common_embd_normalize(embedding.data(), out.data(), n_embd, embd_params.embd_normalize); - return out; - } - - std::string bench(int pp, int tg, int pl, int nr) - { - if (is_predicting) { - LOG_ERROR("cannot benchmark while predicting", ""); - return std::string("[]"); - } - - is_predicting = true; - - double pp_avg = 0; - double tg_avg = 0; - - double pp_std = 0; - double tg_std = 0; - - // TODO: move batch into llama_rn_context (related https://github.com/mybigday/llama.rn/issues/30) - llama_batch batch = llama_batch_init( - std::min(pp, params.n_ubatch), // max n_tokens is limited by n_ubatch - 0, // No embeddings - 1 // Single sequence - ); - - for (int i = 0; i < nr; i++) - { - llama_batch_clear(&batch); - - const int n_tokens = pp; - - for (int i = 0; i < n_tokens; 
i++) - { - llama_batch_add(&batch, 0, i, {0}, false); - } - batch.logits[batch.n_tokens - 1] = 1; // true - - llama_kv_cache_clear(ctx); - - const int64_t t_pp_start = llama_time_us(); - if (llama_decode(ctx, batch) != 0) - { - LOG_ERROR("llama_decode() failed during prompt", ""); - } - const int64_t t_pp_end = llama_time_us(); - llama_kv_cache_clear(ctx); - - if (is_interrupted) break; - - const int64_t t_tg_start = llama_time_us(); - - for (int i = 0; i < tg; i++) - { - llama_batch_clear(&batch); - - for (int j = 0; j < pl; j++) - { - llama_batch_add(&batch, 0, i, {j}, true); - } - - if (llama_decode(ctx, batch) != 0) - { - LOG_ERROR("llama_decode() failed during text generation", ""); - } - if (is_interrupted) break; - } - - const int64_t t_tg_end = llama_time_us(); - - llama_kv_cache_clear(ctx); - - const double t_pp = (t_pp_end - t_pp_start) / 1000000.0; - const double t_tg = (t_tg_end - t_tg_start) / 1000000.0; - - const double speed_pp = pp / t_pp; - const double speed_tg = (pl * tg) / t_tg; - - pp_avg += speed_pp; - tg_avg += speed_tg; - - pp_std += speed_pp * speed_pp; - tg_std += speed_tg * speed_tg; - } - - pp_avg /= nr; - tg_avg /= nr; - - if (nr > 1) { - pp_std = sqrt(pp_std / (nr - 1) - pp_avg * pp_avg * nr / (nr - 1)); - tg_std = sqrt(tg_std / (nr - 1) - tg_avg * tg_avg * nr / (nr - 1)); - } else { - pp_std = 0; - tg_std = 0; - } - - if (is_interrupted) llama_kv_cache_clear(ctx); - is_predicting = false; - - char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); - return std::string("[\"") + model_desc + std::string("\",") + - std::to_string(llama_model_size(model)) + std::string(",") + - std::to_string(llama_model_n_params(model)) + std::string(",") + - std::to_string(pp_avg) + std::string(",") + - std::to_string(pp_std) + std::string(",") + - std::to_string(tg_avg) + std::string(",") + - std::to_string(tg_std) + - std::string("]"); - } - - int applyLoraAdapters(std::vector lora) { - for (auto &la : lora) { - la.ptr = llama_adapter_lora_init(model, la.path.c_str()); - if (la.ptr == nullptr) { - LOG_ERROR("failed to apply lora adapter '%s'\n", la.path.c_str()); - return -1; - } - } - this->lora = lora; - common_set_adapter_lora(ctx, lora); - return 0; - } - - void removeLoraAdapters() { - this->lora.clear(); - common_set_adapter_lora(ctx, this->lora); // apply empty list - } - - std::vector getLoadedLoraAdapters() { - return this->lora; - } -}; - -} - -#endif /* LLAMA_H */ diff --git a/example/README.md b/example/README.md new file mode 100644 index 00000000..0d9d6e10 --- /dev/null +++ b/example/README.md @@ -0,0 +1,63 @@ +# llama.rn example + +This is an example of how to use the llama.rn library. + +This example used [react-native-document-picker](https://github.com/rnmods/react-native-document-picker) for select model. + +- iOS: You can move the model to iOS Simulator, or iCloud for real device. +- Android: Selected file will be copied or downloaded to cache directory so it may be slow. + +## Requirements + +Please back to the root directory and run the following command: + +```bash +yarn && yarn bootstrap +``` + +## iOS + +1. Install pods + +```bash +yarn pods +``` + +2. Run the example + +```bash +yarn ios +# Use device +yarn ios --device "" +# With release mode +yarn ios --mode Release +``` + +## Android + +Run the example: +```bash +yarn android +# With release mode +yarn android --mode release +``` + +## Build with frameworks/libs + +This example is build llama.rn from source code by default, you can also build with frameworks/libs. 
+ +```bash +# Build iOS frameworks +yarn build:ios-frameworks +# Build Android libs +yarn build:android-libs +``` + +Then you can setup the environment variable / properties in your project: + +iOS: +```bash +RNLLAMA_BUILD_FROM_SOURCE=0 yarn pods +``` + +Android: Edit `android/gradle.properties` and set `rnllamaBuildFromSource` to `false`. diff --git a/example/android/gradle.properties b/example/android/gradle.properties index e1ddc51c..6947ba4b 100644 --- a/example/android/gradle.properties +++ b/example/android/gradle.properties @@ -42,3 +42,7 @@ newArchEnabled=true # Use this property to enable or disable the Hermes JS engine. # If set to false, you will be using JSC instead. hermesEnabled=true + +# Use this property to enable or disable the RNLLAMA build from source. +# If set to true, the RNLLAMA library will be built from source. +rnllamaBuildFromSource=true diff --git a/example/ios/.xcode.env.local b/example/ios/.xcode.env.local index 7793e02a..ed82726d 100644 --- a/example/ios/.xcode.env.local +++ b/example/ios/.xcode.env.local @@ -1 +1 @@ -export NODE_BINARY=/var/folders/4z/1d45cfts3936kdm7v9jl349r0000gn/T/yarn--1737601800519-0.4653948355776887/node +export NODE_BINARY=/var/folders/4z/1d45cfts3936kdm7v9jl349r0000gn/T/yarn--1737683137807-0.2592527908357527/node diff --git a/example/ios/Podfile b/example/ios/Podfile index 011b69e6..657296c3 100644 --- a/example/ios/Podfile +++ b/example/ios/Podfile @@ -18,6 +18,8 @@ end ENV['RCT_NEW_ARCH_ENABLED'] = '1' +ENV['RNLLAMA_BUILD_FROM_SOURCE'] = ENV['RNLLAMA_BUILD_FROM_SOURCE'] || '1' + target 'RNLlamaExample' do config = use_native_modules! diff --git a/example/ios/Podfile.lock b/example/ios/Podfile.lock index 78d05732..7efe8517 100644 --- a/example/ios/Podfile.lock +++ b/example/ios/Podfile.lock @@ -1270,7 +1270,7 @@ SPEC CHECKSUMS: glog: 04b94705f318337d7ead9e6d17c019bd9b1f6b1b hermes-engine: 10fbd3f62405c41ea07e71973ea61e1878d07322 libevent: 4049cae6c81cdb3654a443be001fb9bdceff7913 - llama-rn: 1510d60298c2a0dcec8ff2922b1689daea663d9e + llama-rn: c90e797ab8ba372d7c8c47b2d7703b0a6c45d048 RCT-Folly: 424b8c9a7a0b9ab2886ffe9c3b041ef628fd4fb1 RCTRequired: a2faf4bad4e438ca37b2040cb8f7799baa065c18 RCTTypeSafety: cb09f3e4747b6d18331a15eb05271de7441ca0b3 @@ -1314,6 +1314,6 @@ SPEC CHECKSUMS: SocketRocket: f32cd54efbe0f095c4d7594881e52619cfe80b17 Yoga: 8796b55dba14d7004f980b54bcc9833ee45b28ce -PODFILE CHECKSUM: cf6cc1e14840d7bc13c75e90df0f55023f1d4ad2 +PODFILE CHECKSUM: 65536592e23bc5ecf357f53e2deda450e33f1c90 COCOAPODS: 1.15.2 diff --git a/ios/CMakeLists.txt b/ios/CMakeLists.txt new file mode 100644 index 00000000..eb52f746 --- /dev/null +++ b/ios/CMakeLists.txt @@ -0,0 +1,99 @@ +cmake_minimum_required(VERSION 3.16) +project(rnllama VERSION 1.0.0 LANGUAGES CXX C) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# iOS specific settings +set(CMAKE_OSX_DEPLOYMENT_TARGET 13.0) +set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE NO) + +# Dependencies and compile options +add_definitions( + -DNDEBUG + -DO3 + -DLM_GGML_USE_CPU + -DLM_GGML_USE_ACCELERATE + -DLM_GGML_USE_METAL +) + +set(SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../cpp) + +# Define public headers +set(PUBLIC_HEADERS + ${SOURCE_DIR}/rn-llama.h + ${SOURCE_DIR}/llama.h + ${SOURCE_DIR}/llama-impl.h + ${SOURCE_DIR}/ggml.h +) + +# Create library target +add_library(rnllama SHARED + ${SOURCE_DIR}/ggml.c + ${SOURCE_DIR}/ggml-alloc.c + ${SOURCE_DIR}/ggml-backend.cpp + ${SOURCE_DIR}/ggml-backend-reg.cpp + ${SOURCE_DIR}/ggml-cpu.c + ${SOURCE_DIR}/ggml-cpu.cpp + 
${SOURCE_DIR}/ggml-cpu-aarch64.cpp + ${SOURCE_DIR}/ggml-cpu-quants.c + ${SOURCE_DIR}/ggml-cpu-traits.cpp + ${SOURCE_DIR}/ggml-metal.m + ${SOURCE_DIR}/ggml-opt.cpp + ${SOURCE_DIR}/ggml-threading.cpp + ${SOURCE_DIR}/ggml-quants.c + ${SOURCE_DIR}/gguf.cpp + ${SOURCE_DIR}/log.cpp + ${SOURCE_DIR}/llama-impl.cpp + ${SOURCE_DIR}/llama-grammar.cpp + ${SOURCE_DIR}/llama-sampling.cpp + ${SOURCE_DIR}/llama-vocab.cpp + ${SOURCE_DIR}/llama-adapter.cpp + ${SOURCE_DIR}/llama-chat.cpp + ${SOURCE_DIR}/llama-context.cpp + ${SOURCE_DIR}/llama-kv-cache.cpp + ${SOURCE_DIR}/llama-arch.cpp + ${SOURCE_DIR}/llama-batch.cpp + ${SOURCE_DIR}/llama-cparams.cpp + ${SOURCE_DIR}/llama-hparams.cpp + ${SOURCE_DIR}/llama.cpp + ${SOURCE_DIR}/llama-model.cpp + ${SOURCE_DIR}/llama-model-loader.cpp + ${SOURCE_DIR}/llama-mmap.cpp + ${SOURCE_DIR}/llama-vocab.cpp + ${SOURCE_DIR}/sampling.cpp + ${SOURCE_DIR}/unicode-data.cpp + ${SOURCE_DIR}/unicode.cpp + ${SOURCE_DIR}/sgemm.cpp + ${SOURCE_DIR}/common.cpp + ${SOURCE_DIR}/amx/amx.cpp + ${SOURCE_DIR}/amx/mmq.cpp + ${SOURCE_DIR}/rn-llama.cpp +) + +# Setup include directories +target_include_directories(rnllama + PUBLIC + $ + $ +) + +# Link required frameworks +target_link_libraries(rnllama PRIVATE + "-framework Accelerate" + "-framework Foundation" + "-framework Metal" + "-framework MetalKit" +) + +# Set properties for framework +set_target_properties(rnllama PROPERTIES + MACOSX_FRAMEWORK_IDENTIFIER "com.rnllama" + MACOSX_FRAMEWORK_BUNDLE_VERSION 1.0.0 + MACOSX_FRAMEWORK_SHORT_VERSION_STRING 1.0.0 + FRAMEWORK TRUE + FRAMEWORK_VERSION 1.0.0 + VERSION 1.0.0 + PUBLIC_HEADER "${PUBLIC_HEADERS}" + XCODE_ATTRIBUTE_CLANG_ENABLE_OBJC_ARC NO +) diff --git a/ios/RNLlama.h b/ios/RNLlama.h index f52638ef..e58cf614 100644 --- a/ios/RNLlama.h +++ b/ios/RNLlama.h @@ -1,5 +1,9 @@ #ifdef __cplusplus -#import "rn-llama.hpp" +#if RNLLAMA_BUILD_FROM_SOURCE +#import "rn-llama.h" +#else +#import +#endif #endif #import diff --git a/ios/RNLlamaContext.h b/ios/RNLlamaContext.h index 82bcccda..153715e4 100644 --- a/ios/RNLlamaContext.h +++ b/ios/RNLlamaContext.h @@ -1,8 +1,15 @@ #ifdef __cplusplus +#if RNLLAMA_BUILD_FROM_SOURCE #import "llama.h" #import "llama-impl.h" #import "ggml.h" -#import "rn-llama.hpp" +#import "rn-llama.h" +#else +#import +#import +#import +#import +#endif #endif diff --git a/ios/rnllama.xcframework/Info.plist b/ios/rnllama.xcframework/Info.plist new file mode 100644 index 00000000..91b8b440 --- /dev/null +++ b/ios/rnllama.xcframework/Info.plist @@ -0,0 +1,74 @@ + + + + + AvailableLibraries + + + LibraryIdentifier + ios-arm64 + LibraryPath + rnllama.framework + SupportedArchitectures + + arm64 + + SupportedPlatform + ios + + + LibraryIdentifier + ios-arm64_x86_64-simulator + LibraryPath + rnllama.framework + SupportedArchitectures + + arm64 + x86_64 + + SupportedPlatform + ios + SupportedPlatformVariant + simulator + + + LibraryIdentifier + tvos-arm64 + LibraryPath + rnllama.framework + SupportedArchitectures + + arm64 + + SupportedPlatform + tvos + + + LibraryIdentifier + tvos-arm64_x86_64-simulator + LibraryPath + rnllama.framework + SupportedArchitectures + + arm64 + x86_64 + + SupportedPlatform + tvos + SupportedPlatformVariant + simulator + + + + CFBundlePackageType + XFWK + XCFrameworkFormatVersion + 1.0 + CFBundleVersion + 1.0.0 + CFBundleShortVersionString + 1.0.0 + CFBundleIdentifier + com.rnllama + + diff --git a/llama-rn.podspec b/llama-rn.podspec index 596a6e08..b97a7dfa 100644 --- a/llama-rn.podspec +++ b/llama-rn.podspec @@ -23,8 +23,14 @@ Pod::Spec.new do |s| s.platforms = 
{ :ios => "13.0", :tvos => "13.0" } s.source = { :git => "https://github.com/mybigday/llama.rn.git", :tag => "#{s.version}" } - s.source_files = "ios/**/*.{h,m,mm}", "cpp/**/*.{h,cpp,hpp,c,m,mm}" - s.resources = "cpp/**/*.{metallib}" + if ENV["RNLLAMA_BUILD_FROM_SOURCE"] == "1" + s.source_files = "ios/**/*.{h,m,mm}", "cpp/**/*.{h,cpp,hpp,c,m,mm}" + s.resources = "cpp/**/*.{metallib}" + base_compiler_flags += " -DRNLLAMA_BUILD_FROM_SOURCE" + else + s.source_files = "ios/**/*.{h,m,mm}" + s.vendored_frameworks = "ios/rnllama.xcframework" + end s.dependency "React-Core" diff --git a/package.json b/package.json index f86e3a7c..ac1a80c2 100644 --- a/package.json +++ b/package.json @@ -33,10 +33,12 @@ "test": "jest", "typecheck": "tsc --noEmit", "lint": "eslint \"**/*.{js,ts,tsx}\"", - "prepack": "bob build", + "prepack": "./scripts/build-ios.sh && ./scripts/build-android.sh && bob build", "release": "release-it", "example": "yarn --cwd example", + "build:ios-frameworks": "./scripts/build-ios.sh", "build:ios": "cd example/ios && xcodebuild -workspace RNLlamaExample.xcworkspace -scheme RNLlamaExample -configuration Debug -sdk iphonesimulator CC=clang CPLUSPLUS=clang++ LD=clang LDPLUSPLUS=clang++ GCC_OPTIMIZATION_LEVEL=0 GCC_PRECOMPILE_PREFIX_HEADER=YES ASSETCATALOG_COMPILER_OPTIMIZATION=time DEBUG_INFORMATION_FORMAT=dwarf COMPILER_INDEX_STORE_ENABLE=NO", + "build:android-libs": "./scripts/build-android.sh", "build:android": "cd example/android && ./gradlew assembleDebug", "clean": "del-cli example/ios/build" }, diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index ca53a128..b6a983c4 100755 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -227,12 +227,12 @@ patch -p0 -d ./cpp < ./scripts/ggml-quants.c.patch patch -p0 -d ./cpp < ./scripts/llama-mmap.cpp.patch if [ "$OS" = "Darwin" ]; then - # Build metallib (~1.4MB) + # Build metallib (~2.6MB) cd llama.cpp/ggml/src/ggml-metal xcrun --sdk iphoneos metal -c ggml-metal.metal -o ggml-metal.air xcrun --sdk iphoneos metallib ggml-metal.air -o ggml-llama.metallib rm ggml-metal.air - cp ./ggml-llama.metallib ../../../../cpp/ggml-llama.metallib + mv ./ggml-llama.metallib ../../../../cpp/ggml-llama.metallib cd - diff --git a/scripts/build-android.sh b/scripts/build-android.sh new file mode 100755 index 00000000..74048039 --- /dev/null +++ b/scripts/build-android.sh @@ -0,0 +1,40 @@ +#! 
/bin/bash + +NDK_VERSION=26.1.10909125 +CMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/ndk/$NDK_VERSION/build/cmake/android.toolchain.cmake +ANDROID_PLATFORM=android-21 +CMAKE_BUILD_TYPE=Release + +cd android/src/main + +# Build the Android library (arm64-v8a) +cmake -DCMAKE_TOOLCHAIN_FILE=$CMAKE_TOOLCHAIN_FILE \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=$ANDROID_PLATFORM \ + -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE \ + -B build-arm64 + +cmake --build build-arm64 --config Release + +mkdir -p jniLibs/arm64-v8a + +# Copy the library to the example app +cp build-arm64/*.so jniLibs/arm64-v8a/ + +rm -rf build-arm64 + +# Build the Android library (x86_64) +cmake -DCMAKE_TOOLCHAIN_FILE=$CMAKE_TOOLCHAIN_FILE \ + -DANDROID_ABI=x86_64 \ + -DANDROID_PLATFORM=$ANDROID_PLATFORM \ + -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE \ + -B build-x86_64 + +cmake --build build-x86_64 --config Release + +mkdir -p jniLibs/x86_64 + +# Copy the library to the example app +cp build-x86_64/*.so jniLibs/x86_64/ + +rm -rf build-x86_64 diff --git a/scripts/build-ios.sh b/scripts/build-ios.sh new file mode 100755 index 00000000..acdffa82 --- /dev/null +++ b/scripts/build-ios.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +function cp_headers() { + mkdir -p ../ios/rnllama.xcframework/$1/rnllama.framework/Headers + cp ../cpp/*.h ../ios/rnllama.xcframework/$1/rnllama.framework/Headers/ +} + +function build_framework() { + # Parameters: + # $1: system_name (iOS/tvOS) + # $2: architectures + # $3: sysroot + # $4: output_path + # $5: build_dir + + cd $5 + + # Configure CMake + cmake ../ios \ + -GXcode \ + -DCMAKE_SYSTEM_NAME=$1 \ + -DCMAKE_OSX_ARCHITECTURES="$2" \ + -DCMAKE_OSX_SYSROOT=$3 \ + -DCMAKE_INSTALL_PREFIX=`pwd`/install \ + -DCMAKE_XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH=NO \ + -DCMAKE_IOS_INSTALL_COMBINED=YES + + # Build + cmake --build . --config Release + + # Setup framework directory + rm -rf ../ios/rnllama.xcframework/$4 + mkdir -p ../ios/rnllama.xcframework/$4 + mv Release-$3/rnllama.framework ../ios/rnllama.xcframework/$4/rnllama.framework + mkdir -p ../ios/rnllama.xcframework/$4/rnllama.framework/Headers + + # Copy headers and metallib + cp_headers $4 + + # TODO: May need to re-build metallib for tvOS + cp ../cpp/ggml-llama.metallib ../ios/rnllama.xcframework/$4/rnllama.framework/ggml-llama.metallib + + rm -rf ./* +} + +rm -rf build-ios +mkdir -p build-ios + +# Build iOS frameworks +build_framework "iOS" "arm64;x86_64" "iphonesimulator" "ios-arm64_x86_64-simulator" "build-ios" +build_framework "iOS" "arm64" "iphoneos" "ios-arm64" "build-ios" + +cd .. +rm -rf build-ios + +rm -rf build-tvos +mkdir -p build-tvos + +# Build tvOS frameworks +build_framework "tvOS" "arm64;x86_64" "appletvsimulator" "tvos-arm64_x86_64-simulator" "build-tvos" +build_framework "tvOS" "arm64" "appletvos" "tvos-arm64" "build-tvos" + +cd .. +rm -rf build-tvos
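Taken together, the two scripts above produce the pre-built artifacts that `prepack` now bundles. A rough usage sketch, assuming a macOS host with Xcode and CMake available and an Android SDK at `$ANDROID_HOME` with NDK 26.1.10909125 installed (the version hard-coded in `build-android.sh`):

```bash
# Build the iOS/tvOS frameworks into ios/rnllama.xcframework/<slice>/rnllama.framework
yarn build:ios-frameworks

# Build the Android shared libraries into android/src/main/jniLibs/{arm64-v8a,x86_64}
yarn build:android-libs
```

Consumers then opt into the pre-built binaries with `RNLLAMA_BUILD_FROM_SOURCE=0` (iOS) or `rnllamaBuildFromSource=false` (Android), as described in the README and example/README changes above.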