From 6262e240b5d4c01f9d8fb2a5f9002f30f0f4ceb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81ngel=20Gonz=C3=A1lez=20Santamarta?=
Date: Tue, 7 Jan 2025 12:46:49 +0100
Subject: [PATCH] Qwen2-VL support added

---
 llama_bringup/models/Qwen2-VL.yaml    | 18 +++++
 llama_ros/include/llama_ros/llama.hpp |  7 +-
 llama_ros/include/llava_ros/llava.hpp |  3 +
 llama_ros/src/llama_ros/llama.cpp     | 86 +++++++++++++-----------
 llama_ros/src/llava_ros/llava.cpp     | 97 +++++++++++++++++++++++++--
 5 files changed, 161 insertions(+), 50 deletions(-)
 create mode 100644 llama_bringup/models/Qwen2-VL.yaml

diff --git a/llama_bringup/models/Qwen2-VL.yaml b/llama_bringup/models/Qwen2-VL.yaml
new file mode 100644
index 0000000..6fb7355
--- /dev/null
+++ b/llama_bringup/models/Qwen2-VL.yaml
@@ -0,0 +1,18 @@
+use_llava: True
+
+n_ctx: 8192
+n_batch: 512
+n_gpu_layers: 15
+n_threads: -1
+n_predict: 8192
+
+model_repo: "bartowski/Qwen2-VL-2B-Instruct-GGUF"
+model_filename: "Qwen2-VL-2B-Instruct-IQ2_M.gguf"
+
+mmproj_repo: "bartowski/Qwen2-VL-2B-Instruct-GGUF"
+mmproj_filename: "mmproj-Qwen2-VL-2B-Instruct-f16.gguf"
+
+image_prefix: "<|vision_start|>"
+image_suffix: "<|vision_end|>"
+
+system_prompt_type: "ChatML"
diff --git a/llama_ros/include/llama_ros/llama.hpp b/llama_ros/include/llama_ros/llama.hpp
index 2526be2..31a34c3 100644
--- a/llama_ros/include/llama_ros/llama.hpp
+++ b/llama_ros/include/llama_ros/llama.hpp
@@ -169,14 +169,15 @@ using GenerateResponseCallback = std::function<void(struct CompletionOutput)>;
 class Llama {
 
 public:
-  Llama(const struct common_params &params, std::string system_prompt = "");
+  Llama(const struct common_params &params, std::string system_prompt = "",
+        bool initial_reset = true);
   virtual ~Llama();
 
   std::vector<llama_token> tokenize(const std::string &text, bool add_bos,
                                     bool special = false);
   std::string detokenize(const std::vector<llama_token> &tokens);
 
-  void reset();
+  virtual void reset();
   void cancel();
 
   std::string format_chat_prompt(std::vector<struct common_chat_msg> chat_msgs,
@@ -266,7 +267,7 @@ class Llama {
   virtual bool eval_prompt();
   bool eval_prompt(std::vector<llama_token> prompt_tokens);
   bool eval_token(llama_token token);
-  bool eval(std::vector<llama_token> tokens);
+  virtual bool eval(std::vector<llama_token> tokens);
   bool eval(struct llama_batch batch);
 
   std::vector<struct TokenProb> get_probs();
diff --git a/llama_ros/include/llava_ros/llava.hpp b/llama_ros/include/llava_ros/llava.hpp
index c6f4146..e3d078f 100644
--- a/llama_ros/include/llava_ros/llava.hpp
+++ b/llama_ros/include/llava_ros/llava.hpp
@@ -50,6 +50,7 @@ class Llava : public llama_ros::Llama {
         const struct LlavaParams &llava_params,
         std::string system_prompt = "");
   ~Llava();
+  void reset() override;
 
   bool load_image(std::string base64_str);
   struct llava_image_embed *
   base64_image_to_embed(const std::string &base64_str);
@@ -59,6 +60,7 @@ class Llava : public llama_ros::Llama {
                     bool add_sfx) override;
   bool eval_image(struct llava_image_embed *image_embed);
   bool eval_prompt();
+  bool eval(std::vector<llama_token> tokens) override;
 
   struct llava_image_embed *image_embed;
   struct clip_ctx *ctx_clip;
@@ -67,6 +69,7 @@ class Llava : public llama_ros::Llama {
 private:
   void free_image();
   int image_pose;
+  int st_pos_id;
 };
 
 } // namespace llava_ros
diff --git a/llama_ros/src/llama_ros/llama.cpp b/llama_ros/src/llama_ros/llama.cpp
index e4ea27b..6b8f0ba 100644
--- a/llama_ros/src/llama_ros/llama.cpp
+++ b/llama_ros/src/llama_ros/llama.cpp
@@ -34,7 +34,8 @@
 
 using namespace llama_ros;
 
-Llama::Llama(const struct common_params &params, std::string system_prompt)
+Llama::Llama(const struct common_params &params, std::string system_prompt,
+             bool initial_reset)
     : params(params),
       system_prompt(system_prompt) {
 
   print_build_info();
@@ -100,7 +101,9 @@ Llama::Llama(const struct common_params &params, std::string system_prompt)
   }
 
   // set inital values
-  this->reset();
+  if (initial_reset) {
+    this->reset();
+  }
 
   // show info
   LLAMA_LOG_INFO("llama.cpp: build = %d, commit = %s", LLAMA_BUILD_NUMBER,
@@ -148,6 +151,38 @@ Llama::~Llama() {
   this->threadpool_batch = nullptr;
 }
 
+/*
+*****************************
+*           RESET           *
+*          CANCEL           *
+*****************************
+*/
+void Llama::reset() {
+
+  llama_kv_cache_clear(this->ctx);
+
+  if (this->sampler != nullptr) {
+    common_sampler_reset(this->sampler);
+  }
+
+  this->canceled = false;
+  this->n_past = 0;
+  this->n_consumed = 0;
+  this->ga_i = 0;
+
+  this->prompt_tokens.clear();
+
+  // load system prompt
+  if (!this->eval_system_prompt()) {
+    LLAMA_LOG_ERROR("Failed to eval system prompt");
+  }
+
+  // number of tokens to keep when resetting context
+  if (this->params.n_keep < 0) {
+    this->params.n_keep = (int)this->prompt_tokens.size();
+  }
+}
+
 /*
 *****************************
 *          METADATA         *
 *****************************
 */
@@ -339,38 +374,6 @@ struct Metadata Llama::get_metadata() {
   return metadata;
 }
 
-/*
-*****************************
-*           RESET           *
-*          CANCEL           *
-*****************************
-*/
-void Llama::reset() {
-
-  llama_kv_cache_clear(this->ctx);
-
-  if (this->sampler != nullptr) {
-    common_sampler_reset(this->sampler);
-  }
-
-  this->canceled = false;
-  this->n_past = 0;
-  this->n_consumed = 0;
-  this->ga_i = 0;
-
-  this->prompt_tokens.clear();
-
-  // load system prompt
-  if (!this->eval_system_prompt()) {
-    LLAMA_LOG_ERROR("Failed to eval system prompt");
-  }
-
-  // number of tokens to keep when resetting context
-  if (this->params.n_keep < 0) {
-    this->params.n_keep = (int)this->prompt_tokens.size();
-  }
-}
-
 /*
 *****************************
 *          TOKENIZE         *
 *****************************
 */
@@ -911,6 +914,7 @@ bool Llama::eval_prompt() { return this->eval_prompt(this->prompt_tokens); }
 bool Llama::eval_prompt(std::vector<llama_token> prompt_tokens) {
 
   std::vector<llama_token> batch;
+  batch.reserve(this->params.n_batch);
 
   while (((int)prompt_tokens.size() > this->n_consumed)) {
 
@@ -941,13 +945,13 @@ bool Llama::eval(std::vector<llama_token> tokens) {
 
   // create batch
   struct llama_batch batch = {
-      int32_t(tokens.size()),
-      tokens.data(),
-      nullptr,
-      nullptr,
-      nullptr,
-      nullptr,
-      nullptr,
+      int32_t(tokens.size()), // n_tokens
+      tokens.data(),          // tokens
+      nullptr,                // embd
+      nullptr,                // pos
+      nullptr,                // n_seq_id
+      nullptr,                // seq_id
+      nullptr,                // logits
   };
 
   return this->eval(batch);
diff --git a/llama_ros/src/llava_ros/llava.cpp b/llama_ros/src/llava_ros/llava.cpp
index 97e4bbb..2ed7055 100644
--- a/llama_ros/src/llava_ros/llava.cpp
+++ b/llama_ros/src/llava_ros/llava.cpp
@@ -34,19 +34,32 @@ using namespace llava_ros;
 
 Llava::Llava(const struct common_params &params,
              const struct LlavaParams &llava_params, std::string system_prompt)
-    : llama_ros::Llama(params, system_prompt), llava_params(llava_params) {
+    : llama_ros::Llama(params, system_prompt, false),
+      llava_params(llava_params), image_pose(0), st_pos_id(-1) {
 
   // load clip model
   const char *clip_path = this->params.mmproj.c_str();
   this->ctx_clip = clip_model_load(clip_path, 1);
 
   this->image_embed = nullptr;
+
+  // set initial values
+  this->reset();
 }
 
 Llava::~Llava() {
+  this->image_pose = 0;
+  this->st_pos_id = -1;
   clip_free(this->ctx_clip);
   this->free_image();
 }
 
+void Llava::reset() {
+  this->image_pose = 0;
+  this->st_pos_id = -1;
+  this->free_image();
+  Llama::reset();
+}
+
 /*
 *****************************
 *         LOAD IMAGE        *
 *****************************
 */
@@ -150,13 +163,40 @@ bool Llava::eval_image(struct llava_image_embed *image_embed) {
   int n_embd = this->get_n_embd();
   bool succ = true;
 
-  for (int i = 0; i < image_embed->n_image_pos; i += this->params.n_batch) {
+  // for qwen2-vl
+  auto img_tokens = image_embed->n_image_pos;
+
+  std::vector<llama_pos> mrope_pos;
+  mrope_pos.resize(img_tokens * 4);
+
+  std::vector<llama_pos> batch_mrope_pos;
+  batch_mrope_pos.resize(img_tokens * 4);
 
-    int n_eval = image_embed->n_image_pos - i;
+  // fill mrope if qwen2-vl
+  if (clip_is_qwen2vl(this->ctx_clip)) {
+    auto image_size = clip_get_load_image_size(this->ctx_clip);
+    const int patch_size = 14 * 2;
 
-    if (n_eval > this->params.n_batch) {
-      n_eval = this->params.n_batch;
+    const int ph =
+        image_size->height / patch_size + (image_size->height % patch_size > 0);
+    const int pw =
+        image_size->width / patch_size + (image_size->width % patch_size > 0);
+
+    for (int y = 0; y < ph; y++) {
+      for (int x = 0; x < pw; x++) {
+        int i = y * pw + x;
+        mrope_pos[i] = this->st_pos_id;
+        mrope_pos[i + img_tokens] = this->st_pos_id + y;
+        mrope_pos[i + img_tokens * 2] = this->st_pos_id + x;
+        mrope_pos[i + img_tokens * 3] = 0;
+      }
     }
+    this->st_pos_id += std::max(pw, ph);
+  }
+
+  for (int i = 0; i < image_embed->n_image_pos; i += this->params.n_batch) {
+
+    int n_eval = std::min(this->params.n_batch, image_embed->n_image_pos - i);
 
     struct llama_batch batch = {
         int32_t(n_eval),  // n_tokens
@@ -168,7 +208,19 @@ bool Llava::eval_image(struct llava_image_embed *image_embed) {
         nullptr           // logits
     };
 
-    if (!this->eval(batch)) {
+    if (clip_is_qwen2vl(this->ctx_clip)) {
+      std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0);
+      memcpy(batch_mrope_pos.data(), &mrope_pos[i], n_eval * sizeof(llama_pos));
+      memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + i],
+             n_eval * sizeof(llama_pos));
+      memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + i],
+             n_eval * sizeof(llama_pos));
+      memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + i],
+             n_eval * sizeof(llama_pos));
+      batch.pos = batch_mrope_pos.data();
+    }
+
+    if (!Llama::eval(batch)) {
       LLAMA_LOG_ERROR("Failed in image eval");
       succ = false;
       break;
@@ -212,3 +264,36 @@ bool Llava::eval_prompt() {
 
   return true;
 }
+
+bool Llava::eval(std::vector<llama_token> tokens) {
+
+  std::vector<llama_pos> pos;
+
+  // create batch
+  struct llama_batch batch = {
+      int32_t(tokens.size()), // n_tokens
+      tokens.data(),          // tokens
+      nullptr,                // embd
+      nullptr,                // pos
+      nullptr,                // n_seq_id
+      nullptr,                // seq_id
+      nullptr,                // logits
+  };
+
+  if (clip_is_qwen2vl(this->ctx_clip)) {
+    pos.resize(batch.n_tokens * 4);
+    std::fill(pos.begin(), pos.end(), 0);
+    for (int j = 0; j < batch.n_tokens * 3; j++) {
+      pos[j] = this->st_pos_id + (j % batch.n_tokens);
+    }
+    batch.pos = pos.data();
+  }
+
+  if (!Llama::eval(batch)) {
+    return false;
+  }
+
+  this->st_pos_id += batch.n_tokens;
+
+  return true;
+}
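
Note on the M-RoPE handling above (explanatory only, not part of the diff): Qwen2-VL replaces the single scalar position per token with M-RoPE, so every token carries four position components (temporal, row, column, and one component left unused here). eval_image() fills one flat buffer with those components stored as four consecutive planes of img_tokens entries each, and the new Llava::eval() keeps advancing st_pos_id so the text that follows an image continues from max(pw, ph). The sketch below reproduces only that index arithmetic with plain ints and made-up image dimensions; it does not call llama.cpp or clip, and the variable names mirror the patch purely for readability.

// Standalone sketch of the M-RoPE position layout built in eval_image().
// Assumptions: a 56x42 pixel image and the 14*2 merged patch size; the real
// code uses llama_pos and the size reported by clip_get_load_image_size().
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const int width = 56, height = 42;
  const int patch_size = 14 * 2;
  int st_pos_id = 10; // position reached by the text evaluated so far

  // round up to whole patches, as in the patch above
  const int pw = width / patch_size + (width % patch_size > 0);
  const int ph = height / patch_size + (height % patch_size > 0);
  const int img_tokens = pw * ph;

  // four planes: [temporal | row | column | unused]
  std::vector<int> mrope_pos(img_tokens * 4, 0);
  for (int y = 0; y < ph; y++) {
    for (int x = 0; x < pw; x++) {
      const int i = y * pw + x;
      mrope_pos[i] = st_pos_id;                      // temporal: same for all patches
      mrope_pos[i + img_tokens] = st_pos_id + y;     // row index
      mrope_pos[i + img_tokens * 2] = st_pos_id + x; // column index
      mrope_pos[i + img_tokens * 3] = 0;             // unused component
    }
  }

  // text generated after the image continues from the larger grid side
  st_pos_id += std::max(pw, ph);

  for (int i = 0; i < img_tokens; i++) {
    std::printf("token %d: t=%d y=%d x=%d\n", i, mrope_pos[i],
                mrope_pos[i + img_tokens], mrope_pos[i + img_tokens * 2]);
  }
  std::printf("next st_pos_id = %d\n", st_pos_id);
  return 0;
}

On the initial_reset flag added to the Llama constructor: it appears to exist so that the derived Llava can run its overridden reset() only after the CLIP model has been loaded, since a base-class constructor would only ever dispatch to Llama::reset().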