FlexLLM (part 4) #107

Draft · wants to merge 7 commits into base: inference
5 changes: 4 additions & 1 deletion .gitmodules
@@ -16,4 +16,7 @@
[submodule "deps/tokenizers-cpp"]
path = deps/tokenizers-cpp
url = https://github.com/mlc-ai/tokenizers-cpp.git
fetchRecurseSubmodules = true
[submodule "deps/flashinfer"]
path = deps/flashinfer
url = https://github.com/flashinfer-ai/flashinfer.git
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -197,6 +197,9 @@ include(variant)
# optional
include(optional)

# flashinfer
list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/flashinfer/include)
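# flashinfer is consumed header-only here: only its include dir is added,
# no separate build target.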

if (FF_GPU_BACKEND STREQUAL "cuda")
list(APPEND FF_CC_FLAGS
-DFF_USE_CUDA)
@@ -220,7 +223,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND FF_CC_FLAGS
-DFF_DEBUG)
list(APPEND FF_NVCC_FLAGS
-DFF_DEBUG)
-DFF_DEBUG -lineinfo)
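# -lineinfo embeds source-line info in device code for profilers/debuggers
# (e.g. Nsight Compute, cuda-gdb) without disabling optimizations.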
endif()

message(STATUS "FlexFlow MAX_DIM: ${FF_MAX_DIM}")
1 change: 1 addition & 0 deletions deps/flashinfer
Submodule flashinfer added at be6bf5
203 changes: 203 additions & 0 deletions include/flexflow/attention_config.h
@@ -0,0 +1,203 @@
/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef _FLEXFLOW_ATTENTION_CONFIG_H_
#define _FLEXFLOW_ATTENTION_CONFIG_H_
#include "flexflow/batch_config.h"

namespace FlexFlow {

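// Tokens stored per page of the paged KV cache.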
constexpr uint32_t kPagesize = 64;

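// Pages needed to hold num_elements tokens (ceiling division).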
inline int round_up_pages(int const num_elements) {
return (num_elements + kPagesize - 1) / kPagesize;
}

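// Dispatch a runtime head_dim to the compile-time constant HEAD_DIM so the
// macro body can instantiate head-dim-templated kernels (64, 128, or 256).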
#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...) \
switch (head_dim) { \
case 64: { \
constexpr size_t HEAD_DIM = 64; \
__VA_ARGS__ \
break; \
} \
case 128: { \
constexpr size_t HEAD_DIM = 128; \
__VA_ARGS__ \
break; \
} \
case 256: { \
constexpr size_t HEAD_DIM = 256; \
__VA_ARGS__ \
break; \
} \
default: { \
std::ostringstream err_msg; \
err_msg << "Unsupported head_dim: " << head_dim; \
throw std::invalid_argument(err_msg.str()); \
} \
}
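
// Usage sketch (the kernel name below is illustrative, not part of this PR):
//   DISPATCH_HEADDIM(head_dim, HEAD_DIM,
//                    { run_paged_attention<HEAD_DIM>(args); });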

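// Sizes and binds the device-side buffers consumed by the paged-attention
// handlers: per-request index arrays into the paged KV cache, an optional
// custom mask, and kernel workspaces.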
class AttentionMetaData {
public:
AttentionMetaData() {
num_q_heads_ = 0;
num_kv_heads_ = 0;
head_dim_ = 0;
q_indptr = nullptr;
kv_indptr = nullptr;
kv_indices = nullptr;
kv_last_page_len = nullptr;
qk_indptr = nullptr;
custom_mask = nullptr;
workspace = nullptr;
workspace_size = 0;
float_workspace = nullptr;
float_workspace_size = 0;
int_workspace = nullptr;
int_workspace_size = 0;
mem_size_ = 0;
enabled_ = false;
}
AttentionMetaData(AttentionMetaData const &rhs) {
num_q_heads_ = rhs.num_q_heads_;
num_kv_heads_ = rhs.num_kv_heads_;
head_dim_ = rhs.head_dim_;
q_indptr = rhs.q_indptr;
kv_indptr = rhs.kv_indptr;
kv_indices = rhs.kv_indices;
kv_last_page_len = rhs.kv_last_page_len;
qk_indptr = rhs.qk_indptr;
custom_mask = rhs.custom_mask;
workspace = rhs.workspace;
workspace_size = rhs.workspace_size;
float_workspace = rhs.float_workspace;
float_workspace_size = rhs.float_workspace_size;
int_workspace = rhs.int_workspace;
int_workspace_size = rhs.int_workspace_size;
mem_size_ = rhs.mem_size_;
enabled_ = rhs.enabled_;
decode_handler_collections = rhs.decode_handler_collections;
prompt_handler_collections = rhs.prompt_handler_collections;
}

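// Total bytes to reserve: index arrays + custom mask + float/int workspaces,
// rounded up to 16-byte alignment. Computed once, then cached in mem_size_.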
size_t mem_size() {
if (mem_size_ > 0) {
return mem_size_;
}
size_t batch_size = BatchConfig::max_requests_per_batch();
size_t max_num_pages = round_up_pages(BatchConfig::max_sequence_length());
size_t indices_size = std::max(
(batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
size_t custom_mask_size = 0;

float_workspace_size = 128 * 1024 * 1024; // 128 MB
int_workspace_size = 8 * 1024 * 1024; // 8 MB
workspace_size =
float_workspace_size + int_workspace_size; // float + int workspace

mem_size_ = alignTo(sizeof(int32_t) * indices_size +
sizeof(uint8_t) * custom_mask_size + workspace_size,
16);
return mem_size_;
}

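// Carve the individual buffers out of one pre-allocated region of at least
// mem_size() bytes; passing nullptr resets every pointer instead.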
void assign_address(void *ptr, size_t size) {
if (ptr == nullptr) {
q_indptr = nullptr;
kv_indptr = nullptr;
kv_indices = nullptr;
kv_last_page_len = nullptr;
qk_indptr = nullptr;
custom_mask = nullptr;
workspace = nullptr;
float_workspace = nullptr;
int_workspace = nullptr;
return;
}
assert(size >= mem_size() &&
"Insufficient memory size for attention metadata");
size_t batch_size = BatchConfig::max_requests_per_batch();
size_t max_num_pages = round_up_pages(BatchConfig::max_sequence_length());
size_t indices_size = std::max(
(batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
size_t custom_mask_size = 0;

q_indptr = static_cast<int32_t *>(ptr);
kv_indptr = q_indptr + batch_size + 1;
kv_indices = kv_indptr + batch_size + 1;
kv_last_page_len = kv_indices + max_num_pages * batch_size;
qk_indptr = kv_last_page_len + batch_size + 1;
custom_mask = static_cast<uint8_t *>(ptr) + sizeof(int32_t) * indices_size;
workspace = static_cast<void *>(static_cast<uint8_t *>(ptr) +
sizeof(int32_t) * indices_size +
sizeof(uint8_t) * custom_mask_size);
float_workspace = workspace;
int_workspace = static_cast<void *>(static_cast<uint8_t *>(workspace) +
float_workspace_size);
}

void set_num_q_heads(uint32_t const num_q_heads) {
num_q_heads_ = num_q_heads;
}
void set_num_kv_heads(uint32_t const num_kv_heads) {
num_kv_heads_ = num_kv_heads;
}
void set_head_dim(uint32_t const head_dim) {
head_dim_ = head_dim;
}
uint32_t num_q_heads() const {
return num_q_heads_;
}
uint32_t num_kv_heads() const {
return num_kv_heads_;
}
uint32_t head_dim() const {
return head_dim_;
}

void set_enabled(bool const enabled) {
enabled_ = enabled;
}
bool enabled() const {
return enabled_;
}

uint32_t num_q_heads_;
uint32_t num_kv_heads_;
uint32_t head_dim_;

int32_t *q_indptr;
int32_t *kv_indptr;
int32_t *kv_indices;
int32_t *kv_last_page_len;
int32_t *qk_indptr;
uint8_t *custom_mask;
void *workspace;
size_t workspace_size;
void *float_workspace;
size_t float_workspace_size;
void *int_workspace;
size_t int_workspace_size;

size_t mem_size_;

bool enabled_;
// batch size -> handler
std::unordered_map<int, void *> decode_handler_collections;
std::unordered_map<int, void *> prompt_handler_collections;
};
} // namespace FlexFlow

#endif // _FLEXFLOW_ATTENTION_CONFIG_H_
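
For context, a minimal sketch of how this metadata class appears intended to be used (size first, then bind a region). The head counts are hypothetical, the buffer below is host memory purely for illustration (the runtime binds a GPU region instead), and linking needs the FlexFlow runtime for the `BatchConfig` statics:

```cpp
#include "flexflow/attention_config.h"

#include <cstdlib>

int main() {
  FlexFlow::AttentionMetaData meta;
  meta.set_num_q_heads(32); // hypothetical model: 32 query heads and
  meta.set_num_kv_heads(8); // 8 KV heads (grouped-query attention)
  meta.set_head_dim(128);

  // Index arrays + custom mask + float/int workspaces, 16-byte aligned.
  size_t needed = meta.mem_size();

  // Illustration only: the runtime binds a device (GPU) region instead.
  void *buf = std::malloc(needed);
  meta.assign_address(buf, needed); // carves q_indptr, kv_indices, workspaces
  meta.set_enabled(true);

  // ... run attention kernels that consume the metadata ...

  meta.assign_address(nullptr, 0); // reset raw pointers before freeing
  std::free(buf);
  return 0;
}
```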
17 changes: 17 additions & 0 deletions include/flexflow/batch_config.h
@@ -29,6 +29,10 @@

namespace FlexFlow {

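// Round x up to the nearest multiple of y.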
inline int alignTo(int x, int y) {
return ((x + y - 1) / y) * y;
}

class InferenceResult;
class BeamInferenceResult;

@@ -71,6 +75,10 @@ class BatchConfig {
static int max_verify_tokens_per_batch();
static int max_spec_tree_token_num();
static int max_sequence_length();

// paged attention
static size_t max_kv_cache_size();

friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
void print() const;
void save_to_file(std::string const &filename) const;
@@ -111,6 +119,15 @@
int num_tokens_in_batch;
int max_length;

// paged attention
static constexpr size_t request_guid_size = sizeof(RequestGuid);
static constexpr size_t alignment = 16;
static constexpr size_t padding_size =
(alignment - (sizeof(int) * 3 + request_guid_size) % alignment) %
alignment;
static constexpr size_t padding_length = padding_size / sizeof(int);
int padding[padding_length] = {}; // Padding for memory pointer alignment
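// Example: with an 8-byte RequestGuid, padding_size =
// (16 - (12 + 8) % 16) % 16 = 12 bytes, i.e. padding_length = 3 ints.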

// request id in batch config:
int batch_config_request_id = -1;
bool prompt_phase = false;
9 changes: 9 additions & 0 deletions include/flexflow/config.h
@@ -16,6 +16,7 @@
#ifndef _FLEXFLOW_CONFIG_H_
#define _FLEXFLOW_CONFIG_H_
#include "ffconst.h"
#include "flexflow/attention_config.h"
#include "flexflow/batch_config.h"
#include "legion.h"
#include <cstring>
@@ -89,14 +90,21 @@ struct FFHandler {
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
cudnnHandle_t dnn;
cublasHandle_t blas;
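// Dedicated handles for PEFT (fine-tuning) kernels, separate from the
// inference handles above.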
cudnnHandle_t peft_dnn;
cublasHandle_t peft_blas;
#else
miopenHandle_t dnn;
hipblasHandle_t blas;
miopenHandle_t peft_dnn;
hipblasHandle_t peft_blas;
#endif
void *workSpace;
size_t workSpaceSize;
CombinedBatchConfigMetaStruct *batch_config_metadata;

// flashinfer
AttentionMetaData *incr_attention_metadata;

// request info + token info + topology mask info
size_t batch_config_metadata_size = sizeof(CombinedBatchConfigMetaStruct);
void *offload_reserve_space;
@@ -106,6 +114,7 @@
bool allowTensorOpMathConversion;
#ifdef FF_USE_NCCL
ncclComm_t ncclComm;
ncclComm_t ncclCommPeft;
#endif
};

9 changes: 9 additions & 0 deletions include/flexflow/flexflow_c.h
@@ -656,6 +656,11 @@ flexflow_perf_metrics_t

void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id);

void flexflow_model_set_num_transformer_layers(flexflow_model_t handle_,
                                               int num_layers);
void flexflow_model_set_num_kv_heads(flexflow_model_t handle_,
                                     int num_kv_heads);
void flexflow_model_set_qkv_dim(flexflow_model_t handle_, int qkv_dim);
void flexflow_model_set_size_dt(flexflow_model_t handle_,
                                long unsigned int size_dt);

void flexflow_model_generate(flexflow_model_t handle_,
int num_requests,
enum RequestType *request_types,
@@ -1033,6 +1038,10 @@ void flexflow_request_manager_set_max_sequence_length(
int flexflow_request_manager_get_max_sequence_length(
flexflow_request_manager_t handle_);

// paged attention
void flexflow_request_manager_set_max_kv_cache_size(
flexflow_request_manager_t handle_, int max_kv_cache_size);

void flexflow_request_manager_set_max_concurrent_adapters(
flexflow_request_manager_t handle_, int max_concurrent_adapters);

1 change: 1 addition & 0 deletions include/flexflow/machine_view.h
@@ -96,6 +96,7 @@ struct ParallelConfig {
int device_ids[MAX_NUM_WORKERS];
#ifdef FF_USE_NCCL
ncclComm_t nccl_comms[MAX_NUM_WORKERS];
ncclComm_t nccl_comms_peft[MAX_NUM_WORKERS];
#endif
};

16 changes: 16 additions & 0 deletions include/flexflow/model.h
@@ -1095,6 +1095,13 @@ class FFModel {
CompMode comp_mode = COMP_MODE_TRAINING);
void compile_inference();
void set_transformer_layer_id(int id);

// paged attention
void set_num_transformer_layers(int num_layers);
void set_num_kv_heads(int num_heads);
void set_qkv_dim(int qkv_dim);
void set_size_dt(int size_dt);

void set_position_offset(int offset);
void graph_optimize(size_t budget,
bool only_data_parallel,
@@ -1114,6 +1121,7 @@
bool use_propagation) const;
#ifdef FF_USE_NCCL
ncclComm_t *find_nccl_comms(MachineView const &view) const;
ncclComm_t *find_nccl_comms_peft(MachineView const &view) const;
void finish_nccl_comms();
#endif
#ifdef FF_USE_PROPAGATE
@@ -1158,6 +1166,13 @@
size_t op_global_guid, layer_global_guid, peft_model_global_guid;
size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid;
size_t current_transformer_layer_id;

// paged attention
int num_transformer_layers;
int num_kv_heads;
int qkv_dim;
int size_dt;

// positional embedding start offset
int position_offset;
FFConfig config;
@@ -1305,6 +1320,7 @@
// inference_debugging mode.
#ifdef FF_USE_NCCL
std::unordered_map<size_t, ncclComm_t *> view_hash_to_nccl_comms;
std::unordered_map<size_t, ncclComm_t *> view_hash_to_nccl_comms_peft;
#endif
private:
bool debug;