FlexLLM (part 4) #107

Draft · wants to merge 7 commits into base: inference
5 changes: 4 additions & 1 deletion .gitmodules
@@ -16,4 +16,7 @@
[submodule "deps/tokenizers-cpp"]
path = deps/tokenizers-cpp
url = https://github.com/mlc-ai/tokenizers-cpp.git
fetchRecurseSubmodules = true
[submodule "deps/flashinfer"]
path = deps/flashinfer
url = https://github.com/flashinfer-ai/flashinfer.git
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -197,6 +197,9 @@ include(variant)
# optional
include(optional)

# flashinfer
list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/flashinfer/include)
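# flashinfer is consumed header-only here: only its include dir is added,
# no separate build target.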

if (FF_GPU_BACKEND STREQUAL "cuda")
list(APPEND FF_CC_FLAGS
-DFF_USE_CUDA)
@@ -220,7 +223,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND FF_CC_FLAGS
-DFF_DEBUG)
list(APPEND FF_NVCC_FLAGS
-DFF_DEBUG)
-DFF_DEBUG -lineinfo)
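# -lineinfo embeds source-line info in device code for profilers/debuggers
# (e.g. Nsight Compute, cuda-gdb) without disabling optimizations.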
endif()

message(STATUS "FlexFlow MAX_DIM: ${FF_MAX_DIM}")
1 change: 1 addition & 0 deletions deps/flashinfer
Submodule flashinfer added at be6bf5
203 changes: 203 additions & 0 deletions include/flexflow/attention_config.h
@@ -0,0 +1,203 @@
/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef _FLEXFLOW_ATTENTION_CONFIG_H_
#define _FLEXFLOW_ATTENTION_CONFIG_H_
#include "flexflow/batch_config.h"

namespace FlexFlow {

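// Tokens stored per page of the paged KV cache.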
constexpr uint32_t kPagesize = 64;

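// Pages needed to hold num_elements tokens (ceiling division).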
inline int round_up_pages(int const num_elements) {
return (num_elements + kPagesize - 1) / kPagesize;
}

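// Dispatch a runtime head_dim to the compile-time constant HEAD_DIM so the
// macro body can instantiate head-dim-templated kernels (64, 128, or 256).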
#define DISPATCH_HEADDIM(head_dim, HEAD_DIM, ...) \
switch (head_dim) { \
case 64: { \
constexpr size_t HEAD_DIM = 64; \
__VA_ARGS__ \
break; \
} \
case 128: { \
constexpr size_t HEAD_DIM = 128; \
__VA_ARGS__ \
break; \
} \
case 256: { \
constexpr size_t HEAD_DIM = 256; \
__VA_ARGS__ \
break; \
} \
default: { \
std::ostringstream err_msg; \
err_msg << "Unsupported head_dim: " << head_dim; \
throw std::invalid_argument(err_msg.str()); \
} \
}
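
// Usage sketch (the kernel name below is illustrative, not part of this PR):
//   DISPATCH_HEADDIM(head_dim, HEAD_DIM,
//                    { run_paged_attention<HEAD_DIM>(args); });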

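// Sizes and binds the device-side buffers consumed by the paged-attention
// handlers: per-request index arrays into the paged KV cache, an optional
// custom mask, and kernel workspaces.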
class AttentionMetaData {
public:
AttentionMetaData() {
num_q_heads_ = 0;
num_kv_heads_ = 0;
head_dim_ = 0;
q_indptr = nullptr;
kv_indptr = nullptr;
kv_indices = nullptr;
kv_last_page_len = nullptr;
qk_indptr = nullptr;
custom_mask = nullptr;
workspace = nullptr;
workspace_size = 0;
float_workspace = nullptr;
float_workspace_size = 0;
int_workspace = nullptr;
int_workspace_size = 0;
mem_size_ = 0;
enabled_ = false;
}
AttentionMetaData(AttentionMetaData const &rhs) {
num_q_heads_ = rhs.num_q_heads_;
num_kv_heads_ = rhs.num_kv_heads_;
head_dim_ = rhs.head_dim_;
q_indptr = rhs.q_indptr;
kv_indptr = rhs.kv_indptr;
kv_indices = rhs.kv_indices;
kv_last_page_len = rhs.kv_last_page_len;
qk_indptr = rhs.qk_indptr;
custom_mask = rhs.custom_mask;
workspace = rhs.workspace;
workspace_size = rhs.workspace_size;
float_workspace = rhs.float_workspace;
float_workspace_size = rhs.float_workspace_size;
int_workspace = rhs.int_workspace;
int_workspace_size = rhs.int_workspace_size;
mem_size_ = rhs.mem_size_;
enabled_ = rhs.enabled_;
decode_handler_collections = rhs.decode_handler_collections;
prompt_handler_collections = rhs.prompt_handler_collections;
}

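// Total bytes to reserve: index arrays + custom mask + float/int workspaces,
// rounded up to 16-byte alignment. Computed once, then cached in mem_size_.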
size_t mem_size() {
if (mem_size_ > 0) {
return mem_size_;
}
size_t batch_size = BatchConfig::max_requests_per_batch();
size_t max_num_pages = round_up_pages(BatchConfig::max_sequence_length());
size_t indices_size = std::max(
(batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
size_t custom_mask_size = 0;

float_workspace_size = 128 * 1024 * 1024; // 128 MB
int_workspace_size = 8 * 1024 * 1024; // 8 MB
workspace_size =
float_workspace_size + int_workspace_size; // float + int workspace

mem_size_ = alignTo(sizeof(int32_t) * indices_size +
sizeof(uint8_t) * custom_mask_size + workspace_size,
16);
return mem_size_;
}

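// Carve the individual buffers out of one pre-allocated region of at least
// mem_size() bytes; passing nullptr resets every pointer instead.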
void assign_address(void *ptr, size_t size) {
if (ptr == nullptr) {
q_indptr = nullptr;
kv_indptr = nullptr;
kv_indices = nullptr;
kv_last_page_len = nullptr;
qk_indptr = nullptr;
custom_mask = nullptr;
workspace = nullptr;
float_workspace = nullptr;
int_workspace = nullptr;
return;
}
assert(size >= mem_size() &&
"Insufficient memory size for attention metadata");
size_t batch_size = BatchConfig::max_requests_per_batch();
size_t max_num_pages = round_up_pages(BatchConfig::max_sequence_length());
size_t indices_size = std::max(
(batch_size + 1) * 4 + max_num_pages * batch_size, 1ul * 1024 * 1024);
size_t custom_mask_size = 0;

q_indptr = static_cast<int32_t *>(ptr);
kv_indptr = q_indptr + batch_size + 1;
kv_indices = kv_indptr + batch_size + 1;
kv_last_page_len = kv_indices + max_num_pages * batch_size;
qk_indptr = kv_last_page_len + batch_size + 1;
custom_mask = static_cast<uint8_t *>(ptr) + sizeof(int32_t) * indices_size;
workspace = static_cast<void *>(static_cast<uint8_t *>(ptr) +
sizeof(int32_t) * indices_size +
sizeof(uint8_t) * custom_mask_size);
float_workspace = workspace;
int_workspace = static_cast<void *>(static_cast<uint8_t *>(workspace) +
float_workspace_size);
}

void set_num_q_heads(uint32_t const num_q_heads) {
num_q_heads_ = num_q_heads;
}
void set_num_kv_heads(uint32_t const num_kv_heads) {
num_kv_heads_ = num_kv_heads;
}
void set_head_dim(uint32_t const head_dim) {
head_dim_ = head_dim;
}
uint32_t num_q_heads() const {
return num_q_heads_;
}
uint32_t num_kv_heads() const {
return num_kv_heads_;
}
uint32_t head_dim() const {
return head_dim_;
}

void set_enabled(bool const enabled) {
enabled_ = enabled;
}
bool enabled() const {
return enabled_;
}

uint32_t num_q_heads_;
uint32_t num_kv_heads_;
uint32_t head_dim_;

int32_t *q_indptr;
int32_t *kv_indptr;
int32_t *kv_indices;
int32_t *kv_last_page_len;
int32_t *qk_indptr;
uint8_t *custom_mask;
void *workspace;
size_t workspace_size;
void *float_workspace;
size_t float_workspace_size;
void *int_workspace;
size_t int_workspace_size;

size_t mem_size_;

bool enabled_;
// batch size -> handler
std::unordered_map<int, void *> decode_handler_collections;
std::unordered_map<int, void *> prompt_handler_collections;
};
} // namespace FlexFlow

#endif // _FLEXFLOW_ATTENTION_CONFIG_H_
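
For context, a minimal sketch of how this metadata class appears intended to be used (size first, then bind a region). The head counts are hypothetical, the buffer below is host memory purely for illustration (the runtime binds a GPU region instead), and linking needs the FlexFlow runtime for the `BatchConfig` statics:

```cpp
#include "flexflow/attention_config.h"

#include <cstdlib>

int main() {
  FlexFlow::AttentionMetaData meta;
  meta.set_num_q_heads(32); // hypothetical model: 32 query heads and
  meta.set_num_kv_heads(8); // 8 KV heads (grouped-query attention)
  meta.set_head_dim(128);

  // Index arrays + custom mask + float/int workspaces, 16-byte aligned.
  size_t needed = meta.mem_size();

  // Illustration only: the runtime binds a device (GPU) region instead.
  void *buf = std::malloc(needed);
  meta.assign_address(buf, needed); // carves q_indptr, kv_indices, workspaces
  meta.set_enabled(true);

  // ... run attention kernels that consume the metadata ...

  meta.assign_address(nullptr, 0); // reset raw pointers before freeing
  std::free(buf);
  return 0;
}
```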
17 changes: 17 additions & 0 deletions include/flexflow/batch_config.h
@@ -29,6 +29,10 @@

namespace FlexFlow {

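// Round x up to the nearest multiple of y.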
inline int alignTo(int x, int y) {
return ((x + y - 1) / y) * y;
}

class InferenceResult;
class BeamInferenceResult;

@@ -71,6 +75,10 @@ class BatchConfig {
static int max_verify_tokens_per_batch();
static int max_spec_tree_token_num();
static int max_sequence_length();

// paged attention
static size_t max_kv_cache_size();

friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
void print() const;
void save_to_file(std::string const &filename) const;
@@ -111,6 +119,15 @@
int num_tokens_in_batch;
int max_length;

// paged attention
static constexpr size_t request_guid_size = sizeof(RequestGuid);
static constexpr size_t alignment = 16;
static constexpr size_t padding_size =
(alignment - (sizeof(int) * 3 + request_guid_size) % alignment) %
alignment;
static constexpr size_t padding_length = padding_size / sizeof(int);
int padding[padding_length] = {}; // Padding for memory pointer alignment
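// Example: with an 8-byte RequestGuid, padding_size =
// (16 - (12 + 8) % 16) % 16 = 12 bytes, i.e. padding_length = 3 ints.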

// request id in batch config:
int batch_config_request_id = -1;
bool prompt_phase = false;
9 changes: 9 additions & 0 deletions include/flexflow/config.h
@@ -16,6 +16,7 @@
#ifndef _FLEXFLOW_CONFIG_H_
#define _FLEXFLOW_CONFIG_H_
#include "ffconst.h"
#include "flexflow/attention_config.h"
#include "flexflow/batch_config.h"
#include "legion.h"
#include <cstring>
@@ -89,14 +90,21 @@ struct FFHandler {
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
cudnnHandle_t dnn;
cublasHandle_t blas;
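// Dedicated handles for PEFT (fine-tuning) kernels, separate from the
// inference handles above.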
cudnnHandle_t peft_dnn;
cublasHandle_t peft_blas;
#else
miopenHandle_t dnn;
hipblasHandle_t blas;
miopenHandle_t peft_dnn;
hipblasHandle_t peft_blas;
#endif
void *workSpace;
size_t workSpaceSize;
CombinedBatchConfigMetaStruct *batch_config_metadata;

// flashinfer
AttentionMetaData *incr_attention_metadata;

// request info + token info + topology mask info
size_t batch_config_metadata_size = sizeof(CombinedBatchConfigMetaStruct);
void *offload_reserve_space;
@@ -106,6 +114,7 @@
bool allowTensorOpMathConversion;
#ifdef FF_USE_NCCL
ncclComm_t ncclComm;
ncclComm_t ncclCommPeft;
#endif
};

9 changes: 9 additions & 0 deletions include/flexflow/flexflow_c.h
@@ -656,6 +656,11 @@ flexflow_perf_metrics_t

void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id);

void flexflow_model_set_num_transformer_layers(flexflow_model_t handle_,
                                               int num_layers);
void flexflow_model_set_num_kv_heads(flexflow_model_t handle_,
                                     int num_kv_heads);
void flexflow_model_set_qkv_dim(flexflow_model_t handle_, int qkv_dim);
void flexflow_model_set_size_dt(flexflow_model_t handle_,
                                long unsigned int size_dt);

void flexflow_model_generate(flexflow_model_t handle_,
int num_requests,
enum RequestType *request_types,
@@ -1033,6 +1038,10 @@ void flexflow_request_manager_set_max_sequence_length(
int flexflow_request_manager_get_max_sequence_length(
flexflow_request_manager_t handle_);

// paged attention
void flexflow_request_manager_set_max_kv_cache_size(
flexflow_request_manager_t handle_, int max_kv_cache_size);

void flexflow_request_manager_set_max_concurrent_adapters(
flexflow_request_manager_t handle_, int max_concurrent_adapters);

1 change: 1 addition & 0 deletions include/flexflow/machine_view.h
@@ -96,6 +96,7 @@ struct ParallelConfig {
int device_ids[MAX_NUM_WORKERS];
#ifdef FF_USE_NCCL
ncclComm_t nccl_comms[MAX_NUM_WORKERS];
ncclComm_t nccl_comms_peft[MAX_NUM_WORKERS];
#endif
};

16 changes: 16 additions & 0 deletions include/flexflow/model.h
@@ -1095,6 +1095,13 @@ class FFModel {
CompMode comp_mode = COMP_MODE_TRAINING);
void compile_inference();
void set_transformer_layer_id(int id);

// paged attention
void set_num_transformer_layers(int num_layers);
void set_num_kv_heads(int num_heads);
void set_qkv_dim(int qkv_dim);
void set_size_dt(int size_dt);

void set_position_offset(int offset);
void graph_optimize(size_t budget,
bool only_data_parallel,
@@ -1114,6 +1121,7 @@
bool use_propagation) const;
#ifdef FF_USE_NCCL
ncclComm_t *find_nccl_comms(MachineView const &view) const;
ncclComm_t *find_nccl_comms_peft(MachineView const &view) const;
void finish_nccl_comms();
#endif
#ifdef FF_USE_PROPAGATE
@@ -1158,6 +1166,13 @@
size_t op_global_guid, layer_global_guid, peft_model_global_guid;
size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid;
size_t current_transformer_layer_id;

// paged attention
int num_transformer_layers;
int num_kv_heads;
int qkv_dim;
int size_dt;

// positional embedding start offset
int position_offset;
FFConfig config;
@@ -1305,6 +1320,7 @@
// inference_debugging mode.
#ifdef FF_USE_NCCL
std::unordered_map<size_t, ncclComm_t *> view_hash_to_nccl_comms;
std::unordered_map<size_t, ncclComm_t *> view_hash_to_nccl_comms_peft;
#endif
private:
bool debug;