From 163850f7c9bac5a6f8a105869b8522436a6353e0 Mon Sep 17 00:00:00 2001 From: "zhenglaiwen.zlw" Date: Fri, 20 Dec 2024 14:45:21 +0800 Subject: [PATCH] some bugfix - uuid crash issue - update lora implement - set page size by param - delete deprecated files --- .gitmodules | 4 +- CMakeLists.txt | 15 +- build.sh | 12 +- cmake/FindHIEDNN.cmake | 46 -- cmake/cutlass.cmake | 4 +- cmake/flash-attention.cmake | 6 - cmake/hie-dnn.cmake | 3 +- csrc/CMakeLists.txt | 2 +- csrc/common/as_engine.cpp | 578 ++++++++++++---- csrc/common/device_context.h | 23 +- csrc/common/engine_runtime.h | 17 +- csrc/common/engine_worker.cpp | 24 +- csrc/common/engine_worker.h | 6 + csrc/common/result_queue.cpp | 22 +- csrc/common/thread_pool_with_id.h | 72 +- csrc/core/kernel/CMakeLists.txt | 29 +- csrc/core/kernel/cuda/cuda_kernel.h | 10 + csrc/core/kernel/cuda/cuda_util.h | 14 + .../kernel/cuda/gemm_lowp/gemm_a16w8_kernel.h | 1 + .../cuda/gemm_lowp/gemm_a16w8_perc_kernel.cu | 3 +- .../kernel/cuda/gemm_lowp/gemm_lowp_common.h | 1 + .../kernel/cuda/gemm_lowp/gemm_lowp_utils.cuh | 1 + csrc/core/kernel/cuda/moe/moe_kernel.cu | 17 +- csrc/core/kernel/cuda/moe/moe_kernel.h | 9 +- csrc/core/kernel/cuda/sgmv.cu | 238 +++++++ csrc/core/kernel/cuda/validate.cu | 81 +++ csrc/core/model/model.cpp | 38 +- .../operator/general/gemm/gemm_op_gpu.cpp | 42 ++ .../general/gemm_lora/gemm_capsule_op_gpu.cpp | 93 ++- .../general/gemm_lora/gemm_capsule_op_gpu.h | 2 +- .../general/gemm_lora/gemm_lora_op_gpu.cpp | 63 +- .../general/gemm_lora/gemm_lora_op_gpu.h | 10 +- .../general/gemm_lowp/gemm_a16w8_gpu.cpp | 4 +- .../general/gemm_lowp/gemm_a8w8_gpu.cpp | 4 +- csrc/core/operator/general/moe/moe_op.cpp | 325 +++++---- csrc/core/operator/general/moe/moe_op.h | 33 +- .../moe_inefficient/moe_inefficient_op.cpp | 309 --------- .../moe_inefficient/moe_inefficient_op.h | 62 -- .../general/sgmv_lora/sgmv_lora_op_gpu.cpp | 485 ++++++++++++++ .../general/sgmv_lora/sgmv_lora_op_gpu.h | 108 +++ 
.../general/sgmv_lora/sgmv_op_gpu.cpp | 96 +++ .../generate_opt/batch_mha/batch_mha_op.h | 5 - .../generate_opt/generate/generate_op.cpp | 4 +- .../span_attn/span_attn_op_cuda.h | 4 +- .../operator/nccl/allgather/allgather_op.cpp | 4 + csrc/core/operator/operator.cpp | 40 +- csrc/core/operator/operator.h | 1 - csrc/core/tensor/tensor.h | 9 + csrc/core/tensor/tensor_utils.cpp | 79 ++- csrc/device/cpu/cpu_context.h | 4 +- csrc/device/cuda/cuda_cache_allocator.cpp | 4 +- csrc/device/cuda/cuda_context.cpp | 14 + csrc/device/cuda/cuda_context.h | 3 + csrc/interface/allspark.h | 51 +- csrc/interface/allspark_check.h | 16 +- csrc/proto/allspark.proto | 1 + csrc/proto/allspark_service.proto | 4 +- .../cache/prefix_cache_manager_private.cpp | 4 +- csrc/runtime/state/model_control_state.cpp | 7 +- csrc/runtime/weight/weight_manager.cpp | 14 +- csrc/runtime/weight/weight_manager.h | 7 +- csrc/runtime/weight/weight_manager_lora.cpp | 88 ++- csrc/runtime/weight/weight_manager_lora.h | 27 +- csrc/service/allspark_client.cpp | 9 +- csrc/service/allspark_client_impl.cpp | 4 +- csrc/service/allspark_service_helper.h | 9 +- csrc/utility/blockingconcurrentqueue.h | 586 +++++++++++++++++ csrc/utility/lightweightsemaphore.h | 432 ++++++++++++ csrc/utility/string_util.cpp | 25 + csrc/utility/string_util.h | 2 + .../get_started/quick_start_api_server_en.md | 4 +- examples/benchmark/benchmark_throughput.py | 242 +++++-- examples/python/01_sync_example_bf16_gpu.py | 3 + python/allspark_binding.cpp | 11 +- python/allspark_binding_common.h | 13 + python/pyhie/allspark/__init__.py | 1 - python/pyhie/allspark/config/diconfig.py | 3 + python/pyhie/allspark/model/qwen_v15.py | 1 + python/pyhie/allspark/model/qwen_v20_moe.py | 1 + python/pyhie/allspark/quantization.py | 4 +- python/pyhie/allspark/runtime_config.py | 29 +- python/setup.py | 5 +- .../clang-format/clang-format-apply-all.sh | 4 +- scripts/clang-format/clang-format-apply.sh | 4 +- scripts/copyright/add_copyright.py | 93 --- 
scripts/docker/build_cu124.sh | 4 - scripts/docker/build_docker.sh | 47 -- scripts/docker/build_fschat_ubuntu_cuda.sh | 9 - scripts/docker/build_fschat_ubuntu_x86.sh | 9 - scripts/docker/dev_arm_centos8.Dockerfile | 4 - scripts/docker/dev_cuda_124.Dockerfile | 4 - scripts/docker/dev_x86_centos7.Dockerfile | 4 - scripts/docker/fschat_ubuntu_cuda.Dockerfile | 5 +- .../release_aarch64_manylinux2.Dockerfile | 5 +- .../docker/release_x86_manylinux2.Dockerfile | 4 - .../release/python_manylinux_build_cuda.sh | 1 - scripts/yapf-format/auto-format-python.py | 40 -- .../kernel/cuda/kernel_mhaprefill_test.cpp | 3 +- tests/cpp/model/cuda/model_cuda_test.cpp | 620 +----------------- .../model/stresstest/model_stress_test.cpp | 10 +- .../operator/cuda/operator_gemm_lowp_test.cpp | 1 + tests/python/arm/test_01_m6_7b_master.py | 109 --- tests/python/arm/test_01_m6_7b_worker.py | 88 --- tests/python/arm/test_02_m6_13b_master.py | 107 --- tests/python/arm/test_02_m6_13b_worker.py | 87 --- tests/python/gpu/test_04_model_serializer.py | 238 ------- tests/python/gpu/test_08_long_text.py | 62 -- tests/python/gpu/test_09_qwen2_repeat.py | 87 --- tests/python/gpu/test_10_qwen1_5_moe_a2_7b.py | 240 ------- tests/python/gpu/test_11_qwen2_json.py | 459 ------------- tests/python/gpu/test_12_dynamic_quant.py | 178 ----- tests/python/gpu/test_di_model_config.py | 66 -- tests/python/gpu/test_di_model_config_01.yaml | 191 ------ tests/python/gpu/test_runtime_config.py | 48 -- tests/python/gpu/test_util_infer.py | 269 -------- tests/python/gpu/test_utils.py | 137 ---- tests/python/x86/dashinfer | 1 - tests/python/x86/test_11_qwen2_json.py | 420 ------------ tools/detect-lora-limit.py | 190 ++++++ tools/model_scope_batch_convert.py | 3 + tools/tokenizer_tool_for_qwen.py | 2 +- 121 files changed, 3886 insertions(+), 4653 deletions(-) delete mode 100644 cmake/FindHIEDNN.cmake create mode 100644 csrc/core/kernel/cuda/cuda_util.h create mode 100644 csrc/core/kernel/cuda/sgmv.cu create mode 100644 
csrc/core/kernel/cuda/validate.cu delete mode 100644 csrc/core/operator/general/moe_inefficient/moe_inefficient_op.cpp delete mode 100644 csrc/core/operator/general/moe_inefficient/moe_inefficient_op.h create mode 100644 csrc/core/operator/general/sgmv_lora/sgmv_lora_op_gpu.cpp create mode 100644 csrc/core/operator/general/sgmv_lora/sgmv_lora_op_gpu.h create mode 100644 csrc/core/operator/general/sgmv_lora/sgmv_op_gpu.cpp create mode 100644 csrc/utility/blockingconcurrentqueue.h create mode 100644 csrc/utility/lightweightsemaphore.h delete mode 100644 scripts/copyright/add_copyright.py delete mode 100755 scripts/docker/build_cu124.sh delete mode 100755 scripts/docker/build_docker.sh delete mode 100755 scripts/docker/build_fschat_ubuntu_cuda.sh delete mode 100755 scripts/docker/build_fschat_ubuntu_x86.sh delete mode 100644 scripts/yapf-format/auto-format-python.py delete mode 100644 tests/python/arm/test_01_m6_7b_master.py delete mode 100644 tests/python/arm/test_01_m6_7b_worker.py delete mode 100644 tests/python/arm/test_02_m6_13b_master.py delete mode 100644 tests/python/arm/test_02_m6_13b_worker.py delete mode 100644 tests/python/gpu/test_04_model_serializer.py delete mode 100644 tests/python/gpu/test_08_long_text.py delete mode 100644 tests/python/gpu/test_09_qwen2_repeat.py delete mode 100644 tests/python/gpu/test_10_qwen1_5_moe_a2_7b.py delete mode 100644 tests/python/gpu/test_11_qwen2_json.py delete mode 100644 tests/python/gpu/test_12_dynamic_quant.py delete mode 100644 tests/python/gpu/test_di_model_config.py delete mode 100644 tests/python/gpu/test_di_model_config_01.yaml delete mode 100644 tests/python/gpu/test_runtime_config.py delete mode 100644 tests/python/gpu/test_util_infer.py delete mode 100644 tests/python/gpu/test_utils.py delete mode 120000 tests/python/x86/dashinfer delete mode 100644 tests/python/x86/test_11_qwen2_json.py create mode 100644 tools/detect-lora-limit.py diff --git a/.gitmodules b/.gitmodules index e7e1ebc7..909ff4a3 100644 --- 
a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "third_party/from_source/cutlass"] - path = third_party/from_source/cutlass - url = https://github.com/NVIDIA/cutlass.git + path = third_party/from_source/cutlass + url = https://github.com/NVIDIA/cutlass.git diff --git a/CMakeLists.txt b/CMakeLists.txt index e330c148..56c061b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,6 +45,7 @@ set(CONFIG_HOST_CPU_TYPE "X86" CACHE STRING "host cpu type, like X86, ARMV9, etc ## option(ENABLE_NV_STATIC_LIB "build with static lib of nvidia" OFF) option(USE_SYSTEM_NV_LIB "use system nccl lib instead download binary." OFF) +option(BUILD_HIEDNN "build HIE-DNN from source" OFF) option(ENABLE_CUDA_PINNED_WEIGHT_LOAD "enable cuda pinned memory for load weight" OFF) option(ENABLE_SPAN_ATTENTION "enable build with span attention" ON) option(ENABLE_MULTINUMA "enable multinuma, if on cpu multinuma service will be compiled" OFF) @@ -174,18 +175,14 @@ if(LOCK_CHECK) endif() if (ENABLE_CUDA) - include(cuda) list(APPEND ALLSPARK_DEFINITION "-DENABLE_CUDA") - - if (ENABLE_CUSPARSELT) - set(ENABLE_CUSPARSELT ON) - list(APPEND ALLSPARK_DEFINITION "-DENABLE_CUSPARSELT") - endif() - + if (ENABLE_CUSPARSELT) + list(APPEND ALLSPARK_DEFINITION "-DENABLE_CUSPARSELT") + endif() + include(cuda) if (ENABLE_SPARSE) - list(APPEND ALLSPARK_DEFINITION "-DENABLE_SPARSE") + list(APPEND ALLSPARK_DEFINITION "-DENABLE_SPARSE") endif() - if (ENABLE_FP8) list(APPEND ALLSPARK_DEFINITION "-DENABLE_FP8") endif() diff --git a/build.sh b/build.sh index e9a02502..6717e981 100755 --- a/build.sh +++ b/build.sh @@ -8,6 +8,7 @@ with_platform="${AS_PLATFORM:-cuda}" cuda_version="${AS_CUDA_VERSION:-12.4}" cuda_sm="${AS_CUDA_SM:-80;86;90a}" NCCL_VERSION="${AS_NCCL_VERSION:-2.23.4}" +build_folder="${AS_BUILD_FOLDER:-build}" ## NCCL Version Map: ## the corresponding pre-build nccl will download on oss. 
@@ -22,9 +23,9 @@ build_type="${AS_BUILD_TYPE:-Release}" cuda_static="${AS_CUDA_STATIC:-OFF}" build_package="${AS_BUILD_PACKAGE:-ON}" enable_glibcxx11_abi="${AS_CXX11_ABI:-OFF}" +build_hiednn="${AS_BUILD_HIEDNN:-ON}" enable_span_attn="${ENABLE_SPAN_ATTENTION:-ON}" enable_multinuma="${ENABLE_MULTINUMA:-OFF}" - function clone_pull { GIT_URL=$1 DIRECTORY=$2 @@ -42,11 +43,11 @@ function clone_pull { } if [ "$clean" == "ON" ]; then - rm -rf build + rm -rf ${build_folder} fi -if [ ! -d "./build" ]; then - mkdir build && cd build +if [ ! -d "./${build_folder}" ]; then + mkdir ${build_folder} && cd ${build_folder} conan profile new dashinfer_compiler_profile --detect --force conanfile=../conan/conanfile.txt @@ -74,7 +75,7 @@ if [ ! -d "./build" ]; then cd ../ fi -cd build +cd ${build_folder} source ./activate.sh export PATH=`pwd`/bin:$PATH @@ -94,6 +95,7 @@ if [ "${with_platform,,}" == "cuda" ]; then -DBUILD_PYTHON=OFF \ -DALWAYS_READ_LOAD_MODEL=OFF \ -DENABLE_SPAN_ATTENTION=${enable_span_attn} \ + -DBUILD_HIEDNN=${build_hiednn} \ -DENABLE_MULTINUMA=OFF elif [ "${with_platform,,}" == "x86" ]; then cmake .. 
\ diff --git a/cmake/FindHIEDNN.cmake b/cmake/FindHIEDNN.cmake deleted file mode 100644 index 8f7818ed..00000000 --- a/cmake/FindHIEDNN.cmake +++ /dev/null @@ -1,46 +0,0 @@ -message("========== HIE-DNN ==========") -set(USE_FP16 - ${ENABLE_FP16} - CACHE BOOL "") -set(USE_BF16 - ${ENABLE_BF16} - CACHE BOOL "") -set(CUDA_DEVICE_ARCH - ${CMAKE_CUDA_ARCHITECTURES} - CACHE STRING "") - -if(ENABLE_CUDA) - set(USE_CUDA - ON - CACHE BOOL "") -else() - set(USE_CUDA - OFF - CACHE BOOL "") -endif() - -# disable hie-dnn utest & examples -set(UTEST - OFF - CACHE BOOL "") -set(EXAMPLE - OFF - CACHE BOOL "") - -message(STATUS "\tBuild HIE-DNN with: CUDA_DEVICE_ARCH=${CUDA_DEVICE_ARCH}") -message(STATUS "\tBuild HIE-DNN with: USE_FP16=${USE_FP16}") -message(STATUS "\tBuild HIE-DNN with: USE_BF16=${USE_BF16}") -message(STATUS "\tBuild HIE-DNN with: USE_CUDA=${USE_CUDA}") -set(HIEDNN_SOURCE_DIR ${PROJECT_SOURCE_DIR}/HIE-DNN) -add_subdirectory(${HIEDNN_SOURCE_DIR} EXCLUDE_FROM_ALL) -set_target_properties(hiednn PROPERTIES FOLDER "External/HIE-DNN") -set_target_properties(hiednn_static PROPERTIES FOLDER "External/HIE-DNN") -unset(CUDA_DEVICE_ARCH CACHE) -unset(USE_FP16 CACHE) -unset(USE_BF16 CACHE) -unset(USE_CUDA CACHE) -unset(UTEST CACHE) -unset(EXAMPLE CACHE) -message(STATUS "Build HIE-DNN in: ${HIEDNN_SOURCE_DIR}") - -message("=============================") diff --git a/cmake/cutlass.cmake b/cmake/cutlass.cmake index aaa597c4..5a2d45a8 100644 --- a/cmake/cutlass.cmake +++ b/cmake/cutlass.cmake @@ -7,8 +7,8 @@ set(CUTLASS_NVCC_ARCHS ${CMAKE_CUDA_ARCHITECTURES} CACHE STRING "The SM architec set(CUTLASS_INSTALL ${INSTALL_LOCATION}/cutlass/install) message(STATUS "CUTLASS_INSTALL: ${CUTLASS_INSTALL}") -message(STATUS "Use cutlass from submodule") -set(CUTLASS_SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/from_source/cutlass) + message(STATUS "Use cutlass from submodule") + set(CUTLASS_SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/from_source/cutlass) include(ExternalProject) diff --git 
a/cmake/flash-attention.cmake b/cmake/flash-attention.cmake index db646521..8fbc5150 100644 --- a/cmake/flash-attention.cmake +++ b/cmake/flash-attention.cmake @@ -35,12 +35,9 @@ else() set(FLASHATTN_LIBRARY_NAME libflash-attn.so) endif() -option(FLASHATTN_BUILD_FROM_SOURCE "build flash-attn from source or use prebuilt lib" ON) -# make sure you copy prebuild and source code under csrc together, to avoid header aligment issue. include(ExternalProject) -if (FLASHATTN_BUILD_FROM_SOURCE) message(STATUS "build flash-attention from source") message(STATUS "Use flash-attention from external project") @@ -78,9 +75,6 @@ if (FLASHATTN_BUILD_FROM_SOURCE) ExternalProject_Get_Property(project_flashattn SOURCE_SUBDIR) set(FLASHATTN_INCLUDE_DIR ${SOURCE_DIR}/${SOURCE_SUBDIR}) -else() # FLASHATTN_BUILD_FROM_SOURCE - message(FATAL_ERROR "flash attention build only support source code build.") -endif() # FLASHATTN_BUILD_FROM_SOURCE message(STATUS "FLASHATTN_LIBRARY_PATH: ${FLASHATTN_LIBRARY_PATH}") message(STATUS "FLASHATTN_INCLUDE_DIR: ${FLASHATTN_INCLUDE_DIR}") diff --git a/cmake/hie-dnn.cmake b/cmake/hie-dnn.cmake index 8f3903ce..77058977 100644 --- a/cmake/hie-dnn.cmake +++ b/cmake/hie-dnn.cmake @@ -29,7 +29,8 @@ set(HIEDNN_LIBRARY_PATH ${HIEDNN_INSTALL}/lib64/libhiednn_static.a) message(STATUS "HIEDNN_INSTALL: ${HIEDNN_INSTALL}") message(STATUS "HIEDNN_LIBRARY_PATH: ${HIEDNN_LIBRARY_PATH}") -set(HIEDNN_SOURCE_DIR ${PROJECT_SOURCE_DIR}/HIE-DNN) + set(HIEDNN_SOURCE_DIR ${PROJECT_SOURCE_DIR}/HIE-DNN) +message(STATUS "Build HIE-DNN in: ${HIEDNN_SOURCE_DIR}") include(ExternalProject) diff --git a/csrc/CMakeLists.txt b/csrc/CMakeLists.txt index 4696eb6e..6e39f571 100644 --- a/csrc/CMakeLists.txt +++ b/csrc/CMakeLists.txt @@ -41,7 +41,7 @@ set(ALLSPARK_INC_DIR ${CMAKE_BINARY_DIR}/csrc ) -list(APPEND ALLSPARK_INC_DIR ${PROJECT_SOURCE_DIR}/third_party/from_source/cutlass/include) + list(APPEND ALLSPARK_INC_DIR ${PROJECT_SOURCE_DIR}/third_party/from_source/cutlass/include) 
set(ALLSPARK_3RD_LIBS ${THREAD_LIB} diff --git a/csrc/common/as_engine.cpp b/csrc/common/as_engine.cpp index 6b1374fc..633cdb0f 100644 --- a/csrc/common/as_engine.cpp +++ b/csrc/common/as_engine.cpp @@ -5,10 +5,8 @@ #include "engine_worker.h" #include "interface/allspark_check.h" -#include "thread_pool.h" #include "thread_pool_with_id.h" #include "thread_utils.h" -#include "utility/timer.h" #include "weight/weight_loader.h" #include "weight/weight_manager.h" #ifdef ENABLE_CUDA @@ -26,14 +24,18 @@ #include #include #include +#include #include +#include #include #include #include +#include #include #include #include +#include #include #include #include @@ -76,13 +78,15 @@ const int warmup_input = 5; // stopping. #define CHECK_MODEL_STOPPING(model_state) \ do { \ - if (model_state->model_stopping) { \ + if (model_state->model_stopping.load()) { \ LOG(INFO) << "model is stopping, access denied"; \ return AsStatus::ALLSPARK_REQUEST_DENIED; \ } \ } while (0) +namespace fs = std::filesystem; namespace allspark { + class AsEngineImpl final { public: AsEngineImpl(); @@ -100,13 +104,16 @@ class AsEngineImpl final { AsStatus LoadLoraByName(const char* model_name, const char* lora_name); AsStatus UnloadLoraByName(const char* model_name, const char* lora_name); + std::vector LoadFakeLoras(const char* model_name); + void UnloadFakeLoras(const char* model_name, + const std::vector& fake_lora_names); AsStatus GetModelInformation(const char* model_name, std::string* model_info); AsFileInfo GetFileInformation(const char* as_model_path, const char* as_param_path); - AsStatus StartModel(const char* model_name, bool do_warmup = true); + AsStatus StartModel(const char* model_name); AsStatus TunePrefixCache(const char* model_name); AsStatus StopModel(const char* model_name); AsStatus ReleaseModel(const char* model_name); @@ -142,11 +149,15 @@ class AsEngineImpl final { AsStatus BuildModel(const char* model_name, const std::string& model_proto, std::shared_ptr weight_handler, const 
std::map& model_limits = {}); - AsStatus WarmupModel(const char* model_name, int64_t min_bytes_available); + AsStatus WarmupModelInternal_(const char* model_name, + int64_t min_bytes_available, + std::vector& fake_lora_names); + AsStatus WarmupModel(const char* model_name); AsStatus SetNumThreads(int num_threads); AsStatus SetDeviceIds(const std::vector& device_ids); AsStatus CreateDeviceContext(const std::string& compute_unit); + void DestroyDeviceContext(); AsStatus SetMatmulPrecision(const std::string& precision); #if ENABLE_SPAN_ATTENTION AsStatus setSpanCacheConfig(AsCacheMode mode, int span_size, @@ -211,8 +222,10 @@ class AsEngineImpl final { model_state_map_; // TODO: engine needs support two or model can running within one engine. - std::mutex engine_lock_; // for async decoder lock - std::mutex lora_lock_; // mutex for load/unload lora + std::mutex engine_lock_; // for async decoder lock + std::mutex lora_lock_; // mutex for lora WeightManager + std::mutex lora_usage_lock_; // mutex for loras_in_use_ + fs::path fake_lora_temp_dir_; int engine_max_length_ = 0; int engine_max_batch_ = 0; int engine_max_prefill_length_ = 0; @@ -225,17 +238,18 @@ class AsEngineImpl final { std::mt19937 random_engine; std::shared_ptr weight_manager_; - std::map> loras_in_use_; + std::unordered_map> loras_in_use_; + std::atomic lora_use_count_; PrefixCacheCoordinator::Ptr prefix_cache_coordinator_; }; ModelControlState::ModelControlState(const std::string& name) - : model_name(name) { - lock = std::make_unique(); + : model_name(name), msg_queue(1000) { cond_var = std::make_unique(); request_handle_map.reserve(1000); result_queue_map.reserve(1000); + msg_queue_size.store(0); } static bool ReadProtoFromTextFile(const char* filename, Message* proto) { @@ -282,12 +296,17 @@ static std::pair> ParseDeviceType( AsEngineImpl::AsEngineImpl() : device_ctx_(std::make_unique()), is_multi_nodes_(false), - threadpool_size_(1) { + threadpool_size_(1), + lora_use_count_(0) { 
util::as_init_log(); // set threadpool_size_ to 1 for default to avoid additional overhead, // such as thread switching and lock contention in CPU streaming mode. threadpool_ = std::make_unique(threadpool_size_); + + device_ctx_->Init(); + weight_manager_ = WeightManager::Create(); + weight_manager_->RegisterWeightEventListener( [&](const std::shared_ptr& handler, WeightEvent event) { @@ -299,6 +318,7 @@ AsEngineImpl::AsEngineImpl() } } }); + std::random_device rand_dev; random_engine.seed(rand_dev()); LOG(INFO) << "AllSpark Init with Version: " << GetVersionFull(); @@ -309,7 +329,6 @@ AsEngineImpl::~AsEngineImpl() { // exception. LOG(INFO) << "~AsEngine called"; for (auto& model_state : model_state_map_) { - std::unique_lock lock(*(model_state.second->lock)); if (!model_state.second && model_state.second->model_stopped) { LOG(INFO) << "Stopping model " << model_state.first; StopModel(model_state.first.c_str()); @@ -417,14 +436,12 @@ AsStatus AsEngineImpl::SetDeviceIds(const std::vector& device_ids) { } // 所有device inferer都走worker线程 workers_.resize(nranks_); - #ifdef ENABLE_CUDA ncclUniqueId id; if (backend == DeviceType::CUDA) { ncclGetUniqueId(&id); } #endif - std::vector vthreads(nranks_); LOG(INFO) << "Start create " << nranks_ << " Device: " << backend << " workers."; @@ -450,6 +467,7 @@ AsStatus AsEngineImpl::SetDeviceIds(const std::vector& device_ids) { } // cuda require multiple nccl client init in parallel, otherwise // will wait for other device. 
+ workers_[i]->Init(); workers_[i]->InitCCL(i, nranks_); workers_[i]->SetWeightManager(weight_manager_); }); @@ -497,6 +515,11 @@ AsStatus AsEngineImpl::CreateDeviceContext(const std::string& compute_unit) { return AsStatus::ALLSPARK_SUCCESS; } +void AsEngineImpl::DestroyDeviceContext() { + is_device_id_set_ = false; + DestroyBFCAllocator(); +} + static void CheckAndOverridePrefillMode(AsModelConfig& model_config) { try { DeviceType device_type = DeviceType::CUDA; @@ -518,7 +541,6 @@ static void CheckAndOverridePrefillMode(AsModelConfig& model_config) { device_ids.size() > 0 ? device_ids[0] : 0); LOG(INFO) << "Auto Prefill selection, CUDA Detected, SM: " << std::hex << sm_version; - if (sm_version >= CUDASMDef::SM_Ampere && CUDA_VERSION >= 11080) { model_config.prefill_mode = AsMHAPrefill::AsPrefillFlashV2; LOG(INFO) << "Prefill Auto Select: Ampler GPU detected, choose " @@ -701,17 +723,35 @@ AsStatus AsEngineImpl::BuildModel( std::shared_ptr weight_handler, const std::map& model_limits) { DLOG(INFO) << "AsEngineImpl::BuildModel()" << std::endl; + AsModelConfig model_config = weight_handler->GetModelConfig(); std::unique_ptr model_ir = std::make_unique(); model_ir->ParseFromString(model_proto); // LOG(INFO)<model_conf().num_heads()<<" // "<model_conf().size_per_head()<<" // "<model_conf().dec_layer(); + auto& graph = model_ir->graphs(); + device_ctx_->SetLoraEnabled(false); + for (auto& g_name : model_ir->graph_names()) { // search for LoRA op + for (auto& op_proto : graph.at(g_name).ops()) { + if (op_proto.op_type() == "GemmLoraCapsule") { + device_ctx_->SetLoraEnabled(true); + break; + } + } + } + if (device_ctx_->GetLoraEnabled()) { + LOG(INFO) << "lora enabled"; + } + device_ctx_->SetNumberHeads(model_ir->model_conf().num_heads()); device_ctx_->SetNumberGroups(model_ir->model_conf().multi_query_group_num()); device_ctx_->SetSizePerHead(model_ir->model_conf().size_per_head()); + device_ctx_->SetIntermediateSize(model_ir->model_conf().intermediate_size()); 
device_ctx_->SetDecoderLayer(model_ir->model_conf().dec_layer()); device_ctx_->SetDtype(model_ir->model_conf().dtype()); + device_ctx_->SetLoraMaxNum(model_config.lora_max_num); + device_ctx_->SetLoraMaxRank(model_config.lora_max_rank); for (auto& item : model_limits) { if (item.second < 0) { LOG(ERROR) << "invalid engine limit param, should >= 0" << std::endl; @@ -797,24 +837,11 @@ AsStatus AsEngineImpl::BuildModel( std::max(warmup_single_batch_spans, warmup_multi_batch_spans); int num_spans = kv_cache_count * device_ctx_->GetDecoderLayer() * (num_spans_per_seq + 1); - bool reserve_for_lora = false; - auto& graph = model_ir->graphs(); - for (auto& g_name : model_ir->graph_names()) { // search for LoRA op - for (auto& op_proto : graph.at(g_name).ops()) { - if (op_proto.op_type() == "GemmLoraCapsule") { - reserve_for_lora = true; - break; - } - } - } - if (reserve_for_lora) num_spans *= device_ctx_->GetModelMaxBatch(); - DLOG(INFO) << "reserve_for_lora=" << reserve_for_lora - << " num_spans=" << num_spans; // reset cache config AS_CHECK_STATUS(this->setSpanCacheConfig(device_ctx_->GetCacheMode(), device_ctx_->GetCacheSpanSize(), num_spans, 0)); - use_adaptive_cache_ = reserve_for_lora ? 
false : true; + use_adaptive_cache_ = true; } } #endif @@ -993,14 +1020,15 @@ AsStatus AsEngineImpl::TunePrefixCache(const char* model_name) { float duration_in_ms[2] = {0}; for (int j = 0; j < 2; j++) { - std::string uuid = "warmup_request_" + std::to_string(request_id) + - "_" + std::to_string(j); + warmup_req->config.uuid = "warmup_request_" + + std::to_string(request_id) + "_" + + std::to_string(j); RequestHandle* warmup_handle{nullptr}; AsEngine::ResultQueue* warmup_queue{nullptr}; auto start_time_point = std::chrono::steady_clock::now(); - AS_CHECK_STATUS(this->StartRequest( - model_name, warmup_req, &warmup_handle, &warmup_queue, uuid)); + AS_CHECK_STATUS(this->StartRequest(model_name, warmup_req, + &warmup_handle, &warmup_queue)); AS_CHECK_STATUS(this->SyncRequest(model_name, warmup_handle)); AS_CHECK_STATUS(this->ReleaseRequest(model_name, warmup_handle)); auto end_time_point = std::chrono::steady_clock::now(); @@ -1029,13 +1057,174 @@ AsStatus AsEngineImpl::TunePrefixCache(const char* model_name) { return AsStatus::ALLSPARK_SUCCESS; } +std::vector AsEngineImpl::LoadFakeLoras(const char* model_name) { + std::vector ret; + if (!device_ctx_->GetLoraEnabled()) { + return ret; + } + + char temp_dir[] = "/tmp/allspark_fake_lora-XXXXXX"; + AS_ENFORCE(mkdtemp(temp_dir) != nullptr); + fake_lora_temp_dir_ = temp_dir; + LOG(INFO) << "Successfully created fake lora temp dir: " + << fake_lora_temp_dir_; + + auto num_heads = device_ctx_->GetNumberHeads(); + auto size_per_head = device_ctx_->GetSizePerHead(); + auto hidden_size = num_heads * size_per_head; + auto num_groups = device_ctx_->GetNumberGroups(); + auto kv_size = num_groups * size_per_head; + auto num_hidden_layers = device_ctx_->GetDecoderLayer(); + auto intermediate_size = device_ctx_->GetIntermediateSize(); + auto lora_max_num = device_ctx_->GetLoraMaxNum(); + auto lora_max_rank = device_ctx_->GetLoraMaxRank(); + auto dtype = device_ctx_->GetDtype(); + + // create fake loras + const std::string 
lora_base_name = "fake-lora-"; + fs::path aslora_path = fake_lora_temp_dir_ / (lora_base_name + "0.aslora"); + TensorMap tensor_map; + char dtype_ch = dtype == DataType::FLOAT16 ? 'f' : 'b'; + std::map attr_map{ + {"decoder.layer.__LAYER__.attention.self.lora_A.weight", + {0, + SplitMode::NOSPLIT, + {hidden_size, 3 * lora_max_rank}, + {}, + dtype_ch, + 2, + 0}}, + {"decoder.layer.__LAYER__.attention.self.lora_B.weight", + {0, + SplitMode::GROUP_VSPLIT, + {lora_max_rank, hidden_size + 2 * kv_size}, + {hidden_size, kv_size, kv_size}, + dtype_ch, + 2, + 0}}, + {"decoder.layer.__LAYER__.attention.output.dense.lora_A.weight", + {0, + SplitMode::HSPLIT, + {hidden_size, lora_max_rank}, + {}, + dtype_ch, + 2, + 0}}, + {"decoder.layer.__LAYER__.attention.output.dense.lora_B.weight", + {0, + SplitMode::NOSPLIT, + {lora_max_rank, hidden_size}, + {}, + dtype_ch, + 2, + 0}}, + {"decoder.layer.__LAYER__.ffn.intermediate.dense.lora_A.weight", + {0, + SplitMode::NOSPLIT, + {hidden_size, lora_max_rank}, + {}, + dtype_ch, + 2, + 0}}, + {"decoder.layer.__LAYER__.ffn.intermediate.dense.lora_B.weight", + {0, + SplitMode::VSPLIT, + {lora_max_rank, intermediate_size}, + {}, + dtype_ch, + 2, + 0}}, + {"decoder.layer.__LAYER__.ffn.linear.dense.lora_A.weight", + {0, + SplitMode::NOSPLIT, + {hidden_size, lora_max_rank}, + {}, + dtype_ch, + 2, + 0}}, + {"decoder.layer.__LAYER__.ffn.linear.dense.lora_B.weight", + {0, + SplitMode::VSPLIT, + {lora_max_rank, intermediate_size}, + {}, + dtype_ch, + 2, + 0}}, + {"decoder.layer.__LAYER__.ffn.output.dense.lora_A.weight", + {0, + SplitMode::HSPLIT, + {intermediate_size, lora_max_rank}, + {}, + dtype_ch, + 2, + 0}}, + {"decoder.layer.__LAYER__.ffn.output.dense.lora_B.weight", + {0, + SplitMode::NOSPLIT, + {lora_max_rank, hidden_size}, + {}, + dtype_ch, + 2, + 0}}, + }; + std::ofstream fout(aslora_path.string(), std::ios::out); // 清空文件 + for (int layer = 0; layer < num_hidden_layers; layer++) { + for (auto& [tensor_pattern, tensor_info] : attr_map) { + 
std::string t_name = tensor_pattern; + t_name.replace(t_name.find("__LAYER__"), strlen("__LAYER__"), + std::to_string(layer)); + auto nbytes = 1; + for (auto size : tensor_info.shape) { + nbytes *= size; + } + nbytes *= tensor_info.word_size; + std::vector bin_data(nbytes, 0); + util::save_allsparky_tofile(aslora_path.string(), t_name, bin_data.data(), + nbytes, tensor_info); + } + } + util::set_global_header(aslora_path.string()); // 结束 + + // load fake loras + ret.emplace_back(lora_base_name + "0"); + AS_ENFORCE(AS_STATUS_OK(LoadLoraByName(model_name, aslora_path.c_str()))); + for (int i = 1; i < lora_max_num; i++) { + auto lora_name = lora_base_name + std::to_string(i); + auto symlink_path = fake_lora_temp_dir_ / (lora_name + ".aslora"); + symlink(aslora_path.c_str(), symlink_path.c_str()); + ret.emplace_back(lora_name); + AS_ENFORCE(AS_STATUS_OK(LoadLoraByName(model_name, symlink_path.c_str()))); + } + return ret; +} + +void AsEngineImpl::UnloadFakeLoras( + const char* model_name, const std::vector& fake_lora_names) { + if (!device_ctx_->GetLoraEnabled()) { + return; + } + for (auto& lora_name : fake_lora_names) { + AS_ENFORCE(AS_STATUS_OK(UnloadLoraByName(model_name, lora_name.c_str()))); + } + try { + fs::remove_all(fake_lora_temp_dir_); + } catch (const std::exception& e) { + LOG(ERROR) << "Failed to remove fake lora temp dir: " << fake_lora_temp_dir_ + << ", reason: " << e.what(); + } catch (...) { + LOG(ERROR) << "Failed to remove fake lora temp dir: " << fake_lora_temp_dir_ + << ", unknown exception!"; + } +} + /* * send fake request to warm up engine, allocate necessary resources, like gpu * tensor, etc. send one max_length request, and N (max_engine_batch_size) times * request to make sure the coverage. 
*/ -AsStatus AsEngineImpl::WarmupModel(const char* model_name, - int64_t min_bytes_available) { +AsStatus AsEngineImpl::WarmupModelInternal_( + const char* model_name, int64_t min_bytes_available, + std::vector& fake_lora_names) { //* step 1: record memory usage before warmup std::vector bytes_limit(nranks_); std::vector bytes_before_warmup(nranks_); @@ -1056,6 +1245,9 @@ AsStatus AsEngineImpl::WarmupModel(const char* model_name, warmup_cfg->max_length = engine_max_length_; warmup_cfg->top_k = 0; warmup_cfg->top_p = 0.5; + if (device_ctx_->GetLoraEnabled()) { + warmup_cfg->lora_name = fake_lora_names[0]; + } std::shared_ptr warmup_req = std::make_shared(); @@ -1073,8 +1265,9 @@ AsStatus AsEngineImpl::WarmupModel(const char* model_name, AS_CHECK_STATUS(this->SyncRequest(model_name, warmup_handle)); if (warmup_queue->GenerateStatus() != AsEngine::GenerateRequestStatus::GenerateFinished) { - LOG(ERROR) << "AsEngineImpl::WarmupModel: warmup failed! Please check " - "engine_max_length & engine_max_batch"; + LOG(ERROR) + << "AsEngineImpl::WarmupModelInternal_: warmup failed! Please check " + "engine_max_length & engine_max_batch"; return AsStatus::ALLSPARK_RUNTIME_ERROR; } AS_CHECK_STATUS(this->ReleaseRequest(model_name, warmup_handle)); @@ -1120,6 +1313,10 @@ AsStatus AsEngineImpl::WarmupModel(const char* model_name, warmup_cfg->top_k = 0; warmup_cfg->top_p = 0.5; warmup_cfg->early_stopping = false; + if (device_ctx_->GetLoraEnabled()) { + warmup_cfg->lora_name = fake_lora_names[i % fake_lora_names.size()]; + } + std::shared_ptr warmup_req = std::make_shared(); warmup_req->config = *(warmup_cfg); @@ -1141,8 +1338,9 @@ AsStatus AsEngineImpl::WarmupModel(const char* model_name, AsEngine::ResultQueue* warmup_queue = warmup_queue_list[i]; if (warmup_queue->GenerateStatus() != AsEngine::GenerateRequestStatus::GenerateFinished) { - LOG(ERROR) << "AsEngineImpl::WarmupModel: warmup failed! 
Please check " - "engine_max_length & engine_max_batch"; + LOG(ERROR) + << "AsEngineImpl::WarmupModelInternal_: warmup failed! Please check " + "engine_max_length & engine_max_batch"; return AsStatus::ALLSPARK_RUNTIME_ERROR; } AS_CHECK_STATUS(this->ReleaseRequest(model_name, warmup_handle)); @@ -1211,7 +1409,7 @@ AsStatus AsEngineImpl::WarmupModel(const char* model_name, try { return workers_[i]->Warmup(_bytes_available, _bytes_runtime); } catch (std::exception& e) { - LOG(ERROR) << "AsEngineImpl::WarmupModel: worker " << i + LOG(ERROR) << "AsEngineImpl::WarmupModelInternal_: worker " << i << " warmup failed: " << e.what(); if (std::string(e.what()) == "ALLSPARK_MEMORY_ERROR" || std::string(e.what()) == "ALLSPARK_CACHE_MEMORY_OUT") { @@ -1235,42 +1433,25 @@ AsStatus AsEngineImpl::WarmupModel(const char* model_name, if (ret == AsStatus::ALLSPARK_SUCCESS) { LOG(INFO) << "warm-up: all workers successfully finished!"; } + return ret; } else { LOG(WARNING) << "warm-up: invalid memory usage, min_bytes_available=" << min_bytes_available << ", max_bytes_runtime=" << max_bytes_runtime << ", worker warm-up is skipped"; + return AsStatus::ALLSPARK_SUCCESS; } } -AsStatus AsEngineImpl::StartModel(const char* model_name, bool do_warmup) { - util::as_init_log(); - - // start model - DLOG(INFO) << "[" << model_name << "] " - << "AsEngineImpl::StartModel"; - // create a new model - { - auto name = std::string(model_name); - as_stat_ = std::make_unique(name); - model_state_map_[name] = std::make_shared(name); - model_state_map_[name]->StartLoop(&AsEngineImpl::ModelRunningThread, this, - name, model_state_map_[name]); - } - -#if ENABLE_SPAN_ATTENTION - if (device_ctx_->GetDeviceType() == DeviceType::CUDA) { - if (device_ctx_->GetCacheSpanSize() == 0) { - LOG(INFO) << "StartModel: span cache is disabled, skip warm-up"; - return AsStatus::ALLSPARK_SUCCESS; - } +AsStatus AsEngineImpl::WarmupModel(const char* model_name) { + if (EnvVarConfig::GetInt("ALLSPARK_DISABLE_WARMUP", 0) == 1) { 
+ return AsStatus::ALLSPARK_SUCCESS; } -#endif - // start the thread pool - ExpandRankThreadPool(); + // generate and load fake loras upto limit + auto fake_lora_names = LoadFakeLoras(model_name); // collect mem stats from all workers int64_t min_bytes_available = std::numeric_limits::max(); @@ -1299,6 +1480,8 @@ AsStatus AsEngineImpl::StartModel(const char* model_name, bool do_warmup) { } if (failed_ret != AsStatus::ALLSPARK_SUCCESS) { + UnloadFakeLoras(model_name, + fake_lora_names); // just free fake loras if failed return failed_ret; } else { LOG(INFO) << "StartModel: min available device memory in MB" @@ -1307,14 +1490,44 @@ AsStatus AsEngineImpl::StartModel(const char* model_name, bool do_warmup) { } } - int env_disable_warmup = - EnvVarConfig::GetInt("ALLSPARK_DISABLE_WARMUP", 0) == 1; - if (!do_warmup || env_disable_warmup) { - return AsStatus::ALLSPARK_SUCCESS; - } else { - return WarmupModel(model_name, min_bytes_available); + auto ret = + WarmupModelInternal_(model_name, min_bytes_available, fake_lora_names); + UnloadFakeLoras(model_name, fake_lora_names); + return ret; +} + +AsStatus AsEngineImpl::StartModel(const char* model_name) { + util::as_init_log(); + + // start model + DLOG(INFO) << "[" << model_name << "] " + << "AsEngineImpl::StartModel"; + // create a new model + { + auto name = std::string(model_name); + as_stat_ = std::make_unique(name); + model_state_map_[name] = std::make_shared(name); + model_state_map_[name]->StartLoop(&AsEngineImpl::ModelRunningThread, this, + name, model_state_map_[name]); + } + +#if ENABLE_SPAN_ATTENTION + if (device_ctx_->GetDeviceType() == DeviceType::CUDA) { + if (device_ctx_->GetCacheSpanSize() == 0) { + LOG(INFO) << "StartModel: span cache is disabled, skip warm-up"; + return AsStatus::ALLSPARK_SUCCESS; + } } +#endif + + // start the thread pool + ExpandRankThreadPool(); + + // warmup + auto ret = WarmupModel(model_name); + return ret; } + void AsEngineImpl::ExpandRankThreadPool() { if (nranks_ > 
threadpool_size_) { threadpool_size_ = nranks_ * 2; @@ -1331,17 +1544,22 @@ AsStatus AsEngineImpl::StopModel(const char* model_name) { // TODO: this check is strange assert(model_state_map_[model_name].get() != nullptr); auto& model_state = model_state_map_[model_name]; + CHECK_MODEL_STOPPING(model_state); { - std::unique_lock lock(*(model_state->lock)); + model_state->model_stopping = true; auto msg = EngineControlMessage(EngineControlMessageId::GracefulStopModel, reply_promise); - model_state->msg_queue.enqueue(std::move(msg)); + + LOG(INFO) << "AsEngineImpl:: send model stop message."; + bool succ = model_state->msg_queue.enqueue(std::move(msg)); + if (!succ) { + LOG(ERROR) << "push message queue failed."; + } } model_state->cond_var->notify_all(); auto ret = reply_promise->get_future().get(); - model_state->model_stopping = true; if (ret != AsStatus::ALLSPARK_SUCCESS) { LOG(ERROR) << "[" << model_name << "] " @@ -1391,8 +1609,7 @@ AsStatus AsEngineImpl::ReleaseModel(const char* model_name) { model_state_map_.erase(model_name); } // bfc can reclaim all gpu memory that not in use. 
- // SweepBFCAllocator(); - DestroyBFCAllocator(); + DestroyDeviceContext(); return AsStatus::ALLSPARK_SUCCESS; // TODO; @@ -1527,6 +1744,20 @@ AsStatus AsEngineImpl::StartRequest( << (int)ret; return ret; } + auto lora_name = request_info->config.lora_name; + if (!lora_name.empty()) { + LOG(INFO) << "req lora_name=" << lora_name; + if (!workers_[0]->GetModel()->GetLoraManager()->IsLoraExists(lora_name)) { + LOG(ERROR) << "LoRA " << lora_name << " not found, cannot StartRequest!"; + return AsStatus::ALLSPARK_LORA_NOT_FOUND; + } + + std::lock_guard lora_guard(lora_usage_lock_); + if (!lora_name.empty()) { + lora_use_count_++; + loras_in_use_[model_name].insert(lora_name); + } + } auto reply_promise = std::make_shared>(); @@ -1721,7 +1952,6 @@ AsStatus AsEngineImpl::SyncRequest(const char* model_name, #endif if (request_handle) { // sync one request - std::unique_lock lock(*(model_state->lock)); uuid = request_handle->request_uuid; auto msg = EngineControlMessage(EngineControlMessageId::SyncRequest, reply_promise, uuid); @@ -1733,7 +1963,6 @@ AsStatus AsEngineImpl::SyncRequest(const char* model_name, uuid = ""; auto msg = EngineControlMessage(EngineControlMessageId::SyncAllRequest, reply_promise, uuid); - std::unique_lock lock(*(model_state->lock)); model_state->msg_queue.enqueue(std::move(msg)); } model_state->cond_var->notify_one(); @@ -1760,9 +1989,10 @@ AsStatus AsEngineImpl::SyncRequest(const char* model_name, AsStatus AsEngineImpl::LoadLoraByName(const char* model_name, const char* lora_name) { - DLOG(INFO) << "[" << model_name << "] " - << "LoadLoraByName: " << model_name << "::" << lora_name; std::lock_guard lora_guard(lora_lock_); + DLOG(INFO) << "before load_lora " << lora_name + << ", free space=" << workers_[0]->GetAvailableMemoryBytes(); + if (!lora_name || strlen(lora_name) == 0) { LOG(ERROR) << "[" << model_name << "] " << "LoadLoraByName: Invalid lora_name "; @@ -1770,12 +2000,15 @@ AsStatus AsEngineImpl::LoadLoraByName(const char* model_name, } if 
(workers_[0]->GetModel()->GetLoraManager()->IsLoraExists(lora_name)) { LOG(ERROR) << "LoRA " << lora_name << " already loaded, unload it first!"; - return AsStatus::ALLSPARK_REQUEST_DENIED; + return AsStatus::ALLSPARK_LORA_ALREADY_LOADED; } - if (loras_in_use_.count(model_name) && - loras_in_use_[model_name].count(lora_name)) { - LOG(ERROR) << "LoRA " << lora_name << " in use, cannot load!"; - return AsStatus::ALLSPARK_REQUEST_DENIED; + { + std::lock_guard lora_guard(lora_usage_lock_); + if (loras_in_use_.count(model_name) && + loras_in_use_[model_name].count(lora_name)) { + LOG(ERROR) << "LoRA " << lora_name << " in use, cannot load!"; + return AsStatus::ALLSPARK_LORA_IN_USE; + } } assert(model_state_map_[model_name].get() != nullptr); @@ -1794,7 +2027,9 @@ AsStatus AsEngineImpl::LoadLoraByName(const char* model_name, ret = result[i].get(); AS_CHECK_STATUS(ret); } - + LOG(INFO) << "after load_lora " << lora_name + << ", free space=" << workers_[0]->GetAvailableMemoryBytes(); + workers_[0]->GetModel()->GetLoraManager()->PrintLoras(); return ret; } @@ -1803,6 +2038,10 @@ AsStatus AsEngineImpl::UnloadLoraByName(const char* model_name, DLOG(INFO) << "[" << model_name << "] " << "UnloadLoraByName: " << model_name << "::" << lora_name; std::lock_guard lora_guard(lora_lock_); + LOG(INFO) << "before unload_lora " << lora_name + << ", free space=" << workers_[0]->GetAvailableMemoryBytes(); + workers_[0]->GetModel()->GetLoraManager()->PrintLoras(); + if (!lora_name || strlen(lora_name) == 0) { LOG(ERROR) << "[" << model_name << "] " << "UnloadLoraByName: Invalid lora_name "; @@ -1810,15 +2049,16 @@ AsStatus AsEngineImpl::UnloadLoraByName(const char* model_name, } if (!workers_[0]->GetModel()->GetLoraManager()->IsLoraExists(lora_name)) { LOG(ERROR) << "LoRA " << lora_name << " not found, cannot unload!"; - return AsStatus::ALLSPARK_REQUEST_DENIED; + return AsStatus::ALLSPARK_LORA_NOT_FOUND; } - /* Parallel control logic has moved to the caller framework, by YuChu - if 
(loras_in_use_.count(model_name) && - loras_in_use_[model_name].count(lora_name)) { - LOG(ERROR) << "LoRA " << lora_name << " in use, cannot unload!"; - return AsStatus::ALLSPARK_REQUEST_DENIED; + { + std::lock_guard lora_guard(lora_usage_lock_); + if (loras_in_use_.count(model_name) && + loras_in_use_[model_name].count(lora_name)) { + LOG(ERROR) << "LoRA " << lora_name << " in use, cannot unload!"; + return AsStatus::ALLSPARK_LORA_IN_USE; + } } - */ assert(model_state_map_[model_name].get() != nullptr); auto& model_state = model_state_map_[model_name]; @@ -1841,6 +2081,9 @@ AsStatus AsEngineImpl::UnloadLoraByName(const char* model_name, AS_CHECK_STATUS(ret); } + workers_[0]->GetModel()->GetLoraManager()->PrintLoras(); + LOG(INFO) << "after unload_lora " << lora_name + << ", free space=" << workers_[0]->GetAvailableMemoryBytes(); return ret; } @@ -2232,13 +2475,13 @@ AsStatus AsEngineImpl::StartRequestImpl( DLTensorMap* outputs, GenerateConfig& gen_cfg) { DLOG(INFO) << "[" << model_name << "] " << "AsEngineImpl::RunTextGeneration" << std::endl; - lock_guard_wrapper guard(engine_lock_); DLOG(INFO) << "[" << model_name << "] " << "AsEngineImpl::RunTextGeneration mutex lock passed" << std::endl; TensorMap out_tensors; + // TODO: alloc generated_ids on CPU std::string out_name = "generated_ids"; out_tensors.insert( {out_name, std::make_shared(out_name, DeviceType::CPU, @@ -2471,13 +2714,16 @@ void AsEngineImpl::UpdateResult( } // release lora_name from loras_in_use_ - //if (request->status == AsEngine::GenerateRequestStatus::GenerateFinished) { - // auto lora_name = request->gen_cfg.lora_name; - // std::lock_guard lora_guard(lora_usage_lock_); - // if (loras_in_use_.count(model_name) and !lora_name.empty()) { - // loras_in_use_[model_name].extract(lora_name); - // } - // } + if (request->status == AsEngine::GenerateRequestStatus::GenerateFinished) { + if (lora_use_count_.load() > 0) { + auto lora_name = request->gen_cfg.lora_name; + std::lock_guard 
lora_guard(lora_usage_lock_); + if (loras_in_use_.count(model_name) and !lora_name.empty()) { + loras_in_use_[model_name].extract(lora_name); + lora_use_count_--; + } + } + } } } @@ -2563,16 +2809,21 @@ void AsEngineImpl::ModelRunningThread( // decoupling message decoding phase and model execution phase bool no_execution = false; - int process_msg_size = 0; // Phase 1: message decoding // Pick one control message, handle control message, return control // message promise. // If synchronizing, block any message until finished. + util::Timer t_msg_handle; + +#ifndef ENABLE_CUDA + // this message counter is only for multi-numa syncing. + long processed_msg_counter = 0; +#endif + if (!synchronizing) { TracerLog t(device_ctx_->GetDeviceType(), "LoopHandleMsg", 3); - std::unique_lock lock(*(model_state->lock)); - int cur_msg_size = model_state->msg_queue.size(); + int cur_msg_size = model_state->msg_queue.size_approx(); // XXX: don't check message size, after change to concurrent queue, this // may not accurate. @@ -2580,16 +2831,52 @@ void AsEngineImpl::ModelRunningThread( // NOTE: the front is moved, do not use it anymore EngineControlMessage msg; - bool have_msg = model_state->msg_queue.try_dequeue(msg); - if (!have_msg) { - // skip messasge process stage. - break; + // When there are new messages, take them out, and if the size is + // greater than 0, process the new messages again. If there are no new + // messages, and sync_pending_set equals 0, and it is not in the middle + // of stop model, then skip message processing and continue executing + // the main loop. If there are no new messages and the above check + // fails, then enter the wait queue. 
+ + if (model_state->msg_queue.size_approx() > 0) { + bool got_new_message = model_state->msg_queue.try_dequeue(msg); + if (!got_new_message) { + LOG(ERROR) << "queue size > 0, but not no message"; + goto skip_message_process; + } else { + DLOG(INFO) << "[" << model_name << "] " + << "ModelRunningThread: receive message: " + << ToString(msg.msg); + } + } else { + // message queue size == 0 + // TODO: add handle of gracefully stop. + if (sync_pending_set.size() != 0) { + LOG(INFO) << " pending syncing request, skip message process.."; + goto skip_message_process; + } + + // Note: get unfinished request will require one model running to + // change, so check the current management request too. + if (!graceful_stop_phase && + workers_[0]->GetUnFinishedRequest() == 0 && + model_state->request_handle_map.size() == 0 && + !model_state->model_stopping.load()) { + LOG(INFO) << "ModelRunningThread: EventLoop is going to suspend."; + + model_state->msg_queue.wait_dequeue(msg); + + LOG(INFO) << "ModelRunningThread: EventLoop is going to resume."; + + } else { + goto skip_message_process; + } } - DLOG(INFO) << "[" << model_name << "] " - << "ModelRunningThread: receive message: " - << ToString(msg.msg); + LOG(INFO) << "[" << model_name << "] " + << "ModelRunningThread: receive message: " + << ToString(msg.msg); // dispatch message switch (msg.msg) { @@ -2644,6 +2931,10 @@ void AsEngineImpl::ModelRunningThread( nullptr, msg.request->config); DLOG(INFO) << "[" << model_name << "] " << "RunTextGeneration finish " << t1.elapsed() << " ms"; + if (ret != AsStatus::ALLSPARK_SUCCESS) { + LOG(ERROR) << "ModelRunningThread: Start Request return failed: " + << " uuid: " << msg.request_uuid << " " << (int)ret; + } // start request don't wait for reply from client side code. 
// msg.promise->set_value(ret); @@ -2689,8 +2980,7 @@ void AsEngineImpl::ModelRunningThread( case EngineControlMessageId::ReleaseRequest: { LOG(INFO) << "[" << model_name << "]" - << "Release Request received: " - << msg.request_uuid; + << "Release Request received: " << msg.request_uuid; // before release , stop request // auto ret1 = this->StopRequestByRequestID(model_name.c_str(), // handle_ptr->request_uuid); @@ -2723,7 +3013,7 @@ void AsEngineImpl::ModelRunningThread( // check if it's the final msg if (graceful_stop_phase && workers_[0]->GetUnFinishedRequest() == 0 && - model_state->msg_queue.size() == 0) { + model_state->msg_queue.size_approx() == 0) { LOG(INFO) << "graceful_stop_phase: no unfinished request"; graceful_final_released = true; } @@ -2745,16 +3035,33 @@ void AsEngineImpl::ModelRunningThread( << (int)msg.msg; } } + +#ifndef ENABLE_CUDA + processed_msg_counter++; +#endif } - process_msg_size = cur_msg_size - model_state->msg_queue.size(); } else { DLOG(INFO) << "[" << model_name << "] " << "skipping message handling due to synchronization"; } + skip_message_process: + + util::Timer t_msg_handle_finish; + + auto msg_handle_time_ms = + util::Timer::duration_ms(t_msg_handle, t_msg_handle_finish); + + if (msg_handle_time_ms > 1) { + // usually message handle cost around 0.01-0.1 ms + LOG(INFO) + << "ModelRunningThread: message handle cost too much, time(ms): " + << msg_handle_time_ms; + } + #ifndef ENABLE_CUDA no_execution = workers_[0]->GetDeviceContext()->SemWaitMsgSynInterProcess( - process_msg_size); + processed_msg_counter); #endif // Phase 2: model execution @@ -2859,7 +3166,6 @@ void AsEngineImpl::ModelRunningThread( // Step 2.2: get every running request and put result to the queue. 
{ TracerLog t(device_ctx_->GetDeviceType(), "Loop:Update", 4); - std::unique_lock lock(*(model_state->lock)); // LOG(INFO) << "[" << model_name << "] " // << "now handle_size = " // << model_state->request_handle_map.size(); @@ -2887,8 +3193,11 @@ void AsEngineImpl::ModelRunningThread( // thread will wait on the cond var if the msg queue size is 0. } } + + // if there is no message unfinished task, check whether can enter stop + // state. if (workers_[0]->GetUnFinishedRequest() == 0 && - model_state->msg_queue.size() == 0) { + model_state->msg_queue.size_approx() == 0) { // check if in GracefulStopModel phase if (graceful_stop_phase) { LOG(INFO) << "Enter graceful stop phase."; @@ -2910,28 +3219,17 @@ void AsEngineImpl::ModelRunningThread( "!!!"; graceful_stop_msg.promise->set_value( AsStatus::ALLSPARK_RUNTIME_ERROR); - std::unique_lock lock(*(model_state->lock)); model_state->model_stopped = true; goto loop_end; // use goto is more clear going to end of loop. } if (graceful_final_released) { graceful_stop_msg.promise->set_value(AsStatus::ALLSPARK_SUCCESS); - std::unique_lock lock(*(model_state->lock)); model_state->model_stopped = true; DLOG(INFO) << "All done, gracefully stopped!"; goto loop_end; } } // there is no running task , put our thread into wait. - std::unique_lock lock(*(model_state->lock)); - LOG(INFO) << "[" << model_name << "] " - << "No Running Request, No Control Message, main thread put " - "into sleep: unfnished task: " - << workers_[0]->GetUnFinishedRequest(); - model_state->cond_var->wait(lock, [this, model_state]() { - return this->workers_[0]->GetUnFinishedRequest() != 0 || - model_state->msg_queue.size() > 0; - }); } // if no control message and no running task, wait on task. 
} @@ -3119,11 +3417,11 @@ bool AsEngine::IsAllSparkWorkAsService() { AsModelConfig::AsModelConfig() { #ifdef ENABLE_CUDA prefill_mode = AsMHAPrefill::AsPrefillXformer; -#else +#else // ENABLE_CUDA prefill_mode = AsMHAPrefill::AsPrefillDefault; -#endif +#endif // ENABLE_CUDA - cache_span_size = 16; + cache_span_size = default_span_size; } AsModelConfig::AsModelConfig( @@ -3137,7 +3435,8 @@ AsModelConfig::AsModelConfig( bool enable_prefix_cache, int prefix_cache_ttl, AsMHAPrefill in_prefill_mode, AsCacheMode in_cache_mode, AsEvictionStrategy in_eviction_strategy, - AsSchedulingStrategy in_scheduling_strategy, bool enable_sparsity_matmul) + AsSchedulingStrategy in_scheduling_strategy, bool enable_sparsity_matmul, + int lora_max_rank, int lora_max_num) : model_name(std::move(in_model_name)), model_path(std::move(in_model_path)), weights_path(std::move(in_weights_path)), @@ -3159,14 +3458,16 @@ AsModelConfig::AsModelConfig( eviction_strategy(in_eviction_strategy), text_graph(in_text_graph), scheduling_strategy(in_scheduling_strategy), - enable_sparsity_matmul(enable_sparsity_matmul) { + enable_sparsity_matmul(enable_sparsity_matmul), + lora_max_rank(lora_max_rank), + lora_max_num(lora_max_num) { // replace the defualt setting in header. 
if (in_prefill_mode == AsMHAPrefill::AsPrefillDefault) { #ifdef ENABLE_CUDA prefill_mode = AsMHAPrefill::AsPrefillXformer; -#else +#else // ENABLE_CUDA prefill_mode = AsMHAPrefill::AsPrefillDefault; -#endif +#endif // ENABLE_CUDA } } @@ -3231,12 +3532,11 @@ std::string AsModelConfig::ToString() const { #else result += std::string("\tcache_span_size = ") + std::to_string(cache_span_size) + "\n"; - +#endif // FIXED_SPAN_SIZE result += std::string("\tcache_span_num_init = ") + std::to_string(cache_span_num_init) + "\n"; result += std::string("\tcache_span_num_grow = ") + std::to_string(cache_span_num_grow) + "\n"; -#endif // FIXED_SPAN_SIZE result += std::string("\tenable_prefix_cache = ") + std::to_string(enable_prefix_cache) + "\n"; result += std::string("\tprefix_cache_ttl = ") + @@ -3246,6 +3546,10 @@ std::string AsModelConfig::ToString() const { std::to_string(swap_threshold) + "\n"; result += std::string("\tenable_sparsity_matmul = ") + std::to_string(enable_sparsity_matmul) + "\n"; + result += + std::string("\tlora_max_rank= ") + std::to_string(lora_max_rank) + "\n"; + result += + std::string("\tlora_max_num= ") + std::to_string(lora_max_num) + "\n"; return result; } diff --git a/csrc/common/device_context.h b/csrc/common/device_context.h index 4a3c8ab3..41729672 100644 --- a/csrc/common/device_context.h +++ b/csrc/common/device_context.h @@ -23,6 +23,7 @@ class DeviceContext { public: DeviceContext() = default; virtual ~DeviceContext() {} + virtual void Init() = 0; virtual DeviceType GetDeviceType() const = 0; virtual int GetRank() const = 0; virtual int GetNranks() const = 0; @@ -68,6 +69,9 @@ class DeviceContext { void SetDecoderLayer(int dec_layer) { dec_layer_ = dec_layer; } void SetSizePerHead(int size_per_head) { size_per_head_ = size_per_head; } void SetNumberGroups(int num_groups) { num_groups_ = num_groups; } + void SetIntermediateSize(int intermediate_size) { + intermediate_size_ = intermediate_size; + } void SetUseTorchSample(bool use_torch_sample) 
{ use_torch_sample_ = use_torch_sample; } @@ -127,6 +131,7 @@ class DeviceContext { int GetDecoderLayer() const { return dec_layer_; } int GetSizePerHead() const { return size_per_head_; } int GetNumberGroups() const { return num_groups_; } + int GetIntermediateSize() const { return intermediate_size_; } int GetMaxTopLogprobs() const { return engine_max_top_logprobs; } bool GetUseTorchSample() const { return use_torch_sample_; } AsMHAPrefill GetPrefillMode() const { return prefill_mode_; } @@ -134,6 +139,12 @@ class DeviceContext { AsSchedulingStrategy GetSchedulingStrategy() const { return scheduling_strategy_; } + int GetLoraMaxNum() const { return lora_max_num_; } + void SetLoraMaxNum(int lora_max_num) { lora_max_num_ = lora_max_num; } + int GetLoraMaxRank() const { return lora_max_rank_; } + void SetLoraMaxRank(int lora_max_rank) { lora_max_rank_ = lora_max_rank; } + bool GetLoraEnabled() const { return lora_enabled_; } + void SetLoraEnabled(bool enabled) { lora_enabled_ = enabled; } // decoder fallback weight only which is used in A8W8 bool GetFallbackDecoderWeightOnly() const { return decoder_weight_only_; } void CopyFromOther(const DeviceContext* other_ctx) { @@ -146,6 +157,7 @@ class DeviceContext { SetDecoderLayer(other_ctx->GetDecoderLayer()); SetSizePerHead(other_ctx->GetSizePerHead()); SetNumberGroups(other_ctx->GetNumberGroups()); + SetIntermediateSize(other_ctx->GetIntermediateSize()); SetUseTorchSample(other_ctx->GetUseTorchSample()); SetKVcacheSize(other_ctx->GetKVcacheSize()); SetModelMaxLength(other_ctx->GetModelMaxLength()); @@ -157,6 +169,9 @@ class DeviceContext { SetEvictionStrategy(other_ctx->GetEvictionStrategy()); SetFallbackDecoderWeightOnly(other_ctx->GetFallbackDecoderWeightOnly()); SetSparsityMatmulMode(other_ctx->GetSparsityMatmulMode()); + SetLoraMaxNum(other_ctx->GetLoraMaxNum()); + SetLoraMaxRank(other_ctx->GetLoraMaxRank()); + SetLoraEnabled(other_ctx->GetLoraEnabled()); } private: @@ -171,6 +186,7 @@ class DeviceContext { int 
dec_layer_ = 0; int size_per_head_ = 0; int num_groups_ = 0; + int intermediate_size_ = 0; bool use_torch_sample_ = false; // fallback to A16Wx in decoder phase for A8WX or AF8Wx quantized gemm if // decoder_weight_only_ is true. @@ -178,12 +194,15 @@ class DeviceContext { int precision_ = PrecisionLevel::HIGHEST; #ifdef ENABLE_CUDA AsMHAPrefill prefill_mode_ = AsMHAPrefill::AsPrefillXformer; -#else +#else // ENABLE_CUDA AsMHAPrefill prefill_mode_ = AsMHAPrefill::AsPrefillDefault; -#endif +#endif // ENABLE_CUDA AsEvictionStrategy eviction_strategy_ = AsEvictionStrategy::MaxLength; AsSchedulingStrategy scheduling_strategy_ = AsSchedulingStrategy::ContextPriority; + bool lora_enabled_ = false; + int lora_max_num_ = 2; + int lora_max_rank_ = 64; protected: DataType dtype = DataType::DATATYPE_UNDEFINED; diff --git a/csrc/common/engine_runtime.h b/csrc/common/engine_runtime.h index 160724ca..48b6d1b8 100644 --- a/csrc/common/engine_runtime.h +++ b/csrc/common/engine_runtime.h @@ -16,10 +16,11 @@ #include "allspark.h" #include "core/tensor/tensor.h" #include "engine_control_message.h" +#include "utility/blockingconcurrentqueue.h" #include "utility/concurrentqueue.h" /// @brief if defined, user input span size will be ignored -#define FIXED_SPAN_SIZE 128 +// #define FIXED_SPAN_SIZE 128 /// @brief if defined, span managers run concurrently #define CONFIG_CONCURRENT_SPAN @@ -44,9 +45,9 @@ class ModelControlState final { public: std::string model_name; - moodycamel::ConcurrentQueue msg_queue; + moodycamel::BlockingConcurrentQueue msg_queue; + std::atomic msg_queue_size; - std::unique_ptr lock; std::unique_ptr cond_var; std::unordered_map> @@ -55,8 +56,9 @@ class ModelControlState final { result_queue_map; std::queue> release_request_handle; std::queue> release_request_queue; - bool model_stopping = false; // after GracefulStopModel called... - bool model_stopped = false; // after GracefulStopModel is done. 
+ std::atomic model_stopping = + false; // after GracefulStopModel called... + std::atomic model_stopped = false; // after GracefulStopModel is done. std::shared_ptr weight_handler_; explicit ModelControlState(const std::string& name); @@ -68,6 +70,11 @@ class ModelControlState final { } void StopLoop(); + + private: + // 将拷贝构造函数和拷贝赋值运算符声明为私有 + ModelControlState(const ModelControlState&); + ModelControlState& operator=(const ModelControlState&); }; class AsTensor; diff --git a/csrc/common/engine_worker.cpp b/csrc/common/engine_worker.cpp index a083915e..a71542d6 100644 --- a/csrc/common/engine_worker.cpp +++ b/csrc/common/engine_worker.cpp @@ -16,26 +16,46 @@ namespace allspark { #ifdef ENABLE_CUDA + +thread_local int CudaWorker::last_device_id_of_this_thread_ = -1; + CudaWorker::CudaWorker(int rank, int nranks, const ncclUniqueId& id, int device_id) : Worker(rank, nranks, device_id), nccl_id_(id) { - SetWorkerDeviceId(device_id_); + // don't call virtual function in constructor. device_ctx_ = std::make_unique(); device_ctx_->SetDeviceId(device_id_); } + +void CudaWorker::Init() { SetWorkerDeviceId(device_id_); } + AsStatus CudaWorker::InitCCL(int rank, int nranks) { CUDAContext* cu_ctx_ = (CUDAContext*)(device_ctx_.get()); SetWorkerDeviceId(device_id_); cu_ctx_->InitNCCL(rank, nccl_id_, nranks_); return AsStatus::ALLSPARK_SUCCESS; } -void CudaWorker::SetWorkerDeviceId(int device_id) { cudaSetDevice(device_id); } +void CudaWorker::SetWorkerDeviceId(int device_id) { + DLOG(INFO) << "set worker device id: " << device_id + << " local value: " << last_device_id_of_this_thread_; + if (last_device_id_of_this_thread_ == device_id) { + return; + } else { + cudaSetDevice(device_id); + last_device_id_of_this_thread_ = device_id; + } +} + +CudaWorker::~CudaWorker() { last_device_id_of_this_thread_ = -1; } #endif CpuWorker::CpuWorker(int rank, int nranks, int device_id) : Worker(rank, nranks, device_id) { device_ctx_ = std::make_unique(); } + +void CpuWorker::Init() {} + 
AsStatus CpuWorker::InitCCL(int rank, int nranks) { #ifdef ENABLE_MULTINUMA CPUContext* cpu_ctx = (CPUContext*)(device_ctx_.get()); diff --git a/csrc/common/engine_worker.h b/csrc/common/engine_worker.h index 5f15f683..5d68c629 100644 --- a/csrc/common/engine_worker.h +++ b/csrc/common/engine_worker.h @@ -18,6 +18,8 @@ class Worker { : rank_(rank), nranks_(nranks), device_id_(device_id) {} virtual ~Worker() {} + virtual void Init() = 0; + virtual AsStatus InitCCL(int rank, int nranks) = 0; virtual void SetWorkerDeviceId(int device_id) = 0; @@ -100,16 +102,20 @@ class Worker { class CudaWorker : public Worker { public: CudaWorker(int rank, int nranks, const ncclUniqueId& id, int device_id); + virtual ~CudaWorker(); void SetWorkerDeviceId(int device_id) override; + void Init() override; AsStatus InitCCL(int rank, int nranks) override; private: const ncclUniqueId nccl_id_; + thread_local static int last_device_id_of_this_thread_; }; #endif class CpuWorker : public Worker { public: CpuWorker(int rank, int nranks, int device_id); + void Init() override; void SetWorkerDeviceId(int device_id) override{}; AsStatus InitCCL(int rank, int nranks) override; AsStatus SetNumThreads(int nums) override; diff --git a/csrc/common/result_queue.cpp b/csrc/common/result_queue.cpp index 616d76f6..bc26de62 100644 --- a/csrc/common/result_queue.cpp +++ b/csrc/common/result_queue.cpp @@ -132,21 +132,17 @@ ResultQueueImpl::GenerateElementPtr ResultQueueImpl::Get() { return ret; } -static -void drainAllElements(moodycamel::ConcurrentQueue &queue, - ResultQueueImpl::GenerateElementPtr output) { - +static void drainAllElements( + moodycamel::ConcurrentQueue& queue, + ResultQueueImpl::GenerateElementPtr output) { bool have_ele = false; while (true) { AsEngine::GeneratedElements one_new_token; have_ele = queue.try_dequeue(one_new_token); - if (!have_ele) - break; + if (!have_ele) break; output->AppendNewSingleElementToEnd(one_new_token); } - - } // wait for new data or new status. 
@@ -189,7 +185,6 @@ ResultQueueImpl::GenerateElementPtr ResultQueueImpl::GetWithTimeout( } drainAllElements(store_queue_, total_elements); } else { - // no new token. if (status_ == AsEngine::GenerateRequestStatus::GenerateFinished || status_ == AsEngine::GenerateRequestStatus::GenerateInterrupted || @@ -197,18 +192,17 @@ ResultQueueImpl::GenerateElementPtr ResultQueueImpl::GetWithTimeout( // return nullptr, or throw exception? get_user--; - // if state changed, we should drain the queue in case some missing token. - // since user may not check the queue again. - // sometimes lockless queue don't return all write in one fetch. + // if state changed, we should drain the queue in case some missing + // token. since user may not check the queue again. sometimes lockless + // queue don't return all write in one fetch. drainAllElements(store_queue_, total_elements); return total_elements; } else { - // if in generating state, return token now. if (total_elements && total_elements->ids_from_generate.size() > 0) { get_user--; return total_elements; - } + } spin_count++; diff --git a/csrc/common/thread_pool_with_id.h b/csrc/common/thread_pool_with_id.h index 43e8dc88..92332b40 100644 --- a/csrc/common/thread_pool_with_id.h +++ b/csrc/common/thread_pool_with_id.h @@ -21,42 +21,44 @@ #include #include +#include "../utility/blockingconcurrentqueue.h" #include "thread_utils.h" + +// #define THREAD_POOL_FEATURE_NO_BLOCK + namespace allspark { +// spin lock version for faster execute. 
class ThreadPoolWithID { + constexpr static bool feature_use_spin_queue = true; + public: - ThreadPoolWithID(size_t threads) - : task_queues(threads), - cond_vec(threads), - mutex_vec(threads), - stop(false) { + ThreadPoolWithID(size_t threads) : queues(threads), stop(false) { for (size_t i = 0; i < threads; ++i) + workers.emplace_back([this, i] { - setThreadName(i, "ASThreadPool"); + setThreadName(i, "ASWorker"); for (;;) { std::function task; - { - unique_lock_wrapper lock(this->mutex_vec[i], - "cond.wait"); - this->cond_vec[i].wait(lock.unique_lock_, [this, i] { - return this->stop || !this->task_queues[i].empty(); - }); + if (this->stop.load()) { + LOG(INFO) << "Thread Pool with id: " << i << " Exit!!!"; + return; + } - if (this->stop && this->task_queues[i].empty()) { - LOG(INFO) << "Thread Pool with id: " << i << " Exit!!!"; - return; - } +#ifdef THREAD_POOL_FEATURE_NO_BLOCK + bool have_new_value = this->queues[i].try_dequeue(task); + if (!have_new_value) + continue; + else { + task(); + } - { - // need lock? - task = std::move(this->task_queues[i].front()); +#else + this->queues[i].wait_dequeue(task); - this->task_queues[i].pop(); - } - } task(); +#endif } }); } @@ -65,7 +67,8 @@ class ThreadPoolWithID { auto enqueue(size_t worker_id, F&& f, Args&&... 
args) -> std::future::type> { using return_type = typename std::result_of::type; - if (worker_id >= this->task_queues.size()) + + if (worker_id >= this->queues.size()) throw std::runtime_error("worker submit exceeds thread pool size."); auto task = std::make_shared>( @@ -76,18 +79,16 @@ class ThreadPoolWithID { if (stop) { throw std::runtime_error("enqueue on stopped ThreadPool"); } - unique_lock_wrapper lock(this->mutex_vec[worker_id]); - this->task_queues[worker_id].emplace([task]() { (*task)(); }); + this->queues[worker_id].enqueue([task]() { (*task)(); }); } - cond_vec[worker_id].notify_all(); return res; } ~ThreadPoolWithID() { - stop = true; + stop.store(true); - for (auto& cond : cond_vec) { - cond.notify_all(); + for (auto& q : queues) { + q.enqueue([]() { LOG(INFO) << "dummy message for wake up."; }); } for (std::thread& worker : workers) { @@ -96,13 +97,14 @@ class ThreadPoolWithID { } private: - std::vector workers; - std::vector>> task_queues; - - std::vector cond_vec; - std::vector mutex_vec; +#ifdef THREAD_POOL_FEATURE_NO_BLOCK + std::vector>> queues; +#else + std::vector>> + queues; +#endif - std::mutex queue_mutex; // mutex put task into queue. 
+ std::vector workers; std::atomic stop; }; diff --git a/csrc/core/kernel/CMakeLists.txt b/csrc/core/kernel/CMakeLists.txt index 2657c743..8cda9a95 100644 --- a/csrc/core/kernel/CMakeLists.txt +++ b/csrc/core/kernel/CMakeLists.txt @@ -1,12 +1,12 @@ -set(KERNEL_INC_DIR - ${CMAKE_CURRENT_SOURCE_DIR} - ${THREAD_INCLUDE} - ${PROJECT_SOURCE_DIR}/csrc - ${PROJECT_SOURCE_DIR}/csrc/common - ${PROJECT_SOURCE_DIR}/csrc/device - ${PROJECT_SOURCE_DIR}/third_party/from_source/cutlass/include - ${CMAKE_BINARY_DIR}/csrc -) + set(KERNEL_INC_DIR + ${CMAKE_CURRENT_SOURCE_DIR} + ${THREAD_INCLUDE} + ${PROJECT_SOURCE_DIR}/csrc + ${PROJECT_SOURCE_DIR}/csrc/common + ${PROJECT_SOURCE_DIR}/csrc/device + ${PROJECT_SOURCE_DIR}/third_party/from_source/cutlass/include + ${CMAKE_BINARY_DIR}/csrc + ) if(NOT ALLSPARK_CBLAS MATCHES "NONE") list(APPEND KERNEL_INC_DIR ${CBLAS_INCLUDE_DIR}) @@ -115,8 +115,8 @@ if (ENABLE_CUDA) ) list(APPEND CUDA_COMMON_KERNEL_SRC ${CUDA_COMMON_KERNEL_SRC_TMP}) - file(GLOB_RECURSE CUDA_XFORMER_SRC cuda/xformer_mha/*) - file(GLOB_RECURSE CUDA_MOE_SRC cuda/moe/*.cu) + file(GLOB_RECURSE CUDA_XFORMER_SRC cuda/xformer_mha/*) + file(GLOB_RECURSE CUDA_MOE_SRC cuda/moe/*.cu) list(APPEND KERNEL_SRC ${CUDA_COMMON_KERNEL_SRC} ${CUDA_XFORMER_SRC} ${CUDA_MOE_SRC}) endif() @@ -138,6 +138,7 @@ message(STATUS "ALLSPARK_PUBLIC_DEFINITIONS:${ALLSPARK_PUBLIC_DEFINITIONS}") add_library(allspark_kernel STATIC ${KERNEL_SRC}) set_target_properties(allspark_kernel PROPERTIES CXX_STANDARD ${CXX_STD}) + target_compile_definitions( allspark_kernel PRIVATE ${ALLSPARK_PUBLIC_DEFINITIONS} @@ -150,3 +151,9 @@ target_compile_options(allspark_kernel PUBLIC $<$:${ALLSPA if(ALLSPARK_CBLAS MATCHES "BLIS") add_dependencies(allspark_kernel project_blis) endif() + +if (ENABLE_CUDA) + if(${CUDA_VERSION} VERSION_GREATER_EQUAL "11.8") + add_dependencies(allspark_kernel project_flashattn) + endif() +endif() diff --git a/csrc/core/kernel/cuda/cuda_kernel.h b/csrc/core/kernel/cuda/cuda_kernel.h index 
eb7de6a5..d6d9b4a3 100644 --- a/csrc/core/kernel/cuda/cuda_kernel.h +++ b/csrc/core/kernel/cuda/cuda_kernel.h @@ -58,6 +58,16 @@ void StridedBatchGemmWraper(T* matrix_C, const T* matrix_A, const T* matrix_B, bool transB, int lda, int ldb, int ldc, float alpha, float beta, int batch, const T* bin_res, cublasHandle_t handle, cudaStream_t stream); +template +bool SgmvCutlass(DType* y, const DType* x, const DType** w, const int32_t* s, + const int32_t* ranks, void* tmp_d, bool is_k_tensor, + bool is_n_tensor, int num_problems, int d_in, int d_out, + bool unsplit, int unsplit_n, int max_rank, int CC, + cudaStream_t stream); +template +bool SgmvSplitQKV(DType** out_ptrs, const DType* in, const int32_t* s, + const int32_t* lora_B_ranks, int max_rank, int num_problems, + cudaStream_t stream); template void SoftmaxKernelLauncher(T* qk_buf, const float* mask, int batch_size, int beam_size, int num_heads, int seq_len, int step, diff --git a/csrc/core/kernel/cuda/cuda_util.h b/csrc/core/kernel/cuda/cuda_util.h new file mode 100644 index 00000000..99db5d0d --- /dev/null +++ b/csrc/core/kernel/cuda/cuda_util.h @@ -0,0 +1,14 @@ +/*! + * Copyright (c) Alibaba, Inc. and its affiliates. 
+ * @file cuda_util.h + */ +#pragma once + +namespace allspark { +namespace cuda_util { + +template +void CheckInfNan(const T* d_data, size_t size); + +} +} // namespace allspark diff --git a/csrc/core/kernel/cuda/gemm_lowp/gemm_a16w8_kernel.h b/csrc/core/kernel/cuda/gemm_lowp/gemm_a16w8_kernel.h index 1cf8c948..391e6c94 100644 --- a/csrc/core/kernel/cuda/gemm_lowp/gemm_a16w8_kernel.h +++ b/csrc/core/kernel/cuda/gemm_lowp/gemm_a16w8_kernel.h @@ -152,6 +152,7 @@ void ampere_hgemm_A16W8_perc_f16_f16_64x128x32_mma16816_multistage_nonfused_spli const uint32_t K, void* workspace, const int sm_version, const SplitKParams splitk_params, const float alpha, cudaStream_t stream); + /** * @brief * diff --git a/csrc/core/kernel/cuda/gemm_lowp/gemm_a16w8_perc_kernel.cu b/csrc/core/kernel/cuda/gemm_lowp/gemm_a16w8_perc_kernel.cu index c5e016db..a2afdc3a 100644 --- a/csrc/core/kernel/cuda/gemm_lowp/gemm_a16w8_perc_kernel.cu +++ b/csrc/core/kernel/cuda/gemm_lowp/gemm_a16w8_perc_kernel.cu @@ -1555,8 +1555,8 @@ void ampere_hgemm_A16W8_perc_f16_f16_64x128x32_mma16816_multistage_nonfused_spli grid_z, alpha, stream); } -static constexpr int WARP_SIZE = 32; +// Rearrange B to facilitate Ampere Tensor Core load data // reorder B from (K, N) to (N_32align / 4, K * 4) // K % 16 == 0, N % 16 == 0, N_32align % 32 == 0 template @@ -1817,6 +1817,7 @@ ampere_hgemm_A16W8_perc_f16_f16_64x128x32_mma16816_multistage_nonfused_splitk< const SplitKParams, const float, cudaStream_t); + template void rearrange_kn_weight_as_n32k16_order_ldg16( const int8_t* B, const half* B_scale, const half* B_zero, int8_t* B_result, half* B_scale_result, half* B_zero_result, const int K, const int N, diff --git a/csrc/core/kernel/cuda/gemm_lowp/gemm_lowp_common.h b/csrc/core/kernel/cuda/gemm_lowp/gemm_lowp_common.h index cacdbefe..639f29b4 100644 --- a/csrc/core/kernel/cuda/gemm_lowp/gemm_lowp_common.h +++ b/csrc/core/kernel/cuda/gemm_lowp/gemm_lowp_common.h @@ -80,6 +80,7 @@ struct SM8x_GEMM_A16W8_Params { 
TileSchedule schedule_mn; }; + template struct GEMM_A16W8_Params { const FType* A_ptr; diff --git a/csrc/core/kernel/cuda/gemm_lowp/gemm_lowp_utils.cuh b/csrc/core/kernel/cuda/gemm_lowp/gemm_lowp_utils.cuh index 8e0246d8..17764cf3 100644 --- a/csrc/core/kernel/cuda/gemm_lowp/gemm_lowp_utils.cuh +++ b/csrc/core/kernel/cuda/gemm_lowp/gemm_lowp_utils.cuh @@ -866,6 +866,7 @@ __device__ __forceinline__ void mma_h884(const uint32_t& a0, const uint32_t& a1, #endif } + static constexpr int WARP_SIZE = 32; /** diff --git a/csrc/core/kernel/cuda/moe/moe_kernel.cu b/csrc/core/kernel/cuda/moe/moe_kernel.cu index dc0d8029..fbf264f0 100644 --- a/csrc/core/kernel/cuda/moe/moe_kernel.cu +++ b/csrc/core/kernel/cuda/moe/moe_kernel.cu @@ -21,8 +21,8 @@ void GetWorkspaceSize(size_t* hostWsSize, size_t* deviceWsSize, uint32_t m, template <> void MoeBatchedGemmLauncher( const float* A, const float* B, const uint32_t* matBIndices, float* C, - uint32_t* matCRowIndices, void* hostWs, size_t hostWsSize, void* deviceWs, - size_t deviceWsSize, uint32_t matARows, uint32_t n, uint32_t k, + uint32_t* matCRowIndices, void* hostWs, size_t hostWsSize, void* + deviceWs, size_t deviceWsSize, uint32_t matARows, uint32_t n, uint32_t k, uint32_t nMatB, uint32_t nMatBPerMatARow, cudaStream_t stream) { // TODO } @@ -30,8 +30,8 @@ void MoeBatchedGemmLauncher( template <> void MoeBatchedGemmLauncher( const half* A, const half* B, const uint32_t* matBIndices, half* C, - uint32_t* matCRowIndices, void* hostWs, size_t hostWsSize, void* deviceWs, - size_t deviceWsSize, uint32_t matARows, uint32_t n, uint32_t k, + uint32_t* matCRowIndices, void* hostWs, size_t hostWsSize, void* + deviceWs, size_t deviceWsSize, uint32_t matARows, uint32_t n, uint32_t k, uint32_t nMatB, uint32_t nMatBPerMatARow, cudaStream_t stream) { // TODO } @@ -39,10 +39,11 @@ void MoeBatchedGemmLauncher( #ifdef ENABLE_BF16 template <> void MoeBatchedGemmLauncher( - const hie::bfloat16* A, const hie::bfloat16* B, const uint32_t* matBIndices, - 
hie::bfloat16* C, uint32_t* matCRowIndices, void* hostWs, size_t hostWsSize, - void* deviceWs, size_t deviceWsSize, uint32_t matARows, uint32_t n, - uint32_t k, uint32_t nMatB, uint32_t nMatBPerMatARow, cudaStream_t stream) { + const hie::bfloat16* A, const hie::bfloat16* B, const uint32_t* + matBIndices, hie::bfloat16* C, uint32_t* matCRowIndices, void* hostWs, + size_t hostWsSize, void* deviceWs, size_t deviceWsSize, uint32_t + matARows, uint32_t n, uint32_t k, uint32_t nMatB, uint32_t + nMatBPerMatARow, cudaStream_t stream) { // TODO } #endif diff --git a/csrc/core/kernel/cuda/moe/moe_kernel.h b/csrc/core/kernel/cuda/moe/moe_kernel.h index 5e3fb256..5d683a4f 100644 --- a/csrc/core/kernel/cuda/moe/moe_kernel.h +++ b/csrc/core/kernel/cuda/moe/moe_kernel.h @@ -16,11 +16,12 @@ void GetWorkspaceSize(size_t* hostWsSize, size_t* deviceWsSize, uint32_t m, uint32_t nMatB); template -void MoeBatchedGemmLauncher(const T* A, const T* B, const uint32_t* matBIndices, +void MoeBatchedGemmLauncher(const T* A, const T* B, const uint32_t* +matBIndices, T* C, uint32_t* matCRowIndices, void* hostWs, size_t hostWsSize, void* deviceWs, - size_t deviceWsSize, uint32_t matARows, uint32_t n, - uint32_t k, uint32_t nMatB, - uint32_t nMatBPerMatARow, cudaStream_t stream); + size_t deviceWsSize, uint32_t matARows, uint32_t + n, uint32_t k, uint32_t nMatB, uint32_t + nMatBPerMatARow, cudaStream_t stream); } // namespace cuda } // namespace allspark \ No newline at end of file diff --git a/csrc/core/kernel/cuda/sgmv.cu b/csrc/core/kernel/cuda/sgmv.cu new file mode 100644 index 00000000..ea2b2e6f --- /dev/null +++ b/csrc/core/kernel/cuda/sgmv.cu @@ -0,0 +1,238 @@ +/*! + * Copyright (c) Alibaba, Inc. and its affiliates. 
+ * @file sgmv.cu + */ + +/* + * SgmvCutlass source code copied from: + * Punica https://github.com/punica-ai/punica + */ + +#include +#include + +#include + +#include "cuda_kernel.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_grouped.h" +#include "cutlass/gemm/kernel/default_gemm_grouped.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +namespace allspark { +namespace cuda { + +template +struct cutlass_dtype { + using type = T; +}; + +template <> +struct cutlass_dtype { + using type = cutlass::half_t; +}; + +template <> +struct cutlass_dtype { + using type = cutlass::bfloat16_t; +}; + +template +__global__ void precompute_sgmv_args( + cutlass::gemm::GemmCoord* all_problems, T** ptr_y, T** ptr_x, T** ptr_w, + int64_t* ld_y, int64_t* ld_x, int64_t* ld_w, T* y, T* x, T** w, + const int32_t* s, const int32_t* ranks, bool is_k_tensor, bool is_n_tensor, + int d_in, int d_out, bool unsplit, int unsplit_n, int max_rank) { + int i = blockIdx.x; + int m = s[i * 2 + 1] - s[i * 2]; + int k = d_in; + int n = ranks[i]; + if (is_k_tensor == true) { + // lora_B + k = ranks[i]; + n = d_out; + if (unsplit == true) { + // attention.self + ptr_x[i] = x + s[i * 2] * (max_rank / 3); + ptr_y[i] = y + s[i * 2] * unsplit_n; + } else { + ptr_x[i] = x + s[i * 2] * max_rank; + ptr_y[i] = y + s[i * 2] * n; + } + } else if (is_n_tensor == true) { + // lora_A + k = d_in; + n = ranks[i]; + ptr_x[i] = x + s[i * 2] * k; + ptr_y[i] = y + s[i * 2] * max_rank; + } + // 已经在父函数处理了else的异常情况 + + all_problems[i] = cutlass::gemm::GemmCoord(m, n, k); + ptr_w[i] = w[i]; + if (unsplit == true) { + ld_x[i] = k; + ld_w[i] = unsplit_n; + ld_y[i] = unsplit_n; + } else { + ld_x[i] = k; + ld_w[i] = n; + ld_y[i] = n; + } +} + +size_t sgmv_tmp_size(int num_problems) { + constexpr auto sz = sizeof(void*) * 3 + sizeof(int64_t) * 3 + + sizeof(cutlass::gemm::GemmCoord); + return sz * num_problems; +} + +template +inline T* alloc_from_buf(void** buf, int n) { + auto* p = 
(T*)*buf; + *buf = (void*)(p + n); + return p; +} + +template +bool SgmvCutlass(DType* y, const DType* x, const DType** w, const int32_t* s, + const int32_t* ranks, void* tmp_d, bool is_k_tensor, + bool is_n_tensor, int num_problems, int d_in, int d_out, + bool unsplit, int unsplit_n, int max_rank, int CC, + cudaStream_t stream) { + using cutlass_t = typename cutlass_dtype::type; + + auto ptr_Y = alloc_from_buf(&tmp_d, num_problems); + auto ptr_X = alloc_from_buf(&tmp_d, num_problems); + auto ptr_W = alloc_from_buf(&tmp_d, num_problems); + auto ld_Y = alloc_from_buf(&tmp_d, num_problems); + auto ld_X = alloc_from_buf(&tmp_d, num_problems); + auto ld_W = alloc_from_buf(&tmp_d, num_problems); + auto all_problems = + alloc_from_buf(&tmp_d, num_problems); + AS_ENFORCE((is_k_tensor == true || is_n_tensor == true) && + (is_k_tensor != is_n_tensor)); + AS_ENFORCE((unsplit == false) || (is_k_tensor == unsplit)); + + precompute_sgmv_args<<>>( + all_problems, ptr_Y, ptr_X, ptr_W, ld_Y, ld_X, ld_W, (cutlass_t*)y, + (cutlass_t*)x, (cutlass_t**)w, s, ranks, is_k_tensor, is_n_tensor, d_in, + d_out, unsplit, unsplit_n, max_rank); + + using cutlass::epilogue::thread::LinearCombination; + using cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle; + using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped< + cutlass_t, // Element A + cutlass::layout::RowMajor, // Layout A + cutlass::ComplexTransform::kNone, // + 8, // Granularity A + cutlass_t, // Element B + cutlass::layout::RowMajor, // Layout B + cutlass::ComplexTransform::kNone, // + 8, // Granularity B + cutlass_t, // Element C&D + cutlass::layout::RowMajor, // Layout C&D + float, // Element Accumulator + cutlass::arch::OpClassTensorOp, // Operator Class Tag + cutlass::arch::Sm80, // Architecture + cutlass::gemm::GemmShape<32, 64, 64>, // Thread Block Shape + cutlass::gemm::GemmShape<16, 32, 64>, // Warp Shape + cutlass::gemm::GemmShape<16, 8, 16>, // Instruction Shape + LinearCombination, // Epilogue + 
GemmIdentityThreadblockSwizzle<>, // Swizzling Operator + 6 // Stages + >::GemmKernel; + + using EpilogueOutputOp = typename GemmKernel::Epilogue::OutputOp; + typename EpilogueOutputOp::Params epilogue_op(1.0, 0.0); + // alpha: 1.0, beta: 0.0 + // cutlass gemm operation: ptr_Y = alpha * ptr_X * ptr_W + beta * ptr_Y + // ptr_Y[i] is not set to zero so beta must be 0.0 + + using GemmGrouped = cutlass::gemm::device::GemmGrouped; + typename GemmGrouped::Arguments args(all_problems, num_problems, 512, + epilogue_op, ptr_X, ptr_W, ptr_Y, ptr_Y, + ld_X, ld_W, ld_Y, ld_Y); + + GemmGrouped gemm; + + if (CC >= 80) { + auto status = gemm.initialize(args, nullptr, stream); + if (status != cutlass::Status::kSuccess) { + throw AsException("SgmvCutlass gemm.initialize failed: " + + std::string(cutlassGetStatusString(status)) + "\n"); + } + status = gemm.run(stream); + if (status != cutlass::Status::kSuccess) { + throw AsException("SgmvCutlass gemm.run failed: " + + std::string(cutlassGetStatusString(status)) + "\n"); + } + } else { + throw AsException("Compute capability not supported for SgmvCutlass"); + } + return true; +} + +template bool SgmvCutlass(half* y, const half* x, const half** w, + const int32_t* s, const int32_t* ranks, + void* tmp_d, bool is_k_tensor, bool is_n_tensor, + int num_problems, int d_in, int d_out, + bool unsplit, int unsplit_n, int max_rank, + int CC, cudaStream_t stream); + +template bool SgmvCutlass( + hie::bfloat16* y, const hie::bfloat16* x, const hie::bfloat16** w, + const int32_t* s, const int32_t* ranks, void* tmp_d, bool is_k_tensor, + bool is_n_tensor, int num_problems, int d_in, int d_out, bool unsplit, + int unsplit_n, int max_rank, int CC, cudaStream_t stream); + +template +__global__ void do_split_qkv(T** out_ptrs, T* in, const int32_t* s, + const int32_t* lora_B_ranks, int max_rank) { + int problem_idx = blockIdx.x / 3; + int qkv_idx = blockIdx.x % 3; + int rank = lora_B_ranks[problem_idx]; + T* out = out_ptrs[qkv_idx] + (max_rank / 3) * 
s[problem_idx * 2]; + T* in_qkv = in + max_rank * s[problem_idx * 2] + qkv_idx * rank; + int batch = s[problem_idx * 2 + 1] - s[problem_idx * 2]; + + for (int i = 0; i < batch; i++) { + for (int j = 0; j < rank; j++) { + *out = *in_qkv; + out++; + in_qkv++; + } + in_qkv += 2 * rank; // skip to next q/k/v + } +} + +template +bool SgmvSplitQKV(DType** out_ptrs, const DType* in, const int32_t* s, + const int32_t* lora_B_ranks, int max_rank, int num_problems, + cudaStream_t stream) { + using cutlass_t = typename cutlass_dtype::type; + do_split_qkv<<>>( + (cutlass_t**)out_ptrs, (cutlass_t*)in, s, lora_B_ranks, max_rank); + return true; +} + +template bool SgmvSplitQKV(float** out_ptrs, const float* in, + const int32_t* s, const int32_t* lora_B_ranks, + int max_rank, int num_problems, + cudaStream_t stream); + +template bool SgmvSplitQKV(half** out_ptrs, const half* in, + const int32_t* s, const int32_t* lora_B_ranks, + int max_rank, int num_problems, + cudaStream_t stream); + +template bool SgmvSplitQKV(hie::bfloat16** out_ptrs, + const hie::bfloat16* in, + const int32_t* s, + const int32_t* lora_B_ranks, + int max_rank, int num_problems, + cudaStream_t stream); + +} // namespace cuda +} // namespace allspark diff --git a/csrc/core/kernel/cuda/validate.cu b/csrc/core/kernel/cuda/validate.cu new file mode 100644 index 00000000..fcd33117 --- /dev/null +++ b/csrc/core/kernel/cuda/validate.cu @@ -0,0 +1,81 @@ +/*! + * Copyright (c) Alibaba, Inc. and its affiliates. 
+ * @file validate.cu + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cuda_common.h" // NOLINT + +namespace allspark { +namespace cuda_util { + +// 模板函数 ToFloat +template +__device__ float ToFloat(T x); + +// 特化 __half +template <> +__device__ float ToFloat<__half>(__half x) { + return __half2float(x); +} + +// 特化 __nv_bfloat16 +#ifdef ENABLE_BF16 +template <> +__device__ float ToFloat<__nv_bfloat16>(__nv_bfloat16 x) { + return __bfloat162float(x); +} +#endif + +// 检查 inf 和 nan 的函数 +template +void CheckInfNan(const T* d_data, size_t size_in_bytes) { + size_t size = size_in_bytes / 2; + size_t batch_size = 1024 * 1024; + size_t num_batches = (size + batch_size - 1) / batch_size; + for (size_t i = 0; i < num_batches; ++i) { + size_t start = i * batch_size; + size_t end = std::min(start + batch_size, size); + // 分批处理数据 + thrust::device_vector d_vec(d_data + start, d_data + end); + // 使用 thrust::transform 和 thrust::reduce 来检查 inf 和 nan + thrust::device_vector has_inf(end - start); + thrust::device_vector has_nan(end - start); + thrust::transform(d_vec.begin(), d_vec.end(), has_inf.begin(), + [] __device__(T x) { + float f = ToFloat(x); + return isinf(f); + }); + thrust::transform(d_vec.begin(), d_vec.end(), has_nan.begin(), + [] __device__(T x) { + float f = ToFloat(x); + return isnan(f); + }); + bool batch_has_inf = thrust::reduce(has_inf.begin(), has_inf.end(), false, + thrust::logical_or()); + bool batch_has_nan = thrust::reduce(has_nan.begin(), has_nan.end(), false, + thrust::logical_or()); + if (batch_has_inf) throw std::domain_error("INF number exists!"); + if (batch_has_nan) throw std::domain_error("NAN exists!"); + } +} + +// 显式实例化 +template void CheckInfNan<__half>(const __half* d_data, size_t size_in_bytes); + +#ifdef ENABLE_BF16 +template void CheckInfNan<__nv_bfloat16>(const __nv_bfloat16* d_data, + size_t size_in_bytes); +#endif + +} // namespace cuda_util +} // namespace allspark diff --git 
a/csrc/core/model/model.cpp b/csrc/core/model/model.cpp index 54902584..a9630365 100644 --- a/csrc/core/model/model.cpp +++ b/csrc/core/model/model.cpp @@ -81,7 +81,7 @@ AsModel::AsModel(const std::string& model_type) gen_ctx_ = std::make_unique(); runtime_ctx_ = std::make_unique(); - // pre-alloc enough request space. + // pre-alloc enough request space. all_request_map_.reserve(1000); } @@ -143,12 +143,12 @@ AsStatus AsModel::Init(const TransformerProto& model_proto, AS_CHECK_STATUS( weight_manager_->LoadWeightForModel(ctx, weight_handler_, rankInfo)); // load LoRA - lora_manager_ = - LoraManager::Create(rankInfo); // 每个卡上的AsModel都拥有一批LoRA auto& model_cfg = weight_handler_->GetModelConfig(); - for (auto& lora_name_or_path : model_cfg.lora_names) { - LoadLoraByName(lora_name_or_path); - // lora_manager_->SwapOutWeight(lora_weight_handle, RankInfo()); + lora_manager_ = LoraManager::Create( + model_cfg.lora_max_num, rankInfo); // 每个卡上的AsModel都拥有一批LoRA + if (model_cfg.lora_names.size() > 0) { + LOG(WARNING) + << "Config item 'lora_names' is not any longer supported and ignored!"; } #if ENABLE_SPAN_ATTENTION @@ -169,9 +169,11 @@ AsStatus AsModel::Init(const TransformerProto& model_proto, int num_cache_heads = ctx.GetNumberGroups() > 0 ? 
ctx.GetNumberGroups() : ctx.GetNumberHeads(); +#ifdef ENABLE_CUDA if (device_type == DeviceType::CUDA) { cache_allocator_ = std::make_shared(ctx_); } +#endif #ifdef CONFIG_CONCURRENT_SPAN if (ctx.GetCacheSpanNumGrow() != 0) { @@ -756,7 +758,7 @@ AsStatus AsModel::StartRequest(std::shared_ptr request) { StopRequest(request->request_id); request->status = AsEngine::GenerateRequestStatus::GenerateFinished; request->finish = true; - return AsStatus::ALLSPARK_LORA_NOT_LOADED; + return AsStatus::ALLSPARK_LORA_NOT_FOUND; } } @@ -1567,9 +1569,15 @@ AsStatus AsModel::Warmup(int64_t bytes_available, int64_t bytes_runtime) { return AsStatus::ALLSPARK_PARAM_ERROR; } - constexpr float runtime_mem_ratio = 1.1; + float runtime_mem_ratio = 1.1; + // sgmv op在load、unload时仍有cuda mem小幅波动,多留些余量以免波动出OOM + // 波动的原因与BFC释放和重新回收mem有关 + if (ctx_->GetLoraEnabled()) { + runtime_mem_ratio = 1.5; + } LOG(INFO) << "warm-up: runtime memory reservation ratio: " << runtime_mem_ratio; + const int64_t bytes_cache = std::max( 0L, bytes_available - static_cast( std::ceil(bytes_runtime * runtime_mem_ratio))); @@ -1730,6 +1738,11 @@ AsStatus AsModel::LoadLoraByName(const std::string& lora_name_or_path) { nullptr); // 必须AsModel::Init()之后才能调用LoadLoraByName AsModelConfig lora_cfg = weight_handler_->GetModelConfig(); // copy cfg from base-model + + if (lora_manager_->GetNumLoras() >= lora_cfg.lora_max_num) { + LOG(ERROR) << "lora number exceeds limit: " << lora_cfg.lora_max_num; + return AsStatus::ALLSPARK_LORA_NUM_EXCEED_LIMIT_ERROR; + } auto lora_path_obj = util::Path(lora_cfg.weights_path); auto lora_dir = lora_path_obj.parent_path(); auto lora_name = lora_name_or_path; @@ -1752,14 +1765,17 @@ AsStatus AsModel::LoadLoraByName(const std::string& lora_name_or_path) { lora_cfg.model_name = lora_name; lora_cfg.weights_path = lora_path; lora_cfg.model_path = ""; // lora不使用该字段, (no graph for lora) - lora_cfg.is_lora = true; + lora_cfg.is_lora_cfg = true; lora_cfg.lora_names.clear(); // lora should NOT have any 
sub-loras... auto& lora_weight_handle = lora_manager_->RegisterLora(lora_cfg); WeightSwapConfig swap_config; - swap_config.enable = true; + swap_config.enable = + false; // 由调用方来显式load_lora/unload_lora,所以对于lora禁用swap,来提升加载速度 lora_manager_->SetSwapConfig(lora_weight_handle, swap_config); RankInfo rank_info = GetRankInfo(); - lora_manager_->LoadWeightForModel(*ctx_, lora_weight_handle, rank_info); + ret = lora_manager_->LoadWeightForModel(*ctx_, lora_weight_handle, rank_info); + if (ret != AsStatus::ALLSPARK_SUCCESS) + lora_manager_->UnRegisterLora(lora_name); // rollback return ret; } diff --git a/csrc/core/operator/general/gemm/gemm_op_gpu.cpp b/csrc/core/operator/general/gemm/gemm_op_gpu.cpp index 0dfb8c1b..0f762273 100644 --- a/csrc/core/operator/general/gemm/gemm_op_gpu.cpp +++ b/csrc/core/operator/general/gemm/gemm_op_gpu.cpp @@ -42,6 +42,48 @@ AsStatus hgemm_32x128x16_simt_Aldg1_kernel_launcher( return AsStatus::ALLSPARK_SUCCESS; } +AsStatus dense_gemm_rawptr(DataType dtype, void* out, const void* in, + const void* bias, const void* weight, int m, int n, + int k, int lda, int ldb, int ldc, bool transA, + bool transB, int batch, float alpha, + const void* binary_in, UnaryType activation, + const DeviceContext* ctx) { + const CUDAContext* gpu_ctx = static_cast(ctx); + cublasHandle_t cublas_handle = gpu_ctx->GetCublasHandle(); + cudaStream_t cu_stream = gpu_ctx->GetStream(); + if (binary_in != nullptr && bias != nullptr) { + LOG(ERROR) << "binary_in and bias cannot be used at the same time"; + AS_THROW(AsStatus::ALLSPARK_PARAM_ERROR); + } + if (gpu_ctx->GetMatmulPrecision() == PrecisionLevel::HIGH && + dtype == FLOAT32) { + cublasSetMathMode(cublas_handle, CUBLAS_TF32_TENSOR_OP_MATH); + } + auto functor = [&]() { + T* typed_out = static_cast(out); + const T* typed_input = static_cast(in); + const T* typed_bias = static_cast(bias); + const T* typed_weight = static_cast(weight); + const T* typed_binary_in = static_cast(binary_in); + if (batch == 1) { + 
cuda::GemmWraper(typed_out, typed_input, typed_weight, typed_bias, m, + n, k, transA, transB, lda, ldb, ldc, alpha, 0.0f, + typed_binary_in, cublas_handle, cu_stream); + } else { + cuda::StridedBatchGemmWraper( + typed_out, typed_input, typed_weight, typed_bias, m, n, k, false, + transB, lda, ldb, ldc, alpha, 0.0f, batch, typed_binary_in, + cublas_handle, cu_stream); + } + if (activation != UNARYTYPE_UNDEFINED) { + cuda::UnaryKernelLauncher(typed_out, typed_out, (int64_t)m * n * batch, + activation, cu_stream); + } + }; + DispatchCUDA(dtype, functor); + return AsStatus::ALLSPARK_SUCCESS; +} + AsStatus dense_gemm(DataType dtype, void* out, const void* in, const void* bias, const AsTensor* weight, int m, int n, int k, int lda, int ldb, int ldc, bool transA, bool transB, int batch, diff --git a/csrc/core/operator/general/gemm_lora/gemm_capsule_op_gpu.cpp b/csrc/core/operator/general/gemm_lora/gemm_capsule_op_gpu.cpp index 02d2e659..57935de7 100644 --- a/csrc/core/operator/general/gemm_lora/gemm_capsule_op_gpu.cpp +++ b/csrc/core/operator/general/gemm_lora/gemm_capsule_op_gpu.cpp @@ -20,11 +20,7 @@ AsStatus GemmLoraCapsuleOpGPU::InitV2(const OperatorProto& op_proto, const TensorMap& weights_map, TensorMap& weights_buffer, TensorMap* tensor_map) { - // DLOG(INFO) << "GemmLoraCapsuleOpGPU::InitV2" << std::endl; - // for easily checking if lora op is available in model - static std::once_flag lora_enabled_print_once; - std::call_once(lora_enabled_print_once, - [] { LOG(INFO) << "lora enabled!" 
<< std::endl; }); + DLOG(INFO) << "GemmLoraCapsuleOpGPU::InitV2" << std::endl; // Capsule's special Init // taken from AsOperator::Init @@ -51,7 +47,8 @@ AsStatus GemmLoraCapsuleOpGPU::InitV2(const OperatorProto& op_proto, // taken from AsOperator::Init END std::string act_str = ""; - OperatorProto mutable_op_proto = op_proto; + OperatorProto mutable_op_proto; + mutable_op_proto.CopyFrom(op_proto); const auto& orig_attr_map = op_proto.attr(); if (orig_attr_map.count("InnerGemmType")) { @@ -59,6 +56,8 @@ AsStatus GemmLoraCapsuleOpGPU::InitV2(const OperatorProto& op_proto, } // 组装Lora op_list + std::string op_name_pattern = + util::StringUtil::RemoveLayerNumber(op_proto.op_name()); // base auto op = OpFactory::getInstance().GetOperator( {inner_gemm_type_, ctx.GetDeviceType()})(); @@ -66,7 +65,9 @@ AsStatus GemmLoraCapsuleOpGPU::InitV2(const OperatorProto& op_proto, auto& attr_map = *mutable_op_proto.mutable_attr(); auto input = mutable_op_proto.mutable_inputs(0); auto output = mutable_op_proto.mutable_outputs(0); - base_out_name_ = output->name(); + capsule_out_name_ = std::string(output->name()); + base_out_name_ = op_name_pattern + ".base_gemm.out"; + output->set_name(base_out_name_); if (attr_map.count("activation")) { activation_ = *(UnaryType*)(attr_map.at("activation").c_str()); act_str = attr_map.at("activation"); @@ -76,6 +77,7 @@ AsStatus GemmLoraCapsuleOpGPU::InitV2(const OperatorProto& op_proto, rank_info_, tensor_map, profiler_); lora_op_list_.emplace_back(std::move(op)); +#if 0 // 老算子 // Lora_A op = OpFactory::getInstance().GetOperator({"GemmLora", ctx.GetDeviceType()})(); @@ -88,7 +90,7 @@ AsStatus GemmLoraCapsuleOpGPU::InitV2(const OperatorProto& op_proto, } auto weight = mutable_op_proto.mutable_weights(0); weight->set_name(mutable_op_proto.op_name() + ".weight"); - output->set_name(mutable_op_proto.op_name() + ".out"); + output->set_name(op_name_pattern + ".lora_A.out"); attr_map.erase("alpha"); // loraA 没有伸缩 op->CallInit(mutable_op_proto, ctx, 
weight_manager_, weight_handler_, lora_manager_, rank_info_, tensor_map, profiler_); @@ -101,11 +103,36 @@ AsStatus GemmLoraCapsuleOpGPU::InitV2(const OperatorProto& op_proto, mutable_op_proto.set_op_name(op_proto.op_name() + ".lora_B"); input->set_name(output->name()); weight->set_name(mutable_op_proto.op_name() + ".weight"); - output->set_name(mutable_op_proto.op_name() + ".out"); + output->set_name(op_name_pattern + ".lora_B.out"); attr_map.erase("alpha"); // 优化aslora后,使用默认1.0 op->CallInit(mutable_op_proto, ctx, weight_manager_, weight_handler_, lora_manager_, rank_info_, tensor_map, profiler_); lora_op_list_.emplace_back(std::move(op)); +#endif + +#if 1 + // 采用新算子 + op = + OpFactory::getInstance().GetOperator({"SgmvLora", ctx.GetDeviceType()})(); + mutable_op_proto.set_op_type("SgmvLora"); + mutable_op_proto.set_op_name(op_proto.op_name() + ".sgmv"); + auto weights = mutable_op_proto.mutable_weights(); + if (weights->size() == 1) { + // sgmv需要lora_A、lora_B两个weight + TensorProto weight2; + weight2.set_name(op_proto.op_name()); // basic name + weight2.set_data(""); + mutable_op_proto.mutable_weights()->AddAllocated(new TensorProto(weight2)); + } + mutable_op_proto.mutable_weights(0)->set_name(op_proto.op_name() + + ".lora_A.weight"); + mutable_op_proto.mutable_weights(1)->set_name(op_proto.op_name() + + ".lora_B.weight"); + output->set_name(op_name_pattern + ".sgmv.out"); + op->CallInit(mutable_op_proto, ctx, weight_manager_, weight_handler_, + lora_manager_, rank_info_, tensor_map, profiler_); + lora_op_list_.emplace_back(std::move(op)); +#endif // Binary-Add op = OpFactory::getInstance().GetOperator({"Binary", ctx.GetDeviceType()})(); @@ -116,10 +143,9 @@ AsStatus GemmLoraCapsuleOpGPU::InitV2(const OperatorProto& op_proto, input2.set_name(base_out_name_); input2.set_data(""); mutable_op_proto.mutable_inputs()->AddAllocated(new TensorProto(input2)); - auto output_name = activation_ == UNARYTYPE_UNDEFINED - ? 
op_proto.op_name() - : mutable_op_proto.op_name(); - bin_add_out_name_ = output_name + ".out"; + bin_add_out_name_ = activation_ == UNARYTYPE_UNDEFINED + ? capsule_out_name_ + : op_name_pattern + ".base_add_lora.out"; output->set_name(bin_add_out_name_); mutable_op_proto.clear_weights(); attr_map.clear(); @@ -138,8 +164,8 @@ AsStatus GemmLoraCapsuleOpGPU::InitV2(const OperatorProto& op_proto, mutable_op_proto.mutable_inputs()->DeleteSubrange( 1, mutable_op_proto.mutable_inputs()->size() - 1); // 恢复成1个输入 - input->set_name(output->name()); - output->set_name(op_proto.op_name() + ".out"); + input->set_name(bin_add_out_name_); + output->set_name(capsule_out_name_); attr_map.clear(); attr_map["unary_type"] = act_str; op->CallInit(mutable_op_proto, ctx, weight_manager_, weight_handler_, @@ -151,12 +177,23 @@ AsStatus GemmLoraCapsuleOpGPU::InitV2(const OperatorProto& op_proto, } void GemmLoraCapsuleOpGPU::SwitchLoraGraph(bool use_std_gemm_graph) { - if (use_std_gemm_graph) { - if (activation_ != UNARYTYPE_UNDEFINED) + if (activation_ == UNARYTYPE_UNDEFINED) { // 无act + if (use_std_gemm_graph) { // 无lora 标准gemm,没有act: + // 最后一个op是bin_add. 
只call 标准gemm + // base_gemm 输出到最终结果。 + lora_op_list_.front()->UpdateOutName(0, capsule_out_name_); + } else { // lora:最后一个op是bin_add + // base_gemm 输出到base_out_name + lora_op_list_.front()->UpdateOutName(0, base_out_name_); + } + } else { // 有act + if (use_std_gemm_graph) { // 无lora,最后一个op是act,会被call + // act 从base_out_name 获取输入, (输出一定是到最终结果) lora_op_list_.back()->UpdateInName(0, base_out_name_); - } else { - if (activation_ != UNARYTYPE_UNDEFINED) + } else { // lora:最后一个op是act + // act从bin_add_out_name 获取输入(act输出一定是到最终结果) lora_op_list_.back()->UpdateInName(0, bin_add_out_name_); + } } } @@ -169,8 +206,9 @@ AsStatus GemmLoraCapsuleOpGPU::Reshape(RuntimeContext* runtime_ctx) { SwitchLoraGraph(true); // TODO: use unified graph, ditto AS_CHECK_STATUS(lora_op_list_.front()->CallReshape(runtime_ctx)); - if (activation_ != UNARYTYPE_UNDEFINED) + if (activation_ != UNARYTYPE_UNDEFINED) { AS_CHECK_STATUS(lora_op_list_.back()->CallReshape(runtime_ctx)); + } return AsStatus::ALLSPARK_SUCCESS; } @@ -186,7 +224,6 @@ AsStatus GemmLoraCapsuleOpGPU::Reshape(RuntimeContext* runtime_ctx) { } } if (!lora_name.empty()) { - DLOG(INFO) << "lora=" << lora_name << std::endl; has_lora_in_batch_ = true; } @@ -207,14 +244,17 @@ AsStatus GemmLoraCapsuleOpGPU::Reshape(RuntimeContext* runtime_ctx) { } AsStatus GemmLoraCapsuleOpGPU::Forward(RuntimeContext* runtime_ctx) { - // DLOG(INFO) << "GemmLoraCapsuleOpGPU::Forward" << std::endl; + DLOG(INFO) << "GemmLoraCapsuleOpGPU::Forward" << std::endl; AsStatus ret = AsStatus::ALLSPARK_SUCCESS; if (lora_manager_->IsEmpty()) { SwitchLoraGraph(true); AS_CHECK_STATUS(lora_op_list_.front()->CallForward(runtime_ctx)); - if (activation_ != UNARYTYPE_UNDEFINED) + // lora_op_list_.front()->PrintInformation(); + if (activation_ != UNARYTYPE_UNDEFINED) { AS_CHECK_STATUS(lora_op_list_.back()->CallForward(runtime_ctx)); + // lora_op_list_.back()->PrintInformation(); + } return AsStatus::ALLSPARK_SUCCESS; } @@ -223,15 +263,18 @@ AsStatus 
GemmLoraCapsuleOpGPU::Forward(RuntimeContext* runtime_ctx) { for (auto& op : lora_op_list_) { ret = op->CallForward(runtime_ctx); // op->PrintInformation(); - // DO_ARBITRATE(0, 1, 0, op); + // DO_ARBITRATE(0, 1, 0, op); AS_CHECK_STATUS(ret); } return ret; } else { SwitchLoraGraph(true); AS_CHECK_STATUS(lora_op_list_.front()->CallForward(runtime_ctx)); - if (activation_ != UNARYTYPE_UNDEFINED) + // lora_op_list_.front()->PrintInformation(); + if (activation_ != UNARYTYPE_UNDEFINED) { AS_CHECK_STATUS(lora_op_list_.back()->CallForward(runtime_ctx)); + // lora_op_list_.back()->PrintInformation(); + } return AsStatus::ALLSPARK_SUCCESS; } } diff --git a/csrc/core/operator/general/gemm_lora/gemm_capsule_op_gpu.h b/csrc/core/operator/general/gemm_lora/gemm_capsule_op_gpu.h index ee7c2a6a..375e11f4 100644 --- a/csrc/core/operator/general/gemm_lora/gemm_capsule_op_gpu.h +++ b/csrc/core/operator/general/gemm_lora/gemm_capsule_op_gpu.h @@ -26,7 +26,7 @@ class GemmLoraCapsuleOpGPU : public AsOperator { std::unique_ptr std_gemm_op_; std::vector> lora_op_list_; std::string inner_gemm_type_ = "Gemm"; // for quant - std::string base_out_name_, bin_add_out_name_; + std::string capsule_out_name_, base_out_name_, bin_add_out_name_; UnaryType activation_ = UnaryType::UNARYTYPE_UNDEFINED; }; } // namespace allspark diff --git a/csrc/core/operator/general/gemm_lora/gemm_lora_op_gpu.cpp b/csrc/core/operator/general/gemm_lora/gemm_lora_op_gpu.cpp index cd9dde9a..b82113a2 100644 --- a/csrc/core/operator/general/gemm_lora/gemm_lora_op_gpu.cpp +++ b/csrc/core/operator/general/gemm_lora/gemm_lora_op_gpu.cpp @@ -32,6 +32,14 @@ AsStatus GemmLoraOpGPU::InitV2(const OperatorProto& op_proto, op_proto_.CopyFrom(op_proto); AS_CHECK_STATUS(GemmOpBase::InitV2(op_proto, ctx, weights_map, weights_buffer, tensor_map)); + + // 计算QKV相关维度信息 + int nslice = ctx.GetNranks(); + assert(ctx.GetNumberHeads() * ctx.GetSizePerHead() % nslice == 0); + assert(ctx.GetNumberGroups() * ctx.GetSizePerHead() % nslice == 
0); + q_outdim_size_ = ctx.GetNumberHeads() * ctx.GetSizePerHead() / nslice; + k_outdim_size_ = ctx.GetNumberGroups() * ctx.GetSizePerHead() / nslice; + v_outdim_size_ = k_outdim_size_; return AsStatus::ALLSPARK_SUCCESS; } @@ -59,13 +67,6 @@ AsStatus GemmLoraOpGPU::Reshape(RuntimeContext* runtime_ctx) { auto lora_weight_handle = lora_manager_->GetHandleByName(lora_name); DLOG(INFO) << "done lora_manager_->GetHandleByName lora " << lora_name << std::endl; - SwapStatus swap_status = lora_manager_->GetSwapStatus(lora_weight_handle); - if (swap_status == SwapStatus::SwapOut) { - DLOG(INFO) << " lora " << lora_name - << " has been swapped out, prepare to swap in" << std::endl; - lora_manager_->SwapInWeight(lora_weight_handle, rank_info_); - DLOG(INFO) << " lora " << lora_name << " swapped in" << std::endl; - } auto& t_name = weight_names_[0]; auto weight_tensor_p = lora_manager_->GetLoraTensorByName(lora_name, t_name); @@ -88,10 +89,6 @@ AsStatus GemmLoraOpGPU::Reshape(RuntimeContext* runtime_ctx) { assert(lora_manager_); // assumes that key-names are identical between different LoRAs auto lora_weight_handle = lora_manager_->GetHandleByName(lora_name_of_max_r); - SwapStatus swap_status = lora_manager_->GetSwapStatus(lora_weight_handle); - if (swap_status == SwapStatus::SwapOut) { - lora_manager_->SwapInWeight(lora_weight_handle, rank_info_); - } weights_.clear(); auto& t_name = weight_names_[0]; DLOG(INFO) << "It's a real lora weight for lora " << lora_name_of_max_r @@ -140,8 +137,10 @@ AsStatus GemmLoraOpGPU::Forward(RuntimeContext* runtime_ctx) { auto batch_out_stride = batch_out_tensor->GetShape().Count(1) * SizeofType(batch_out_tensor->GetDataType()); auto out_size_per_batch = batch_out_tensor->GetSizeInByte() / input_batchsize; + + auto in_bytes_per_data = SizeofType(batch_in_tensor->GetDataType()); for (auto i = 0; i < input_batchsize; i++) { - void* in_ptr = (char*)batch_in_tensor->GetDataPtr() + i * batch_in_stride; + char* in_ptr = 
(char*)batch_in_tensor->GetDataPtr() + i * batch_in_stride; void* out_ptr = (char*)batch_out_tensor->GetDataPtr() + i * batch_out_stride; // GemmLora 不使用weights_, 使用lora_weight @@ -196,8 +195,8 @@ AsStatus GemmLoraOpGPU::Forward(RuntimeContext* runtime_ctx) { iter++; } - // TODO: 支持GROUP_VSPLIT - assert(shape_w[1] % 3 == 0); + // 已支持GROUP_VSPLIT + assert(shape_w[1] == q_outdim_size_ + k_outdim_size_ + v_outdim_size_); assert(shape_w[1] == max_shape_out[2]); auto lora_r = shape_w[0]; // shape_in[2] is not accurate (due to padding for LoRAs with different r @@ -207,10 +206,15 @@ AsStatus GemmLoraOpGPU::Forward(RuntimeContext* runtime_ctx) { // 复原成qkv 3个输入 auto m = shape_in[1]; auto k = lora_r; - auto n = shape_w[1] / 3; - auto lda = k; - auto ldb = transB_ ? k : n; - auto ldc = n; + using QKV_DimType = dim_t[3]; + QKV_DimType n = {q_outdim_size_, k_outdim_size_, + v_outdim_size_}; // 支持MHA、QGA、MQA + QKV_DimType n_prefix_sum = {0, q_outdim_size_, + q_outdim_size_ + k_outdim_size_}; + auto lda = k * 3; + QKV_DimType ldb = {n[0], n[1], n[2]}; + if (transB_) ldb[0] = ldb[1] = ldb[2] = k; + QKV_DimType& ldc = n; std::vector> output_parts; for (int qkv_idx = 0; qkv_idx < 3; qkv_idx++) { @@ -219,9 +223,9 @@ AsStatus GemmLoraOpGPU::Forward(RuntimeContext* runtime_ctx) { weight_t_part = std::make_shared( lora_weight->GetName() + "." + std::to_string(qkv_idx), lora_weight->GetDeviceType(), lora_weight->GetDataType(), - lora_weight->GetDataMode(), Shape{k, n}); + lora_weight->GetDataMode(), Shape{k, n[qkv_idx]}); TensorUtils::DeepCopyMatrix2D(*weight_t_part, *lora_weight, - qkv_idx * n, 0, ctx_); + n_prefix_sum[qkv_idx], 0, ctx_); qkv_weight_cache_[std::make_pair(lora_name, qkv_idx)] = weight_t_part; } weight_t_part = qkv_weight_cache_.at({lora_name, qkv_idx}); @@ -232,9 +236,9 @@ AsStatus GemmLoraOpGPU::Forward(RuntimeContext* runtime_ctx) { bias_t_part = std::make_shared( lora_bias->GetName() + "." 
+ std::to_string(qkv_idx), lora_bias->GetDeviceType(), lora_bias->GetDataType(), - lora_bias->GetDataMode(), Shape{n}); - TensorUtils::DeepCopyMatrix2D(*bias_t_part, *lora_bias, qkv_idx * n, - 0, ctx_); + lora_bias->GetDataMode(), Shape{n[qkv_idx]}); + TensorUtils::DeepCopyMatrix2D(*bias_t_part, *lora_bias, + n_prefix_sum[qkv_idx], 0, ctx_); qkv_bias_cache_[std::make_pair(lora_name, qkv_idx)] = bias_t_part; } bias_t_part = qkv_bias_cache_.at({lora_name, qkv_idx}); @@ -243,18 +247,19 @@ AsStatus GemmLoraOpGPU::Forward(RuntimeContext* runtime_ctx) { auto out_t_part = std::make_shared( batch_out_tensor->GetName() + "." + std::to_string(qkv_idx), batch_out_tensor->GetDeviceType(), batch_out_tensor->GetDataType(), - batch_out_tensor->GetDataMode(), Shape{m, n}); + batch_out_tensor->GetDataMode(), Shape{m, n[qkv_idx]}); if (!use_quant_) { // fp16, fp32 DLOG(INFO) << lora_weight_name << " alpha_=" << alpha_ << std::endl; - kernel_launcher(batch_in_tensor->GetDataType(), - out_t_part->GetDataPtr(), in_ptr, bias, - weight_t_part.get(), m, n, k, lda, ldb, ldc, false, - transB_, 1, alpha_ /* aka. lora_scaling */, nullptr, - UNARYTYPE_UNDEFINED, ctx_); + kernel_launcher( + batch_in_tensor->GetDataType(), out_t_part->GetDataPtr(), in_ptr, + bias, weight_t_part.get(), m, n[qkv_idx], k, lda, ldb[qkv_idx], + ldc[qkv_idx], false, transB_, 1, alpha_ /* aka. 
lora_scaling */, + nullptr, UNARYTYPE_UNDEFINED, ctx_); } else { // use quantization throw AsException("lora quant not implemented"); } output_parts.emplace_back(out_t_part); + in_ptr += lora_r * in_bytes_per_data; } TensorUtils::ConcatMatrix2DColWise(*batch_out_tensor, i, output_parts, diff --git a/csrc/core/operator/general/gemm_lora/gemm_lora_op_gpu.h b/csrc/core/operator/general/gemm_lora/gemm_lora_op_gpu.h index d479dac8..a6c659df 100644 --- a/csrc/core/operator/general/gemm_lora/gemm_lora_op_gpu.h +++ b/csrc/core/operator/general/gemm_lora/gemm_lora_op_gpu.h @@ -37,15 +37,7 @@ class GemmLoraOpGPU : public GemmOpGPU { OperatorProto quant_op_proto_; // quant END - /* - int64_t lora_m_; - int64_t lora_n_; - int64_t lora_k_; - int64_t lora_batch_; - int lora_lda_; - int lora_ldb_; - int lora_ldc_; - */ + dim_t q_outdim_size_{0}, k_outdim_size_{0}, v_outdim_size_{0}; }; } // namespace allspark #endif diff --git a/csrc/core/operator/general/gemm_lowp/gemm_a16w8_gpu.cpp b/csrc/core/operator/general/gemm_lowp/gemm_a16w8_gpu.cpp index 8e4c869e..791d9ef7 100644 --- a/csrc/core/operator/general/gemm_lowp/gemm_a16w8_gpu.cpp +++ b/csrc/core/operator/general/gemm_lowp/gemm_a16w8_gpu.cpp @@ -98,7 +98,7 @@ void GemmA16W8GPU::GetWeightPaddedDispatch(const DataType ftype, get_weight_padded_n_align(weights_buffer); is_npad_ = true; } - // Special case: if sm8x and group_size_ = -1, + // Special case: if sm8x and group_size_ = -1 // reorder B as N32K16 order for use with Ampere_A16W8_GEMM_PERC_16816 // kernel if (sm_version_ >= 0x0800 && group_size_ == -1) { @@ -123,7 +123,7 @@ void GemmA16W8GPU::GetWeightPaddedDispatch(const DataType ftype, get_weight_padded_n_align(weights_buffer); is_npad_ = true; } - // Special case: if sm8x and group_size_ = -1, + // Special case: if sm8x and group_size_ = -1 // reorder B as N32K16 order for use with Ampere_A16W8_GEMM_PERC_16816 // kernel if (sm_version_ >= 0x0800 && group_size_ == -1) { diff --git 
a/csrc/core/operator/general/gemm_lowp/gemm_a8w8_gpu.cpp b/csrc/core/operator/general/gemm_lowp/gemm_a8w8_gpu.cpp index 143365b9..2e6cb328 100644 --- a/csrc/core/operator/general/gemm_lowp/gemm_a8w8_gpu.cpp +++ b/csrc/core/operator/general/gemm_lowp/gemm_a8w8_gpu.cpp @@ -72,7 +72,7 @@ void GemmA8W8GPU::GetWeightPaddedDispatch(const DataType ftype, get_weight_padded_n_align(weights_buffer); is_npad_ = true; } - // Special case: if sm8x and group_size_ = -1, + // Special case: if sm8x and group_size_ = -1 // reorder B as N32K16 order for use with Ampere_A16W8_GEMM_PERC_16816 // kernel if (sm_version_ >= 0x0800 && group_size_ == -1) { @@ -97,7 +97,7 @@ void GemmA8W8GPU::GetWeightPaddedDispatch(const DataType ftype, get_weight_padded_n_align(weights_buffer); is_npad_ = true; } - // Special case: if sm8x and group_size_ = -1, + // Special case: if sm8x and group_size_ = -1 // reorder B as N32K16 order for use with Ampere_A16W8_GEMM_PERC_16816 // kernel if (sm_version_ >= 0x0800 && group_size_ == -1) { diff --git a/csrc/core/operator/general/moe/moe_op.cpp b/csrc/core/operator/general/moe/moe_op.cpp index 16017dcd..37c0e96b 100644 --- a/csrc/core/operator/general/moe/moe_op.cpp +++ b/csrc/core/operator/general/moe/moe_op.cpp @@ -12,6 +12,19 @@ #endif #include +int64_t get_max_block(int input_token, int num_expert, int num_expert_pertoken, + int block_size) { + int64_t max_token = (int64_t)input_token * num_expert_pertoken; + if (max_token < num_expert) { + // max_block = max_token + return max_token; + } + int64_t max_block = num_expert + (max_token) / block_size; + return max_block; +} +size_t aligned_size(size_t n, size_t aligned = 128) { + return (n + aligned - 1) / aligned * aligned; +} namespace allspark { AsStatus MoeOp::Init(const OperatorProto& op_proto, const DeviceContext& ctx, const TensorMap& weights_map, TensorMap* tensor_map) { @@ -19,6 +32,32 @@ AsStatus MoeOp::Init(const OperatorProto& op_proto, const DeviceContext& ctx, // type inference dtype_ = 
tensor_map_->at(in_names_[0])->GetDataType(); DeviceType backend = ctx.GetDeviceType(); + switch (backend) { +#ifdef ENABLE_CUDA + case DeviceType::CUDA: { + int device_id; + AS_CHECK_CUDA(cudaGetDevice(&device_id)); + AS_CHECK_CUDA(cudaGetDeviceProperties(&dprop_, device_id)); + int sm_version = dprop_.major << 8 | dprop_.minor; + if (sm_version >= 0x0900) { + use_dnn_ = true; + } else { + use_dnn_ = false; + } + break; + } +#endif + case DeviceType::CPU: { + LOG(ERROR) << "MOE Operator does not support " + << "CPU" + << " device type" << std::endl; + return AsStatus::ALLSPARK_RUNTIME_ERROR; + } + default: + LOG(ERROR) << "MOE Operator does not support" << DeviceType_Name(backend) + << " device type" << std::endl; + return AsStatus::ALLSPARK_RUNTIME_ERROR; + } tensor_map_->at(out_names_[0])->SetDataType(dtype_); // attr auto& attr_map = op_proto.attr(); @@ -34,30 +73,91 @@ AsStatus MoeOp::Init(const OperatorProto& op_proto, const DeviceContext& ctx, } num_expert_pertoken_ = *(int*)(attr_map.at("num_experts_per_tok").c_str()); first_moe_ = true; - // default - float_gate_score_ = std::make_unique( - "topk_value_", backend, DataType::FLOAT32, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), num_expert_}); - topk_value_ = std::make_unique( - "topk_value_", backend, DataType::FLOAT32, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), num_expert_}); - experts_score_ = std::make_unique( - "experts_score_", backend, DataType::FLOAT32, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); - topk_indice_ = std::make_unique( - "topk_indice_", backend, DataType::INT32, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); - mid_row_indices_ = std::make_unique( - "mid_row_indices_", backend, DataType::INT32, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); - mid_expert_indices_ = std::make_unique( - "mid_expert_indices_", backend, DataType::INT32, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), 
num_expert_pertoken_}); - final_row_indices_ = std::make_unique( - "final_row_indices_", backend, DataType::INT32, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); + hidden_size_ = weights_[0]->GetShape()[1]; proj_size_ = weights_[0]->GetShape()[2] / 2; + total_token_ = ctx_->GetModelMaxLength(); + // default + if (use_dnn_) { + float_gate_score_ = std::make_unique( + "topk_value_", backend, DataType::FLOAT32, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), num_expert_}); + topk_value_ = std::make_unique( + "topk_value_", backend, DataType::FLOAT32, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), num_expert_}); + experts_score_ = std::make_unique( + "experts_score_", backend, DataType::FLOAT32, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); + topk_indice_ = std::make_unique( + "topk_indice_", backend, DataType::INT32, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); + mid_row_indices_ = std::make_unique( + "mid_row_indices_", backend, DataType::INT32, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); + mid_expert_indices_ = std::make_unique( + "mid_expert_indices_", backend, DataType::INT32, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); + final_row_indices_ = std::make_unique( + "final_row_indices_", backend, DataType::INT32, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); + +#ifdef ENABLE_CUDA + if (ctx_->GetDeviceType() == DeviceType::CUDA) { + cuda::GetWorkspaceSize(&hWsSize, &dWsSize, + total_token_ * num_expert_pertoken_, num_expert_); + cudaMallocHost(&hWs, hWsSize); + } +#endif + } else { + block_size_ = 64; + float_gate_score_ = std::make_unique( + "topk_value_", backend, DataType::FLOAT32, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), num_expert_}); + topk_value_ = std::make_unique( + "topk_value_", backend, DataType::FLOAT32, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), 
num_expert_}); + experts_score_ = std::make_unique( + "experts_score_", backend, DataType::FLOAT32, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); + topk_indice_ = std::make_unique( + "topk_indice_", backend, DataType::INT32, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); + experts_idx_ = std::make_unique( + "experts_idx_", backend, DataType::INT64, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); + experts_seq_ = std::make_unique( + "experts_seq_", backend, DataType::INT64, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); + indice_source_ = std::make_unique( + "indice_source_", backend, DataType::INT64, DataMode::DENSE, + Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); + total_tokens_post_pad_ = + std::make_unique("total_tokens_post_pad_", backend, + DataType::INT32, DataMode::DENSE, Shape{1}); + + int64_t max_block = get_max_block(ctx_->GetModelMaxLength(), num_expert_, + num_expert_pertoken_, block_size_); + gate_up_proj_array_ptr = std::make_unique( + "gate_up_proj_array_ptr", backend, DataType::INT64, DataMode::DENSE, + Shape{max_block}); + down_proj_array_ptr = std::make_unique( + "down_proj_array_ptr", backend, DataType::INT64, DataMode::DENSE, + Shape{max_block}); + reorder_data_array_ptr = std::make_unique( + "reorder_data_array_ptr", backend, DataType::INT64, DataMode::DENSE, + Shape{max_block}); + gate_up_proj_out_array_ptr = std::make_unique( + "gate_up_proj_out_array_ptr", backend, DataType::INT64, DataMode::DENSE, + Shape{max_block}); + mid_result_array_ptr = std::make_unique( + "mid_result_array_ptr", backend, DataType::INT64, DataMode::DENSE, + Shape{max_block}); + final_result_array_ptr = std::make_unique( + "final_result_array_ptr", backend, DataType::INT64, DataMode::DENSE, + Shape{max_block}); + std::unique_ptr experts_num; + std::unique_ptr experts_seq; + } return AsStatus::ALLSPARK_SUCCESS; } AsStatus MoeOp::Reshape() { @@ 
-69,38 +169,48 @@ AsStatus MoeOp::Reshape() { // only reshape once,for warmup first_moe_ = false; total_token_ = ctx_->GetModelMaxLength(); - int64_t max_total_tokens = total_token_ * num_expert_pertoken_; - expert_size_ = (int64_t)hidden_size_ * proj_size_; - float_gate_score_->SetShape(Shape{total_token_, num_expert_}); - topk_value_->SetShape(Shape{total_token_, num_expert_}); - experts_score_->SetShape(Shape{total_token_, num_expert_pertoken_}); - topk_indice_->SetShape(Shape{total_token_, num_expert_pertoken_}); + int64_t max_total_tokens = 0; + if (use_dnn_) { + max_total_tokens = total_token_ * num_expert_pertoken_; + float_gate_score_->SetShape(Shape{total_token_, num_expert_}); + topk_value_->SetShape(Shape{total_token_, num_expert_}); + experts_score_->SetShape(Shape{total_token_, num_expert_pertoken_}); + topk_indice_->SetShape(Shape{total_token_, num_expert_pertoken_}); + } else { + int64_t max_block = get_max_block(total_token_, num_expert_, + num_expert_pertoken_, block_size_); + max_total_tokens = max_block * block_size_; + expert_size_ = (int64_t)hidden_size_ * proj_size_; + float_gate_score_->SetShape(Shape{total_token_, num_expert_}); + topk_value_->SetShape(Shape{total_token_, num_expert_}); + experts_score_->SetShape(Shape{total_token_, num_expert_pertoken_}); + topk_indice_->SetShape(Shape{total_token_, num_expert_pertoken_}); + experts_idx_->SetShape(Shape{max_block}); + experts_seq_->SetShape(Shape{max_total_tokens}); + indice_source_->SetShape(Shape{max_total_tokens}); + } ws_size_ = 0; - -#ifdef ENABLE_CUDA - if (ctx_->GetDeviceType() == DeviceType::CUDA) { - size_t softmax_workspace = 0; - cuda::StridedSoftmaxGetWorkspaceSize( - &softmax_workspace, ctx_->GetModelMaxLength(), num_expert_); - AS_CHECK_STATUS( - tensor_map_->at("workspace") - ->SetShape(Shape{static_cast(softmax_workspace)})); - ws_size_ += softmax_workspace; + if (use_dnn_) { + ws_size_ += aligned_size(max_total_tokens * proj_size_ * 2 * + SizeofType(dtype_)); // 
up_gate_proj_out + ws_size_ += aligned_size(max_total_tokens * proj_size_ * + SizeofType(dtype_)); // mid_result + ws_size_ += aligned_size(max_total_tokens * hidden_size_ * + SizeofType(dtype_)); // final_result + ws_size_ += aligned_size(dWsSize); + } else { + ws_size_ += + max_total_tokens * hidden_size_ * SizeofType(dtype_); // reorder_data + ws_size_ += + max_total_tokens * proj_size_ * SizeofType(dtype_); // up_proj_out + ws_size_ += + max_total_tokens * proj_size_ * SizeofType(dtype_); // gate_out + ws_size_ += + max_total_tokens * proj_size_ * SizeofType(dtype_); // mid_result + ws_size_ += + max_total_tokens * hidden_size_ * SizeofType(dtype_); // final_result } -#endif - ws_size_ += max_total_tokens * proj_size_ * 2 * - SizeofType(dtype_); // up_gate_proj_out - ws_size_ += - max_total_tokens * proj_size_ * SizeofType(dtype_); // mid_result - ws_size_ += - max_total_tokens * hidden_size_ * SizeofType(dtype_); // final_result AS_CHECK_STATUS(tensor_map_->at("workspace")->SetShape(Shape{(ws_size_)})); - AsTensor* in_tensor = tensor_map_->at(in_names_[0]).get(); - AsTensor* expert_weight_tensor = tensor_map_->at(in_names_[1]).get(); - AsTensor* gate_up_proj_weight_tensor = weights_[0]; - AsTensor* down_proj_weight_tensor = weights_[1]; - AsTensor* out_tensor = tensor_map_->at(out_names_[0]).get(); - switch (ctx_->GetDeviceType()) { #ifdef ENABLE_CUDA case DeviceType::CUDA: { @@ -109,20 +219,18 @@ AsStatus MoeOp::Reshape() { cublasHandle_t cublas_handle = gpu_ctx->GetCublasHandle(); cudaStream_t cu_stream = static_cast(ctx_)->GetStream(); - cuda::GetWorkspaceSize(&hWsSize, &dWsSize, total_token_, num_expert_); - cudaMallocHost(&hWs, hWsSize); - AS_CHECK_STATUS( - tensor_map_->at("workspace")->SetShape(Shape{(dWsSize)})); auto functor = [&]() { - void* ws_ptr = tensor_map_->at("workspace")->GetDataPtr(); - reorder_data = ws_ptr; - gate_up_proj_out = (char*)reorder_data + max_total_tokens * - hidden_size_ * - SizeofType(dtype_); - mid_result = 
(char*)gate_up_proj_out + - max_total_tokens * proj_size_ * 2 * SizeofType(dtype_); - final_result = (char*)mid_result + - max_total_tokens * proj_size_ * SizeofType(dtype_); + void* ws_ptr = tensor_map_->at("workspace")->GetDataPtr(); + gate_up_proj_out = (char*)ws_ptr; + mid_result = (char*)gate_up_proj_out + + aligned_size(max_total_tokens * proj_size_ * 2 * + SizeofType(dtype_)); + final_result = + (char*)mid_result + aligned_size(max_total_tokens * proj_size_ * + SizeofType(dtype_)); + dnn_ws = (char*)final_result + + aligned_size(max_total_tokens * hidden_size_ * + SizeofType(dtype_)); }; DispatchCUDA(dtype_, functor); break; @@ -139,37 +247,21 @@ AsStatus MoeOp::Reshape() { } } total_token_ = out_shape[0] * out_shape[1]; - topk_indice_->SetShape(Shape{total_token_, num_expert_pertoken_}); - mid_row_indices_->SetShape(Shape{total_token_, num_expert_pertoken_}); - mid_expert_indices_->SetShape(Shape{total_token_ * num_expert_pertoken_, 1}); - final_row_indices_->SetShape(Shape{total_token_ * num_expert_pertoken_, 1}); - return AsStatus::ALLSPARK_SUCCESS; -} -#if 0 -// dubug code -static void print_info(void* input, const DeviceContext* ctx, - size_t layout_size = 0) { - const int print_count = 10; - cudaStream_t cu_stream = static_cast(ctx)->GetStream(); - std::vector host_out(print_count); - cudaMemcpyAsync(host_out.data(), input, print_count, cudaMemcpyDeviceToHost, - cu_stream); - ctx->Synchronize(); - void* data_ptr = host_out.data(); - half* ptr = static_cast(data_ptr); - for (int i = 0; i < print_count; i++) { - LOG(INFO) << (float)(ptr[i]) << ","; + if (use_dnn_) { + topk_indice_->SetShape(Shape{total_token_, num_expert_pertoken_}); + mid_row_indices_->SetShape(Shape{total_token_, num_expert_pertoken_}); + mid_expert_indices_->SetShape( + Shape{total_token_ * num_expert_pertoken_, 1}); + final_row_indices_->SetShape(Shape{total_token_ * num_expert_pertoken_, 1}); } - LOG(INFO) << std::endl; + return AsStatus::ALLSPARK_SUCCESS; } -#endif AsStatus 
MoeOp::Forward() { AsTensor* in_tensor = tensor_map_->at(in_names_[0]).get(); AsTensor* expert_weight_tensor = tensor_map_->at(in_names_[1]).get(); AsTensor* gate_up_proj_weight_tensor = weights_[0]; AsTensor* down_proj_weight_tensor = weights_[1]; AsTensor* out_tensor = tensor_map_->at(out_names_[0]).get(); - void* ws_ptr = tensor_map_->at("workspace")->GetDataPtr(); switch (ctx_->GetDeviceType()) { #ifdef ENABLE_CUDA case DeviceType::CUDA: { @@ -183,10 +275,6 @@ AsStatus MoeOp::Forward() { (float*)float_gate_score_->GetDataPtr(), expert_weight_tensor->GetShape().Count(), cu_stream); - // cuda::StridedSoftmaxLauncher((float*)topk_value_->GetDataPtr(), - // (float*)float_gate_score_->GetDataPtr(), - // nullptr, nullptr, ws_ptr, ws_size_, - // total_token_, num_expert_, cu_stream); cuda::SoftmaxLowReduceKernelLauncher( (float*)float_gate_score_->GetDataPtr(), (float*)topk_value_->GetDataPtr(), total_token_, num_expert_, @@ -195,35 +283,35 @@ AsStatus MoeOp::Forward() { (int*)topk_indice_->GetDataPtr(), (float*)topk_value_->GetDataPtr(), total_token_, num_expert_, top_k, cu_stream); - cuda::MoeBatchedGemmLauncher( - (T*)in_tensor->GetDataPtr(), - (T*)gate_up_proj_weight_tensor->GetDataPtr(), - (uint32_t*)topk_indice_->GetDataPtr(), (T*)gate_up_proj_out, - (uint32_t*)mid_row_indices_->GetDataPtr(), hWs, hWsSize, ws_ptr, - dWsSize, total_token_, proj_size_ * 2, hidden_size_, num_expert_, - top_k, cu_stream); + cuda::MoeBatchedGemmLauncher( + (T*)in_tensor->GetDataPtr(), + (T*)gate_up_proj_weight_tensor->GetDataPtr(), + (uint32_t*)topk_indice_->GetDataPtr(), (T*)gate_up_proj_out, + (uint32_t*)mid_row_indices_->GetDataPtr(), hWs, hWsSize, dnn_ws, + dWsSize, total_token_, proj_size_ * 2, hidden_size_, num_expert_, + top_k, cu_stream); - cuda::UnaryGLUKernelLauncher((T*)mid_result, (T*)gate_up_proj_out, - total_token_ * top_k, proj_size_, - UnaryType::SILU, cu_stream); + cuda::UnaryGLUKernelLauncher((T*)mid_result, (T*)gate_up_proj_out, + total_token_ * top_k, proj_size_, 
+ UnaryType::SILU, cu_stream); - cuda::GetExpertByIndice((int*)mid_expert_indices_->GetDataPtr(), - (int*)topk_indice_->GetDataPtr(), - (int*)mid_row_indices_->GetDataPtr(), - total_token_, top_k, num_expert_, cu_stream); + cuda::GetExpertByIndice((int*)mid_expert_indices_->GetDataPtr(), + (int*)topk_indice_->GetDataPtr(), + (int*)mid_row_indices_->GetDataPtr(), + total_token_, top_k, num_expert_, cu_stream); - cuda::MoeBatchedGemmLauncher( - (T*)mid_result, (T*)down_proj_weight_tensor->GetDataPtr(), - (uint32_t*)mid_expert_indices_->GetDataPtr(), (T*)final_result, - (uint32_t*)final_row_indices_->GetDataPtr(), hWs, hWsSize, ws_ptr, - dWsSize, total_token_ * top_k, hidden_size_, proj_size_, - num_expert_, 1, cu_stream); - cuda::FinalizeMoeRoutingNewKernelLauncher( - (T*)out_tensor->GetDataPtr(), (T*)final_result, - (float*)experts_score_->GetDataPtr(), - (int*)mid_row_indices_->GetDataPtr(), - (int*)final_row_indices_->GetDataPtr(), total_token_, top_k, - hidden_size_, cu_stream); + cuda::MoeBatchedGemmLauncher( + (T*)mid_result, (T*)down_proj_weight_tensor->GetDataPtr(), + (uint32_t*)mid_expert_indices_->GetDataPtr(), (T*)final_result, + (uint32_t*)final_row_indices_->GetDataPtr(), hWs, hWsSize, dnn_ws, + dWsSize, total_token_ * top_k, hidden_size_, proj_size_, + num_expert_, 1, cu_stream); + cuda::FinalizeMoeRoutingNewKernelLauncher( + (T*)out_tensor->GetDataPtr(), (T*)final_result, + (float*)experts_score_->GetDataPtr(), + (int*)mid_row_indices_->GetDataPtr(), + (int*)final_row_indices_->GetDataPtr(), total_token_, top_k, + hidden_size_, cu_stream); }; DispatchCUDA(dtype_, functor); break; @@ -240,5 +328,6 @@ AsStatus MoeOp::Forward() { } return AsStatus::ALLSPARK_SUCCESS; } - +REGISTER_OP(MOE, CUDA, MoeOp) +REGISTER_OP(MOE, CPU, MoeOp) } // namespace allspark diff --git a/csrc/core/operator/general/moe/moe_op.h b/csrc/core/operator/general/moe/moe_op.h index 7bb861b3..7f4ed3de 100644 --- a/csrc/core/operator/general/moe/moe_op.h +++ 
b/csrc/core/operator/general/moe/moe_op.h @@ -22,29 +22,56 @@ class MoeOp : public AsOperator { AsStatus Forward() override; private: +#ifdef ENABLE_CUDA + cudaDeviceProp dprop_; +#endif + bool use_dnn_ = false; DataType dtype_; int num_expert_; int num_expert_pertoken_; int total_token_; int hidden_size_; int64_t ws_size_; + int block_size_; int proj_size_; int64_t expert_size_; bool first_moe_ = true; + // use_dnn_ + std::unique_ptr mid_row_indices_; + std::unique_ptr mid_expert_indices_; + std::unique_ptr final_row_indices_; + std::unique_ptr hWs_; void* hWs; size_t hWsSize, dWsSize; + // use_dnn_over std::unique_ptr experts_score_; std::unique_ptr float_gate_score_; std::unique_ptr topk_value_; std::unique_ptr topk_indice_; - std::unique_ptr mid_row_indices_; - std::unique_ptr mid_expert_indices_; - std::unique_ptr final_row_indices_; + std::unique_ptr experts_idx_; + std::unique_ptr experts_seq_; + std::unique_ptr indice_source_; + std::unique_ptr total_tokens_post_pad_; + + std::unique_ptr gate_up_proj_array_ptr; + std::unique_ptr down_proj_array_ptr; + std::unique_ptr reorder_data_array_ptr; + std::unique_ptr gate_up_proj_out_array_ptr; + std::unique_ptr mid_result_array_ptr; + std::unique_ptr final_result_array_ptr; + // void* reorder_data; void* gate_up_proj_out; void* mid_result; void* final_result; + void* dnn_ws; + void** gate_up_proj_array; + void** down_proj_array; + void** reorder_data_array; + void** gate_up_proj_out_array; + void** mid_result_array; + void** final_result_array; }; } // namespace allspark diff --git a/csrc/core/operator/general/moe_inefficient/moe_inefficient_op.cpp b/csrc/core/operator/general/moe_inefficient/moe_inefficient_op.cpp deleted file mode 100644 index 381a9420..00000000 --- a/csrc/core/operator/general/moe_inefficient/moe_inefficient_op.cpp +++ /dev/null @@ -1,309 +0,0 @@ -/*! - * Copyright (c) Alibaba, Inc. and its affiliates. 
- * @file moe_inefficient_op.cpp - */ - -#include "moe_inefficient_op.h" // NOLINT - -#include -#include -#ifdef ENABLE_CUDA -#include -#endif -#include - -int64_t get_max_block(int input_token, int num_expert, int num_expert_pertoken, - int block_size) { - int64_t max_token = (int64_t)input_token * num_expert_pertoken; - if (max_token < num_expert) { - // max_block = max_token - return max_token; - } - int64_t max_block = num_expert + (max_token) / block_size; - return max_block; -} -namespace allspark { -AsStatus MoeInefficientOp::Init(const OperatorProto& op_proto, - const DeviceContext& ctx, - const TensorMap& weights_map, - TensorMap* tensor_map) { - AS_CHECK_STATUS(AsOperator::Init(op_proto, ctx, weights_map, tensor_map)); - // type inference - dtype_ = tensor_map_->at(in_names_[0])->GetDataType(); - DeviceType backend = ctx.GetDeviceType(); - tensor_map_->at(out_names_[0])->SetDataType(dtype_); - // attr - auto& attr_map = op_proto.attr(); - if (attr_map.find("num_experts") == attr_map.end()) { - LOG(ERROR) << "MoeInefficientOp : can't find num_expert attribute." - << std::endl; - return AsStatus::ALLSPARK_PARAM_ERROR; - } - num_expert_ = *(int*)(attr_map.at("num_experts").c_str()); - if (attr_map.find("num_experts_per_tok") == attr_map.end()) { - LOG(ERROR) << "MoeInefficientOp : can't find num_expert_per_tok attribute." 
- << std::endl; - return AsStatus::ALLSPARK_PARAM_ERROR; - } - num_expert_pertoken_ = *(int*)(attr_map.at("num_experts_per_tok").c_str()); - - block_size_ = 64; - first_moe_ = true; - // default - float_gate_score_ = std::make_unique( - "topk_value_", backend, DataType::FLOAT32, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), num_expert_}); - topk_value_ = std::make_unique( - "topk_value_", backend, DataType::FLOAT32, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), num_expert_}); - experts_score_ = std::make_unique( - "experts_score_", backend, DataType::FLOAT32, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); - topk_indice_ = std::make_unique( - "topk_indice_", backend, DataType::INT32, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); - experts_idx_ = std::make_unique( - "experts_idx_", backend, DataType::INT64, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); - experts_seq_ = std::make_unique( - "experts_seq_", backend, DataType::INT64, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); - indice_source_ = std::make_unique( - "indice_source_", backend, DataType::INT64, DataMode::DENSE, - Shape{ctx_->GetModelMaxLength(), num_expert_pertoken_}); - total_tokens_post_pad_ = - std::make_unique("total_tokens_post_pad_", backend, - DataType::INT32, DataMode::DENSE, Shape{1}); - - int64_t max_block = get_max_block(ctx_->GetModelMaxLength(), num_expert_, - num_expert_pertoken_, block_size_); - gate_up_proj_array_ptr = std::make_unique( - "gate_up_proj_array_ptr", backend, DataType::INT64, DataMode::DENSE, - Shape{max_block}); - down_proj_array_ptr = std::make_unique( - "down_proj_array_ptr", backend, DataType::INT64, DataMode::DENSE, - Shape{max_block}); - reorder_data_array_ptr = std::make_unique( - "reorder_data_array_ptr", backend, DataType::INT64, DataMode::DENSE, - Shape{max_block}); - gate_up_proj_out_array_ptr = std::make_unique( - 
"gate_up_proj_out_array_ptr", backend, DataType::INT64, DataMode::DENSE, - Shape{max_block}); - mid_result_array_ptr = std::make_unique( - "mid_result_array_ptr", backend, DataType::INT64, DataMode::DENSE, - Shape{max_block}); - final_result_array_ptr = std::make_unique( - "final_result_array_ptr", backend, DataType::INT64, DataMode::DENSE, - Shape{max_block}); - std::unique_ptr experts_num; - std::unique_ptr experts_seq; - hidden_size_ = weights_[0]->GetShape()[1]; - proj_size_ = weights_[0]->GetShape()[2] / 2; - return AsStatus::ALLSPARK_SUCCESS; -} -AsStatus MoeInefficientOp::Reshape() { - Shape out_shape = tensor_map_->at(in_names_[0])->GetShape(); - AS_CHECK_STATUS( - tensor_map_->at(out_names_[0])->SetShape(std::move(out_shape))); - - if (first_moe_) { - // only reshape once,for warmup - first_moe_ = false; - total_token_ = ctx_->GetModelMaxLength(); - int64_t max_block = get_max_block(total_token_, num_expert_, - num_expert_pertoken_, block_size_); - int64_t max_total_tokens = max_block * block_size_; - expert_size_ = (int64_t)hidden_size_ * proj_size_; - float_gate_score_->SetShape(Shape{total_token_, num_expert_}); - topk_value_->SetShape(Shape{total_token_, num_expert_}); - experts_score_->SetShape(Shape{total_token_, num_expert_pertoken_}); - topk_indice_->SetShape(Shape{total_token_, num_expert_pertoken_}); - experts_idx_->SetShape(Shape{max_block}); - experts_seq_->SetShape(Shape{max_total_tokens}); - indice_source_->SetShape(Shape{max_total_tokens}); - ws_size_ = 0; - -#ifdef ENABLE_CUDA - if (ctx_->GetDeviceType() == DeviceType::CUDA) { - size_t softmax_workspace = 0; - cuda::StridedSoftmaxGetWorkspaceSize( - &softmax_workspace, ctx_->GetModelMaxLength(), num_expert_); - AS_CHECK_STATUS( - tensor_map_->at("workspace") - ->SetShape(Shape{static_cast(softmax_workspace)})); - ws_size_ += softmax_workspace; - } -#endif - ws_size_ += - max_total_tokens * hidden_size_ * SizeofType(dtype_); // reorder_data - ws_size_ += - max_total_tokens * proj_size_ * 
SizeofType(dtype_); // up_proj_out - ws_size_ += max_total_tokens * proj_size_ * SizeofType(dtype_); // gate_out - ws_size_ += - max_total_tokens * proj_size_ * SizeofType(dtype_); // mid_result - ws_size_ += - max_total_tokens * hidden_size_ * SizeofType(dtype_); // final_result - AS_CHECK_STATUS(tensor_map_->at("workspace")->SetShape(Shape{(ws_size_)})); - AsTensor* in_tensor = tensor_map_->at(in_names_[0]).get(); - AsTensor* expert_weight_tensor = tensor_map_->at(in_names_[1]).get(); - AsTensor* gate_up_proj_weight_tensor = weights_[0]; - AsTensor* down_proj_weight_tensor = weights_[1]; - AsTensor* out_tensor = tensor_map_->at(out_names_[0]).get(); - switch (ctx_->GetDeviceType()) { -#ifdef ENABLE_CUDA - case DeviceType::CUDA: { - int top_k = num_expert_pertoken_; - const CUDAContext* gpu_ctx = static_cast(ctx_); - cublasHandle_t cublas_handle = gpu_ctx->GetCublasHandle(); - cudaStream_t cu_stream = - static_cast(ctx_)->GetStream(); - auto functor = [&]() { - void* ws_ptr = tensor_map_->at("workspace")->GetDataPtr(); - reorder_data = ws_ptr; - gate_up_proj_out = (char*)reorder_data + max_total_tokens * - hidden_size_ * - SizeofType(dtype_); - mid_result = (char*)gate_up_proj_out + - max_total_tokens * proj_size_ * 2 * SizeofType(dtype_); - final_result = (char*)mid_result + - max_total_tokens * proj_size_ * SizeofType(dtype_); - gate_up_proj_array = (void**)gate_up_proj_array_ptr->GetDataPtr(); - down_proj_array = (void**)down_proj_array_ptr->GetDataPtr(); - reorder_data_array = (void**)reorder_data_array_ptr->GetDataPtr(); - gate_up_proj_out_array = - (void**)gate_up_proj_out_array_ptr->GetDataPtr(); - mid_result_array = (void**)mid_result_array_ptr->GetDataPtr(); - final_result_array = (void**)final_result_array_ptr->GetDataPtr(); - cuda::MOEGetBatchArrayLauncher( - nullptr, nullptr, (T*)reorder_data, reorder_data_array, max_block, - block_size_ * hidden_size_, block_size_, cu_stream); - cuda::MOEGetBatchArrayLauncher( - nullptr, nullptr, (T*)gate_up_proj_out, 
gate_up_proj_out_array, - max_block, block_size_ * proj_size_ * 2, block_size_, cu_stream); - cuda::MOEGetBatchArrayLauncher( - nullptr, nullptr, (T*)mid_result, mid_result_array, max_block, - block_size_ * proj_size_, block_size_, cu_stream); - cuda::MOEGetBatchArrayLauncher( - nullptr, nullptr, (T*)final_result, final_result_array, max_block, - block_size_ * hidden_size_, block_size_, cu_stream); - }; - DispatchCUDA(dtype_, functor); - break; - } -#endif - case DeviceType::CPU: { - LOG(ERROR) << "MOE Operator does not support " - << "CPU" - << " device type" << std::endl; - return AsStatus::ALLSPARK_RUNTIME_ERROR; - } - default: - break; - } - } - - total_token_ = out_shape[0] * out_shape[1]; - return AsStatus::ALLSPARK_SUCCESS; -} -AsStatus MoeInefficientOp::Forward() { - AsTensor* in_tensor = tensor_map_->at(in_names_[0]).get(); - AsTensor* expert_weight_tensor = tensor_map_->at(in_names_[1]).get(); - AsTensor* gate_up_proj_weight_tensor = weights_[0]; - AsTensor* down_proj_weight_tensor = weights_[1]; - AsTensor* out_tensor = tensor_map_->at(out_names_[0]).get(); - void* ws_ptr = tensor_map_->at("workspace")->GetDataPtr(); - switch (ctx_->GetDeviceType()) { -#ifdef ENABLE_CUDA - case DeviceType::CUDA: { - int top_k = num_expert_pertoken_; - const CUDAContext* gpu_ctx = static_cast(ctx_); - cublasHandle_t cublas_handle = gpu_ctx->GetCublasHandle(); - cudaStream_t cu_stream = - static_cast(ctx_)->GetStream(); - auto functor = [&]() { - cuda::CastKernelLauncher((T*)expert_weight_tensor->GetDataPtr(), - (float*)float_gate_score_->GetDataPtr(), - expert_weight_tensor->GetShape().Count(), - cu_stream); - // cuda::StridedSoftmaxLauncher((float*)topk_value_->GetDataPtr(), - // (float*)float_gate_score_->GetDataPtr(), - // nullptr, nullptr, ws_ptr, ws_size_, - // total_token_, num_expert_, cu_stream); - cuda::SoftmaxLowReduceKernelLauncher( - (float*)float_gate_score_->GetDataPtr(), - (float*)topk_value_->GetDataPtr(), total_token_, num_expert_, - cu_stream); - 
cuda::TopKKernelLauncher((float*)experts_score_->GetDataPtr(), - (int*)topk_indice_->GetDataPtr(), - (float*)topk_value_->GetDataPtr(), - total_token_, num_expert_, top_k, cu_stream); - - // int total_token_post_pad = 0; - cuda::ReorderAndPaddingMOE( - (int64_t*)experts_idx_->GetDataPtr(), - (int64_t*)experts_seq_->GetDataPtr(), - (int64_t*)indice_source_->GetDataPtr(), - (int*)topk_indice_->GetDataPtr(), total_token_, num_expert_, top_k, - block_size_, (int*)total_tokens_post_pad_->GetDataPtr(), cu_stream); - - int* total_tokens_pad_ptr = (int*)total_tokens_post_pad_->GetDataPtr(); - int max_block = get_max_block(total_token_, num_expert_, - num_expert_pertoken_, block_size_); - int max_total_tokens = max_block * block_size_; - // LOG(INFO) << "max_block=" << max_block; - cuda::GetReorderData((T*)reorder_data, (T*)in_tensor->GetDataPtr(), - (int64_t*)experts_idx_->GetDataPtr(), - (int64_t*)experts_seq_->GetDataPtr(), - total_tokens_pad_ptr, max_total_tokens, - total_token_ * top_k, top_k, hidden_size_, - block_size_, cu_stream); - - cuda::MOEGetBatchArrayLauncher( - (int64_t*)experts_idx_->GetDataPtr(), total_tokens_pad_ptr, - (T*)gate_up_proj_weight_tensor->GetDataPtr(), gate_up_proj_array, - max_block, gate_up_proj_weight_tensor->GetShape().Count(1), - block_size_, cu_stream); - cuda::MOEGetBatchArrayLauncher( - (int64_t*)experts_idx_->GetDataPtr(), total_tokens_pad_ptr, - (T*)down_proj_weight_tensor->GetDataPtr(), down_proj_array, - max_block, down_proj_weight_tensor->GetShape().Count(1), - block_size_, cu_stream); - cuda::BatchGemmWraper(gate_up_proj_out_array, reorder_data_array, - gate_up_proj_array, block_size_, - proj_size_ * 2, hidden_size_, false, false, - 1.0f, 0.0f, hidden_size_, proj_size_ * 2, - proj_size_ * 2, max_block, cublas_handle); - cuda::UnaryGLUKernelLauncher((T*)mid_result, (T*)gate_up_proj_out, - max_total_tokens, proj_size_, - UnaryType::SILU, cu_stream); - // cuda::MulAndSilu((T*)mid_result, (T*)gate_out, (T*)up_proj_out, - // 
max_total_tokens, proj_size_, cu_stream); - cuda::BatchGemmWraper( - final_result_array, mid_result_array, down_proj_array, block_size_, - hidden_size_, proj_size_, false, false, 1.0f, 0.0f, proj_size_, - hidden_size_, hidden_size_, max_block, cublas_handle); - cuda::FinalizeMoeRoutingKernelLauncher( - (T*)out_tensor->GetDataPtr(), (T*)final_result, - (float*)experts_score_->GetDataPtr(), - (int64_t*)indice_source_->GetDataPtr(), - (int*)topk_indice_->GetDataPtr(), total_tokens_pad_ptr, - total_token_, top_k, hidden_size_, cu_stream); - }; - DispatchCUDA(dtype_, functor); - break; - } -#endif - case DeviceType::CPU: { - LOG(ERROR) << "MOE Operator does not support " - << "CPU" - << " device type" << std::endl; - return AsStatus::ALLSPARK_RUNTIME_ERROR; - } - default: - break; - } - return AsStatus::ALLSPARK_SUCCESS; -} -REGISTER_OP(MOE, CUDA, MoeInefficientOp) -REGISTER_OP(MOE, CPU, MoeInefficientOp) -} // namespace allspark diff --git a/csrc/core/operator/general/moe_inefficient/moe_inefficient_op.h b/csrc/core/operator/general/moe_inefficient/moe_inefficient_op.h deleted file mode 100644 index f6ce1dd6..00000000 --- a/csrc/core/operator/general/moe_inefficient/moe_inefficient_op.h +++ /dev/null @@ -1,62 +0,0 @@ -/*! - * Copyright (c) Alibaba, Inc. and its affiliates. 
- * @file moe_inefficient_op.h - */ - -#pragma once - -#include - -#include - -namespace allspark { - -class MoeInefficientOp : public AsOperator { - public: - explicit MoeInefficientOp(const std::string& op_type = "") - : AsOperator(op_type) {} - AsStatus Init(const OperatorProto& op_proto, const DeviceContext& ctx, - const TensorMap& weights_map, TensorMap* tensor_map); - AsStatus Reshape() override; - AsStatus Forward() override; - - private: - DataType dtype_; - int num_expert_; - int num_expert_pertoken_; - int total_token_; - int hidden_size_; - int64_t ws_size_; - int block_size_; - int proj_size_; - int64_t expert_size_; - bool first_moe_ = true; - std::unique_ptr experts_score_; - std::unique_ptr float_gate_score_; - std::unique_ptr topk_value_; - std::unique_ptr topk_indice_; - std::unique_ptr experts_idx_; - std::unique_ptr experts_seq_; - std::unique_ptr indice_source_; - std::unique_ptr total_tokens_post_pad_; - - std::unique_ptr gate_up_proj_array_ptr; - std::unique_ptr down_proj_array_ptr; - std::unique_ptr reorder_data_array_ptr; - std::unique_ptr gate_up_proj_out_array_ptr; - std::unique_ptr mid_result_array_ptr; - std::unique_ptr final_result_array_ptr; - // - void* reorder_data; - void* gate_up_proj_out; - void* mid_result; - void* final_result; - void** gate_up_proj_array; - void** down_proj_array; - void** reorder_data_array; - void** gate_up_proj_out_array; - void** mid_result_array; - void** final_result_array; -}; - -} // namespace allspark diff --git a/csrc/core/operator/general/sgmv_lora/sgmv_lora_op_gpu.cpp b/csrc/core/operator/general/sgmv_lora/sgmv_lora_op_gpu.cpp new file mode 100644 index 00000000..36a149e0 --- /dev/null +++ b/csrc/core/operator/general/sgmv_lora/sgmv_lora_op_gpu.cpp @@ -0,0 +1,485 @@ +/*! + * Copyright (c) Alibaba, Inc. and its affiliates. 
+ * @file sgmv_lora_op_gpu.cpp + */ + +#ifdef ENABLE_CUDA +#include "sgmv_lora_op_gpu.h" + +#include +#include +#include +#include +#include + +#include + +#include "runtime/weight/weight_manager_lora.h" + +#define ALIGN_UP_TO_16(ptr) \ + reinterpret_cast((reinterpret_cast(ptr) + 15) & \ + ~uintptr_t(15)) + +#define INT_ROUND_UP_TO_16(num) (num + 15) & ~15 + +namespace allspark { +namespace cuda { +extern size_t sgmv_tmp_size(int num_problems); +} +} // namespace allspark + +namespace allspark { +AsStatus sgmv_cutlass(DataType dtype, void* out, const void* in, + const AsTensor* weight_ptrs, const AsTensor* segments, + const AsTensor* ranks, void* buf, int d_in, int d_out, + bool is_k_tensor, bool is_n_tensor, int num_problems, + bool unsplit, int unsplit_n, int max_rank, int CC, + const DeviceContext* ctx); + +AsStatus sgmv_split_qkv(DataType dtype, AsTensor* out_ptrs, const void* in, + const AsTensor* segments, const AsTensor* lora_B_ranks, + int max_rank, int num_problems, + const DeviceContext* ctx); + +AsStatus dense_gemm_rawptr(DataType dtype, void* out, const void* in, + const void* bias, const void* weight, int m, int n, + int k, int lda, int ldb, int ldc, bool transA, + bool transB, int batch, float alpha, + const void* binary_in, UnaryType activation, + const DeviceContext* ctx); + +AsStatus SgmvLoraOpGPU::Init(const OperatorProto& op_proto, + const DeviceContext& ctx, + const TensorMap& weights_map, + TensorMap* tensor_map) { + DLOG(INFO) << "SgmvLoraOpGPU::Init" << std::endl; + AS_CHECK_STATUS(AsOperator::Init(op_proto, ctx, weights_map, tensor_map)); + dtype_ = tensor_map_->at(in_names_[0])->GetDataType(); + tensor_map_->at(out_names_[0])->SetDataType(dtype_); + const int max_batch = ctx.GetModelMaxBatch(); + // check if this op is ...attention.self + if (weight_names_[0].rfind("attention.self") != std::string::npos) { + is_attention_self_ = true; + for (int i = 0; i < 3; i++) { + lora_B_weight_parts_vec_.emplace_back(std::make_shared( + 
"lora_B_weight_parts." + std::to_string(i), ctx.GetDeviceType(), + DataType::INT64, DataMode::DENSE, Shape{max_batch})); + } + } + AS_ENFORCE(ctx.GetDeviceType() == DeviceType::CUDA); // sgmv暂只支持CUDA + lora_A_weight_ptrs_ = std::make_shared( + "lora_A_weight_ptrs_", ctx.GetDeviceType(), DataType::INT64, + DataMode::DENSE, Shape{max_batch}); + lora_B_weight_ptrs_ = std::make_shared( + "lora_B_weight_ptrs_", ctx.GetDeviceType(), DataType::INT64, + DataMode::DENSE, Shape{max_batch}); + lora_ranks_ = std::make_shared("lora_ranks_", ctx.GetDeviceType(), + DataType::INT32, DataMode::DENSE, + Shape{max_batch}); + segments_ = std::make_shared("segments_", ctx.GetDeviceType(), + DataType::INT32, DataMode::DENSE, + Shape{max_batch * 2}); + if (is_attention_self_ == true) { + lora_B_ranks_ = std::make_shared( + "lora_B_ranks_", ctx.GetDeviceType(), DataType::INT32, DataMode::DENSE, + Shape{max_batch}); + temp_qkv_ptrs_ = + std::make_shared("temp_qkv_ptrs", ctx.GetDeviceType(), + DataType::INT64, DataMode::DENSE, Shape{3}); + int nslice = ctx.GetNranks(); + AS_ENFORCE(ctx.GetNumberHeads() * ctx.GetSizePerHead() % nslice == 0); + AS_ENFORCE(ctx.GetNumberGroups() * ctx.GetSizePerHead() % nslice == 0); + q_outdim_size_ = ctx.GetNumberHeads() * ctx.GetSizePerHead() / nslice; + k_outdim_size_ = ctx.GetNumberGroups() * ctx.GetSizePerHead() / nslice; + v_outdim_size_ = k_outdim_size_; + qkv_weight_dims_.emplace_back(q_outdim_size_); + qkv_weight_dims_.emplace_back(k_outdim_size_); + qkv_weight_dims_.emplace_back(v_outdim_size_); + qkv_sum_ = q_outdim_size_ + k_outdim_size_ + v_outdim_size_; + } + kernel_launcher = sgmv_cutlass; + + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) { + throw AsException("Error getting device properties in SgmvLoraOpGPU::Init"); + } + CC_ = prop.major * 10 + prop.minor; + return AsStatus::ALLSPARK_SUCCESS; +} + +AsStatus SgmvLoraOpGPU::Reshape(RuntimeContext* runtime_ctx) { + DLOG(INFO) << "SgmvLoraOpGPU::Reshape" << 
std::endl; + + // reset + max_lora_r_ = 0; + need_set_zero_ = false; + use_cublas_ = false; + lora_A_weight_ptrs_vec_.clear(); + lora_B_weight_ptrs_vec_.clear(); + lora_B_weight_parts_data_ptrs_[0].clear(); // q + lora_B_weight_parts_data_ptrs_[1].clear(); // k + lora_B_weight_parts_data_ptrs_[2].clear(); // v + lora_A_ranks_vec_.clear(); + lora_B_ranks_vec_.clear(); + segmented_batch_idx_.clear(); + + // 根据batch里req用到的lora,把不在cuda中的lora换入 + auto batchsize = + runtime_ctx->is_context ? 1 : runtime_ctx->GetGenCtxListSize(); + + // 用于batch_size维度给输入分段 + std::string last_lora_name = ""; + std::vector lora_A_weight_data_ptrs; + std::vector lora_B_weight_data_ptrs; + + const Shape& in_shape = tensor_map_->at(in_names_[0])->GetShape(); + int in_dims = in_shape.Size(); + AS_ENFORCE(in_dims == 3 && + batchsize == in_shape[0]); // {bs, seq_len, in_features} + + auto seq_len = in_shape[1]; + auto in_features = in_shape[in_dims - 1]; + int64_t out_features = 0; + + // 计算weight指针和分段batch idx + // batch中至少有一个请求带lora,不然本算子不会被调用 + // weight_names_[0] = "...lora_A.weight" weight_names_[1] = "...lora_B.weight" + for (auto i = 0; i < batchsize; i++) { + GenerateContext* gen_ctx = runtime_ctx->is_context + ? 
runtime_ctx->GetContextGenCtx() + : runtime_ctx->GetGenCtx(i); + auto lora_name = gen_ctx->gen_cfg.lora_name; + if (lora_name.empty()) { // 加载了lora权重 但请求中可以不使用 + need_set_zero_ = true; + if (last_lora_name.empty() == false) { + last_lora_name = ""; + // 前面一直带有lora,当前batch不带lora,增加end index + segmented_batch_idx_.emplace_back(i); + } + continue; + } + auto lora_weight_handle = lora_manager_->GetHandleByName(lora_name); + + // 获取weights + auto& lora_A_weight_name = weight_names_[0]; + auto lora_A_weight_p = + lora_manager_->GetLoraTensorByName(lora_name, lora_A_weight_name); + auto& lora_B_weight_name = weight_names_[1]; + auto lora_B_weight_p = + lora_manager_->GetLoraTensorByName(lora_name, lora_B_weight_name); + + // check bias + bool has_lora_bias = + lora_manager_->HasLoraBias(lora_name, lora_A_weight_name); + has_lora_bias = has_lora_bias ? true + : lora_manager_->HasLoraBias( + lora_name, lora_B_weight_name); + if (has_lora_bias == true) { + throw AsException("SgmvLoraOp does not support lora with bias!"); + } + + AS_ENFORCE(in_features == lora_A_weight_p->GetShape()[0]); + if (out_features == 0) { + out_features = lora_B_weight_p->GetShape()[1]; + if (is_attention_self_ == true) { + AS_ENFORCE(out_features == + q_outdim_size_ + k_outdim_size_ + v_outdim_size_); + } + } + // auto lora_r = std::min(weight_tensor_p->GetShape()[0], + // weight_tensor_p->GetShape()[1]); + auto lora_r = lora_A_weight_p->GetShape()[1]; + if (lora_B_weight_p->GetShape()[0] < 8 && use_cublas_ == false) { + // lora_B weight rank < 8启用cublas + use_cublas_ = true; + } + if (lora_r >= max_lora_r_) { + max_lora_r_ = lora_r; + } + + if (last_lora_name != lora_name) { + if (last_lora_name.empty() == false) { + // 前面一直带有lora,当前batch更换lora_name,增加end index + segmented_batch_idx_.emplace_back(i); + } + last_lora_name = lora_name; + lora_A_weight_data_ptrs.emplace_back( + (int64_t) + lora_A_weight_p->GetDataPtr()); // 可能有重复的weight,但没有影响 + lora_B_weight_data_ptrs.emplace_back( + 
(int64_t)lora_B_weight_p->GetDataPtr()); + if (is_attention_self_ == true) { + // attention.self处lora算子,需要列向拆分lora_B.weight{rank, + // out_features}为qkv三份 + int lora_B_rank = lora_B_weight_p->GetShape()[0]; + AS_ENFORCE(lora_B_rank * 3 == lora_r); + lora_B_ranks_vec_.emplace_back(lora_B_rank); + for (int qkv_idx = 0; qkv_idx < 3; qkv_idx++) { + lora_B_weight_parts_data_ptrs_[qkv_idx].emplace_back( + (int64_t)((char*)lora_B_weight_p->GetDataPtr() + + std::accumulate(qkv_weight_dims_.begin(), + qkv_weight_dims_.begin() + qkv_idx, 0) * + SizeofType(dtype_))); + } + } + lora_A_ranks_vec_.emplace_back(lora_r); + segmented_batch_idx_.emplace_back(i); // 新的starting batch + lora_A_weight_ptrs_vec_.emplace_back(lora_A_weight_p.get()); + lora_B_weight_ptrs_vec_.emplace_back(lora_B_weight_p.get()); + } + if (i == batchsize - 1) { + if (last_lora_name.empty() == false) { + // 遍历即将结束,当前的lora请求还没有记录end index + segmented_batch_idx_.emplace_back(batchsize); + } + } + } + AS_ENFORCE(segmented_batch_idx_.size() == lora_A_weight_data_ptrs.size() * 2); + num_problems_ = lora_A_weight_data_ptrs.size(); + + if (runtime_ctx->is_context == true) { + // context阶段bs=1,sgmv的m应该是seq_len + segmented_batch_idx_[0] = 0; + segmented_batch_idx_[1] = seq_len; + } + + // Calculate workspace size + buf_size_ = cuda::sgmv_tmp_size(num_problems_); + ws_size_ = 0; + ws_size_ += + batchsize * seq_len * max_lora_r_ * SizeofType(dtype_); // temp_(dtype) + ws_size_ = INT_ROUND_UP_TO_16(ws_size_); + ws_size_ += buf_size_; // buf_(uint8) + if (is_attention_self_ == true) { + ws_size_ = INT_ROUND_UP_TO_16(ws_size_); + ws_size_ += batchsize * seq_len * (max_lora_r_ / 3) * + SizeofType(dtype_); // temp_qkv_[0](dtype) + ws_size_ = INT_ROUND_UP_TO_16(ws_size_); + ws_size_ += batchsize * seq_len * (max_lora_r_ / 3) * + SizeofType(dtype_); // temp_qkv_[1](dtype) + ws_size_ = INT_ROUND_UP_TO_16(ws_size_); + ws_size_ += batchsize * seq_len * (max_lora_r_ / 3) * + SizeofType(dtype_); // temp_qkv_[2](dtype) + } + 
AS_CHECK_STATUS(tensor_map_->at("workspace")->SetShape(Shape{(ws_size_)})); + + // Reshape partial tensors + lora_A_weight_ptrs_->SetShape(Shape{lora_A_weight_data_ptrs.size()}); + lora_B_weight_ptrs_->SetShape(Shape{lora_B_weight_data_ptrs.size()}); + lora_ranks_->SetShape(Shape{lora_A_ranks_vec_.size()}); + segments_->SetShape(Shape{segmented_batch_idx_.size()}); + tensor_map_->at(out_names_[0]) + ->SetShape(Shape{batchsize, seq_len, out_features}); + if (is_attention_self_ == true) { + lora_B_ranks_->SetShape(Shape{lora_B_ranks_vec_.size()}); + for (int qkv_idx = 0; qkv_idx < 3; qkv_idx++) { + lora_B_weight_parts_vec_[qkv_idx]->SetShape( + Shape{lora_B_weight_parts_data_ptrs_[qkv_idx].size()}); + } + } + + // Copy data + // Init已经确保了使用CUDA + CopyData(lora_A_weight_ptrs_->GetDataPtr(), DeviceType::CUDA, + lora_A_weight_data_ptrs.data(), DeviceType::CPU, + sizeof(int64_t) * lora_A_weight_data_ptrs.size(), ctx_); + CopyData(lora_B_weight_ptrs_->GetDataPtr(), DeviceType::CUDA, + lora_B_weight_data_ptrs.data(), DeviceType::CPU, + sizeof(int64_t) * lora_B_weight_data_ptrs.size(), ctx_); + CopyData(lora_ranks_->GetDataPtr(), DeviceType::CUDA, + lora_A_ranks_vec_.data(), DeviceType::CPU, + sizeof(int32_t) * lora_A_ranks_vec_.size(), ctx_); + CopyData(segments_->GetDataPtr(), DeviceType::CUDA, + segmented_batch_idx_.data(), DeviceType::CPU, + sizeof(int32_t) * segmented_batch_idx_.size(), ctx_); + if (is_attention_self_ == true) { + CopyData(lora_B_ranks_->GetDataPtr(), DeviceType::CUDA, + lora_B_ranks_vec_.data(), DeviceType::CPU, + sizeof(int32_t) * lora_B_ranks_vec_.size(), ctx_); + for (int qkv_idx = 0; qkv_idx < 3; qkv_idx++) { + CopyData(lora_B_weight_parts_vec_[qkv_idx]->GetDataPtr(), + DeviceType::CUDA, lora_B_weight_parts_data_ptrs_[qkv_idx].data(), + DeviceType::CPU, + sizeof(int64_t) * lora_B_weight_parts_data_ptrs_[qkv_idx].size(), + ctx_); + } + } + + // 获取batch间一致的矩阵维数 + lora_A_d_in_ = in_features; + lora_A_d_out_ = 0; // fill by ranks tensor + lora_B_d_in_ 
= 0; // fill by ranks tensor + lora_B_d_out_ = out_features; + + return AsStatus::ALLSPARK_SUCCESS; +} + +AsStatus SgmvLoraOpGPU::Forward(RuntimeContext* runtime_ctx) { + DLOG(INFO) << "SgmvLoraOpGPU::Forward" << std::endl; + AsTensor* batch_in_tensor = tensor_map_->at(in_names_[0]).get(); + AsTensor* batch_out_tensor = tensor_map_->at(out_names_[0]).get(); + void* in = batch_in_tensor->GetDataPtr(); + void* out = batch_out_tensor->GetDataPtr(); + + const auto& shape_in = batch_in_tensor->GetShape(); + const auto& shape_out = batch_out_tensor->GetShape(); + AS_ENFORCE(shape_in[0] == shape_out[0]); + auto ndims_in = shape_in.Size(); + auto ndims_out = shape_out.Size(); + AS_ENFORCE(ndims_in == 3 && ndims_in == ndims_out); + auto batchsize = shape_in[0]; + auto seq_len = shape_in[1]; + + const CUDAContext* gpu_ctx = static_cast(ctx_); + cudaStream_t cu_stream = gpu_ctx->GetStream(); + + // Calculate addresses + void* ws_ptr = tensor_map_->at("workspace")->GetDataPtr(); + temp_ = ws_ptr; + buf_ = (char*)temp_ + batchsize * seq_len * max_lora_r_ * SizeofType(dtype_); + buf_ = ALIGN_UP_TO_16(buf_); + if (is_attention_self_ == true) { + temp_qkv_[0] = (char*)buf_ + buf_size_; + temp_qkv_[0] = ALIGN_UP_TO_16(temp_qkv_[0]); + temp_qkv_[1] = (char*)temp_qkv_[0] + + batchsize * seq_len * (max_lora_r_ / 3) * SizeofType(dtype_); + temp_qkv_[1] = ALIGN_UP_TO_16(temp_qkv_[1]); + temp_qkv_[2] = (char*)temp_qkv_[1] + + batchsize * seq_len * (max_lora_r_ / 3) * SizeofType(dtype_); + temp_qkv_[2] = ALIGN_UP_TO_16(temp_qkv_[2]); + CopyData(temp_qkv_ptrs_->GetDataPtr(), DeviceType::CUDA, temp_qkv_.data(), + DeviceType::CPU, sizeof(int64_t) * 3, ctx_); + } + + if (use_cublas_ == true) { + // 存在lora weight rank < 8的请求,使用cublas kernel + for (int i = 0; i < num_problems_; i++) { + int batch_start = segmented_batch_idx_[i * 2]; + int batch_end = segmented_batch_idx_[i * 2 + 1]; + int out_byte_offset = max_lora_r_ * batch_start * SizeofType(dtype_); + int in_byte_offset = shape_in[-1] * 
batch_start * SizeofType(dtype_); + + int m = batch_end - batch_start; + int n = lora_A_ranks_vec_[i]; + int k = lora_A_d_in_; + AS_CHECK_STATUS( + dense_gemm_rawptr(dtype_, (void*)((char*)temp_ + out_byte_offset), + (const void*)((char*)in + in_byte_offset), nullptr, + lora_A_weight_ptrs_vec_[i]->GetDataPtr(), m, n, k, + k, n, n, false, false, 1, 1.0 /* default alpha */, + nullptr, UNARYTYPE_UNDEFINED, ctx_)); + } + } else { + AS_CHECK_STATUS(kernel_launcher( + dtype_, temp_, batch_in_tensor->GetDataPtr(), lora_A_weight_ptrs_.get(), + segments_.get(), lora_ranks_.get(), buf_, lora_A_d_in_, lora_A_d_out_, + false, true, num_problems_, false, 0, max_lora_r_, CC_, ctx_)); + } + + if (is_attention_self_ == true) { + sgmv_split_qkv(dtype_, temp_qkv_ptrs_.get(), temp_, segments_.get(), + lora_B_ranks_.get(), max_lora_r_, num_problems_, ctx_); + for (int qkv_idx = 0; qkv_idx < 3; qkv_idx++) { + // output_parts_[qkv_idx] = temp_ @ lora_B_weight_part[qkv_idx], shape: + // {bs, seq_len, qkv_weight_dims_[qkv_idx]} + if (use_cublas_ == true) { + for (int i = 0; i < num_problems_; i++) { + int batch_start = segmented_batch_idx_[i * 2]; + int batch_end = segmented_batch_idx_[i * 2 + 1]; + int out_byte_offset = qkv_sum_ * batch_start * SizeofType(dtype_); + int in_byte_offset = + (max_lora_r_ / 3) * batch_start * SizeofType(dtype_); + + int m = batch_end - batch_start; + int n = qkv_weight_dims_[qkv_idx]; + int k = lora_B_ranks_vec_[i]; + AS_CHECK_STATUS(dense_gemm_rawptr( + dtype_, + (void*)((char*)batch_out_tensor->GetDataPtr() + out_byte_offset + + std::accumulate(qkv_weight_dims_.begin(), + qkv_weight_dims_.begin() + qkv_idx, 0) * + SizeofType(dtype_)), + (const void*)((char*)temp_qkv_[qkv_idx] + in_byte_offset), + nullptr, + (void*)(lora_B_weight_parts_data_ptrs_[qkv_idx][i] + + std::accumulate(qkv_weight_dims_.begin(), + qkv_weight_dims_.begin() + qkv_idx, 0) * + SizeofType(dtype_)), + m, n, k, k, qkv_sum_, qkv_sum_, false, false, 1, + 1.0 /* default alpha */, nullptr, 
UNARYTYPE_UNDEFINED, ctx_)); + } + } else { + AS_CHECK_STATUS(kernel_launcher( + dtype_, + (void*)((char*)batch_out_tensor->GetDataPtr() + + std::accumulate(qkv_weight_dims_.begin(), + qkv_weight_dims_.begin() + qkv_idx, 0) * + SizeofType(dtype_)), + temp_qkv_[qkv_idx], lora_B_weight_parts_vec_[qkv_idx].get(), + segments_.get(), lora_B_ranks_.get(), buf_, lora_B_d_in_, + qkv_weight_dims_[qkv_idx], true, false, num_problems_, true, + qkv_sum_, max_lora_r_, CC_, ctx_)); + } + } + } else { + // batch_out_tensor = temp_ @ lora_B, shape: {bs, seq_len, out_features} + if (use_cublas_ == true) { + for (int i = 0; i < num_problems_; i++) { + int batch_start = segmented_batch_idx_[i * 2]; + int batch_end = segmented_batch_idx_[i * 2 + 1]; + int out_byte_offset = + batch_out_tensor->GetShape()[-1] * batch_start * SizeofType(dtype_); + int in_byte_offset = max_lora_r_ * batch_start * SizeofType(dtype_); + + int m = batch_end - batch_start; + int n = lora_B_d_out_; + int k = lora_A_ranks_vec_[i]; // 非attention.self的场景rank_A == rank_B + AS_CHECK_STATUS(dense_gemm_rawptr( + dtype_, + (void*)((char*)batch_out_tensor->GetDataPtr() + out_byte_offset), + (const void*)((char*)temp_ + in_byte_offset), nullptr, + lora_B_weight_ptrs_vec_[i]->GetDataPtr(), m, n, k, k, n, n, false, + false, 1, 1.0 /* default alpha */, nullptr, UNARYTYPE_UNDEFINED, + ctx_)); + } + } else { + AS_CHECK_STATUS(kernel_launcher( + dtype_, batch_out_tensor->GetDataPtr(), temp_, + lora_B_weight_ptrs_.get(), segments_.get(), lora_ranks_.get(), buf_, + lora_B_d_in_, lora_B_d_out_, true, false, num_problems_, false, 0, + max_lora_r_, CC_, ctx_)); + } + } + + if (need_set_zero_ == true) { + // batch有请求不带lora_name + int last_end_idx = 0; + for (int i = 0; i < segmented_batch_idx_.size() - 1; i += 2) { + if (segmented_batch_idx_[i] != last_end_idx) { + // 这段tensor需要赋0 + int row_count = segmented_batch_idx_[i] - last_end_idx; + AS_CHECK_CUDA(cudaMemsetAsync( + (char*)batch_out_tensor->GetDataPtr() + + last_end_idx * 
lora_B_d_out_ * SizeofType(dtype_), + 0, SizeofType(dtype_) * lora_B_d_out_ * row_count, cu_stream)); + } + last_end_idx = segmented_batch_idx_[i + 1]; + } + if (segmented_batch_idx_[segmented_batch_idx_.size() - 1] != batchsize) { + // batch的最后需要赋0 + int row_count = + batchsize - segmented_batch_idx_[segmented_batch_idx_.size() - 1]; + AS_CHECK_CUDA(cudaMemsetAsync( + (char*)batch_out_tensor->GetDataPtr() + + segmented_batch_idx_[segmented_batch_idx_.size() - 1] * + lora_B_d_out_ * SizeofType(dtype_), + 0, SizeofType(dtype_) * lora_B_d_out_ * row_count, cu_stream)); + } + } + + return AsStatus::ALLSPARK_SUCCESS; +} + +REGISTER_OP(SgmvLora, CUDA, SgmvLoraOpGPU) +} // namespace allspark +#endif diff --git a/csrc/core/operator/general/sgmv_lora/sgmv_lora_op_gpu.h b/csrc/core/operator/general/sgmv_lora/sgmv_lora_op_gpu.h new file mode 100644 index 00000000..e3dca55f --- /dev/null +++ b/csrc/core/operator/general/sgmv_lora/sgmv_lora_op_gpu.h @@ -0,0 +1,108 @@ +/*! + * Copyright (c) Alibaba, Inc. and its affiliates. 
+ * @file sgmv_lora_op_gpu.h + */ + +#pragma once +#ifdef ENABLE_CUDA +#include +#include +#include + +namespace allspark { +class SgmvLoraOpGPU : public AsOperator { + public: + SgmvLoraOpGPU(const std::string& op_type = "") : AsOperator(op_type) { + is_lora_op_ = true; + temp_qkv_.resize(3); + lora_B_weight_parts_data_ptrs_.resize(3); + } + AsStatus Init(const OperatorProto& op_proto, const DeviceContext& ctx, + const TensorMap& weights_map, TensorMap* tensor_map) override; + AsStatus Reshape(RuntimeContext* runtime_ctx) override; + AsStatus Forward(RuntimeContext* runtime_ctx) override; + + private: + AsStatus (*kernel_launcher)(DataType dtype, void* out, const void* in, + const AsTensor* weight_ptrs, + const AsTensor* segments, const AsTensor* ranks, + void* buf, int d_in, int d_out, bool is_k_tensor, + bool is_n_tensor, int num_problems, bool unsplit, + int unsplit_n, int max_rank, int CC, + const DeviceContext* ctx) = nullptr; + // type: int64, shape: {>=#lora_weights}(may have duplicate ptr) + std::shared_ptr lora_A_weight_ptrs_; + std::shared_ptr lora_B_weight_ptrs_; + + // type: int32, shape: {lora_A/B_weight_ptrs_.shape[0]} + std::shared_ptr lora_ranks_; + + // for attention.self + // type: int32, shape: {lora_B_weight_ptrs_.shape[0]} + std::shared_ptr lora_B_ranks_; + + // type: int32, shape: {lora_A/B_weight_ptrs_.shape[0] * 2} + std::shared_ptr segments_; + // std::vector segments_; + // segments_[weight_idx * 2] == starting batch idx + // segments_[weight_idx * 2 + 1] == ending batch idx + 1 + + // type: dtype, shape: {bs, seq_len, max_rank} + // temp_ = input @ lora_A + void* temp_; + + // type: dtype, shape: {bs, seq_len, max_rank / 3} + std::shared_ptr temp_qkv_ptrs_; + std::vector temp_qkv_; + + // type: uint8, shape: {sgmv_tmp_size} + void* buf_; + + // store 3 weight pointer tensors, each: + // type: int64_t, shape: {lora_B_weight_ptrs_.shape[0]} + std::vector> lora_B_weight_parts_vec_; + + int num_problems_ = 0; + int max_lora_r_ = 0; + float 
lora_scaling_ = 1.0; + bool has_lora_bias_ = false; + bool is_attention_self_ = false; + + // quant BEGIN + // std::string inner_gemm_type_ = "Gemm"; + bool use_quant_ = false; + // OperatorProto quant_op_proto_; + // quant END + + int lora_A_d_in_; + int lora_A_d_out_; + + int lora_B_d_in_; + int lora_B_d_out_; + + std::vector lora_A_weight_ptrs_vec_; + std::vector lora_B_weight_ptrs_vec_; + std::vector> lora_B_weight_parts_data_ptrs_; + std::vector lora_A_ranks_vec_; + std::vector lora_B_ranks_vec_; + std::vector segmented_batch_idx_; + + bool use_cublas_ = false; + + // batch中有不带lora_name的请求,output tensor相应位置需赋0 + bool need_set_zero_ = false; + + int CC_; // compute capability + + DataType dtype_ = DATATYPE_UNDEFINED; + + // col-wise split lora_B.weight to 3 parts for qkv + dim_t q_outdim_size_{0}, k_outdim_size_{0}, v_outdim_size_{0}; + std::vector qkv_weight_dims_; + int qkv_sum_ = 0; + + int64_t ws_size_ = 0; + size_t buf_size_ = 0; +}; +} // namespace allspark +#endif diff --git a/csrc/core/operator/general/sgmv_lora/sgmv_op_gpu.cpp b/csrc/core/operator/general/sgmv_lora/sgmv_op_gpu.cpp new file mode 100644 index 00000000..534c9019 --- /dev/null +++ b/csrc/core/operator/general/sgmv_lora/sgmv_op_gpu.cpp @@ -0,0 +1,96 @@ +/*! + * Copyright (c) Alibaba, Inc. and its affiliates. 
+ * @file sgmv_op_gpu.cpp + */ + +#ifdef ENABLE_CUDA +#include +#include +#include +#include +#include +#include + +#include +#ifdef ENABLE_FP16 +#ifdef ENABLE_CUDA +#include +#else +#include +#endif +#endif +#ifdef ENABLE_BF16 +#include +#endif + +namespace allspark { +AsStatus sgmv_cutlass(DataType dtype, void* out, const void* in, + const AsTensor* weight_ptrs, const AsTensor* segments, + const AsTensor* ranks, void* buf, int d_in, int d_out, + bool is_k_tensor, bool is_n_tensor, int num_problems, + bool unsplit, int unsplit_n, int max_rank, int CC, + const DeviceContext* ctx) { + const CUDAContext* gpu_ctx = static_cast(ctx); + cudaStream_t cu_stream = gpu_ctx->GetStream(); + + auto functor = [&]() { + T* typed_out = static_cast(out); + const T* typed_input = static_cast(in); + const T** typed_weight_ptrs = + static_cast(weight_ptrs->GetDataPtr()); + const int32_t* typed_segments = + static_cast(segments->GetDataPtr()); + const int32_t* typed_ranks = + static_cast(ranks->GetDataPtr()); + cuda::SgmvCutlass(typed_out, typed_input, typed_weight_ptrs, + typed_segments, typed_ranks, buf, is_k_tensor, + is_n_tensor, num_problems, d_in, d_out, unsplit, + unsplit_n, max_rank, CC, cu_stream); + }; + // dispatch + switch (dtype) { +#ifdef ENABLE_FP16 + case DataType::FLOAT16: { + std::forward(functor).template operator()(); + break; + } +#endif +#ifdef ENABLE_BF16 + case DataType::BFLOAT16: { + std::forward(functor) + .template operator()(); + break; + } +#endif + default: { + LOG(ERROR) << "unsupported datatype " << DataType_Name(dtype) + << " for SgmvCutlass"; + throw AsException("ALLSPARK_RUNTIME_ERROR"); + } + } + return AsStatus::ALLSPARK_SUCCESS; +} + +AsStatus sgmv_split_qkv(DataType dtype, AsTensor* out_ptrs, const void* in, + const AsTensor* segments, const AsTensor* lora_B_ranks, + int max_rank, int num_problems, + const DeviceContext* ctx) { + const CUDAContext* gpu_ctx = static_cast(ctx); + cudaStream_t cu_stream = gpu_ctx->GetStream(); + + auto functor = 
[&]() { + T** typed_out_ptrs = static_cast(out_ptrs->GetDataPtr()); + const T* typed_input = static_cast(in); + const int32_t* typed_segments = + static_cast(segments->GetDataPtr()); + const int32_t* typed_lora_B_ranks = + static_cast(lora_B_ranks->GetDataPtr()); + cuda::SgmvSplitQKV(typed_out_ptrs, typed_input, typed_segments, + typed_lora_B_ranks, max_rank, num_problems, + cu_stream); + }; + DispatchCUDA(dtype, functor); + return AsStatus::ALLSPARK_SUCCESS; +} +} // namespace allspark +#endif diff --git a/csrc/core/operator/generate_opt/batch_mha/batch_mha_op.h b/csrc/core/operator/generate_opt/batch_mha/batch_mha_op.h index 6f77cb14..2de09c9d 100644 --- a/csrc/core/operator/generate_opt/batch_mha/batch_mha_op.h +++ b/csrc/core/operator/generate_opt/batch_mha/batch_mha_op.h @@ -6,11 +6,6 @@ #pragma once #include -#if 0 // def ENABLE_CUDA -#include -#include -#include -#endif // ENABLE_CUDA #include "env_config.h" diff --git a/csrc/core/operator/generate_opt/generate/generate_op.cpp b/csrc/core/operator/generate_opt/generate/generate_op.cpp index ddeed3a1..a82a28b8 100644 --- a/csrc/core/operator/generate_opt/generate/generate_op.cpp +++ b/csrc/core/operator/generate_opt/generate/generate_op.cpp @@ -224,10 +224,10 @@ AsStatus GenerateOp::Init(const OperatorProto& op_proto, tensor_map_->at(out_names_[3])->SetDataType(dtype_); } presence_penalty_list = std::make_unique( - "repetition_penalty_list", backend, DataType::FLOAT32, DataMode::DENSE, + "presence_penalty_list", backend, DataType::FLOAT32, DataMode::DENSE, Shape{max_batch}); repetition_penalty_list = std::make_unique( - "presence_penalty_list", backend, DataType::FLOAT32, DataMode::DENSE, + "repetition_penalty_list", backend, DataType::FLOAT32, DataMode::DENSE, Shape{max_batch}); frequency_penalty_list = std::make_unique( "frequency_penalty_list", backend, DataType::FLOAT32, DataMode::DENSE, diff --git a/csrc/core/operator/generate_opt/span_attn/span_attn_op_cuda.h 
b/csrc/core/operator/generate_opt/span_attn/span_attn_op_cuda.h index 391763b7..54b294f7 100644 --- a/csrc/core/operator/generate_opt/span_attn/span_attn_op_cuda.h +++ b/csrc/core/operator/generate_opt/span_attn/span_attn_op_cuda.h @@ -6,12 +6,12 @@ #include #include -#include #include #include "core/kernel/kernel.h" #include "cuda/cuda_kernel_span_cache.h" -#include "span_attn_op.h" // NOLINT +#include "span_attn_op.h" // NOLINT +#include namespace allspark { diff --git a/csrc/core/operator/nccl/allgather/allgather_op.cpp b/csrc/core/operator/nccl/allgather/allgather_op.cpp index d212ffa3..fba57eae 100644 --- a/csrc/core/operator/nccl/allgather/allgather_op.cpp +++ b/csrc/core/operator/nccl/allgather/allgather_op.cpp @@ -35,6 +35,8 @@ void nccl_allgather_launcher(DataType dtype, void* out, void* in, if (nranks > 1) { ncclDataType_t nccl_dtype = GetNcclType(dtype); + // sync to fix hang issue in some devices, no impact on performance + cuda_ctx->Synchronize(); AS_CHECK_NCCL(ncclAllGather(in, tmp_data, count, nccl_dtype, cuda_ctx->GetNCCLComm(), cuda_ctx->GetStream())); @@ -46,6 +48,8 @@ void nccl_allgather_launcher(DataType dtype, void* out, void* in, cuda_ctx->GetStream()); }; DispatchCUDA(dtype, functor); + // sync to fix hang issue in some devices, no impact on performance + cuda_ctx->Synchronize(); } else { AS_CHECK_CUDA(cudaMemcpyAsync(out, in, count * SizeofType(dtype), cudaMemcpyDeviceToDevice, diff --git a/csrc/core/operator/operator.cpp b/csrc/core/operator/operator.cpp index 5ba71e73..224a027e 100644 --- a/csrc/core/operator/operator.cpp +++ b/csrc/core/operator/operator.cpp @@ -6,6 +6,10 @@ #include "operator.h" // NOLINT #ifdef ENABLE_CUDA #include +#include +#ifdef ENABLE_BF16 +#include +#endif #endif #include #include @@ -15,6 +19,21 @@ #include namespace allspark { +#ifdef ENABLE_CUDA +void ValidateNumeric(const AsTensor* tensor_ptr) { + void* p = tensor_ptr->GetDataPtr(); + size_t size = tensor_ptr->GetSizeInByte(); + if (tensor_ptr->GetDeviceType() 
== DeviceType::CUDA) { + if (tensor_ptr->GetDataType() == DataType::FLOAT16) + cuda_util::CheckInfNan<__half>((half*)p, size); +#ifdef ENABLE_BF16 + else if (tensor_ptr->GetDataType() == DataType::BFLOAT16) + cuda_util::CheckInfNan<__nv_bfloat16>((__nv_bfloat16*)p, size); +#endif + } +} +#endif + std::map DNNLOpContext::unary_algo_map_ = { {UnaryType::TANH, dnnl::algorithm::eltwise_tanh}, {UnaryType::GELU_ERF, dnnl::algorithm::eltwise_gelu_erf}, @@ -71,14 +90,29 @@ void AsOperator::PrintInformation() { std::cout << "op_inputs:" << std::endl; for (auto& v : in_names_) { std::cout << tensor_map_->at(v)->ToString() << std::endl; +#ifdef ENABLE_CUDA + if (backend == DeviceType::CUDA) { + ValidateNumeric(tensor_map_->at(v).get()); + } +#endif } std::cout << "op_weights:" << std::endl; for (auto& v : weights_) { std::cout << v->ToString() << std::endl; +#ifdef ENABLE_CUDA + if (backend == DeviceType::CUDA) { + ValidateNumeric(v); + } +#endif } std::cout << "op_outputs:" << std::endl; for (auto& v : out_names_) { std::cout << tensor_map_->at(v)->ToString() << std::endl; +#ifdef ENABLE_CUDA + if (backend == DeviceType::CUDA) { + ValidateNumeric(tensor_map_->at(v).get()); + } +#endif } std::cout << std::endl; } @@ -268,6 +302,7 @@ AsStatus AsOperator::Init(const OperatorProto& op_proto, tensor_map_->insert(std::make_pair( t_name, std::make_unique(t, ctx.GetDeviceType()))); } + if (is_lora_op_ && out_names_.size() > 0) continue; out_names_.emplace_back(t_name); } for (auto& t : op_proto.weights()) { @@ -276,10 +311,9 @@ AsStatus AsOperator::Init(const OperatorProto& op_proto, // only in order to make it can be run by GemmOpBase::Init && Reshape, // which is called by GemmLora weight_names_.emplace_back(t_name); - fake_weight_ = std::make_unique( + weights_.emplace_back(new AsTensor( // 这里不能使用智能指针 t_name, DeviceType::CUDA, DataType::INT8, DataMode::DENSE, - Shape({1, 3})); // 3 for qkv shape check - weights_.emplace_back(fake_weight_.get()); + Shape({1, 3}))); // 3 for qkv 
shape check } else if (weight_manager_) { auto weight_tensor_p = weight_manager_->GetWeightTensor(weight_handler_, rank_info_, t_name); diff --git a/csrc/core/operator/operator.h b/csrc/core/operator/operator.h index 173157c1..4f07dd64 100644 --- a/csrc/core/operator/operator.h +++ b/csrc/core/operator/operator.h @@ -110,7 +110,6 @@ class AsOperator { // for LoRA only: std::vector weight_names_; // currently only used by GemmLora - std::unique_ptr fake_weight_; // currently only used by GemmLora std::set tainted_lora_names_; // lora一旦被卸载过,就标记为tainted,因为有可能用户修改权重后以同样的名字重新加载 diff --git a/csrc/core/tensor/tensor.h b/csrc/core/tensor/tensor.h index 8ea4a041..1ab9361d 100644 --- a/csrc/core/tensor/tensor.h +++ b/csrc/core/tensor/tensor.h @@ -352,6 +352,15 @@ class TensorUtils { std::vector>& src_arr, const DeviceContext* ctx = nullptr); + static void ConcatMatrix2DColWiseBatched( + AsTensor& dst, std::vector>& src_arr, + const DeviceContext* ctx = nullptr); + + static void ConcatMatrix2DColWiseBatchedRawPtr( + AsTensor& dst, std::vector& src_ptr_arr, + DeviceType src_device_type, std::vector& src_shapes, + DataType src_dtype, const DeviceContext* ctx = nullptr); + static void DeepCopyVectorPartAsync(AsTensor& dst, size_t dst_col_offset, const AsTensor& src, size_t src_col_offset, size_t len, diff --git a/csrc/core/tensor/tensor_utils.cpp b/csrc/core/tensor/tensor_utils.cpp index 98dfecf9..d053d398 100644 --- a/csrc/core/tensor/tensor_utils.cpp +++ b/csrc/core/tensor/tensor_utils.cpp @@ -460,8 +460,8 @@ void TensorUtils::DeepCopyMatrix2DPartFromBatch( region_width + dst_col_offset > dst.shape_[1]) { char buf[1024]; sprintf(buf, - "region_height:%ld region_width:%ld src_row_offset:%ld " - "src_col_offset:%ld dst_row_offset:%ld dst_col_offset:%ld " + "region_height:%d region_width:%d src_row_offset:%d " + "src_col_offset:%d dst_row_offset:%d dst_col_offset:%d " "src.shape(%ld,%ld) dst.shape(%ld,%ld)\n", region_height, region_width, src_row_offset, src_col_offset, 
dst_row_offset, dst_col_offset, src.shape_[1], src.shape_[2], @@ -584,6 +584,81 @@ void TensorUtils::ConcatMatrix2DColWise( "CUDAs"); #endif } + +void TensorUtils::ConcatMatrix2DColWiseBatched( + AsTensor& dst, std::vector>& src_arr, + const DeviceContext* ctx) { +#ifdef ENABLE_CUDA + const auto& dst_shape = dst.GetShape(); + AS_ENFORCE(dst_shape.Size() == 3); + AS_ENFORCE(dst.GetDeviceType() == DeviceType::CUDA); + auto dst_batch_stride = dst_shape.Count(1) * SizeofType(dst.GetDataType()); + char* dst_ptr = (char*)dst.GetDataPtr(); + auto dpitch = dst_shape.Count(2) * SizeofType(dst.GetDataType()); + for (auto i = 0; i < src_arr.size(); i++) { + auto& src = *src_arr.at(i); + AS_ENFORCE(src.GetDeviceType() == DeviceType::CUDA); + cudaMemcpyKind kind = + GetCudaMemcpyKind(src.GetDeviceType(), dst.GetDeviceType()); + const auto& src_shape = src.GetShape(); + AS_ENFORCE(src_shape.Size() == 3); + auto region_height = src_shape[0] * src_shape[1]; + auto region_width_nbytes = src.GetStrideInByte(); + if (ctx) + AS_CHECK_CUDA(cudaMemcpy2DAsync( + dst_ptr, dpitch, src.GetDataPtr(), region_width_nbytes, + region_width_nbytes, region_height, kind, + static_cast(ctx)->GetStream())); + else + AS_CHECK_CUDA(cudaMemcpy2D(dst_ptr, dpitch, src.GetDataPtr(), + region_width_nbytes, region_width_nbytes, + region_height, kind)); + dst_ptr += region_width_nbytes; + } +#else + throw AsException( + "Currently, TensorUtils::ConcatMatrix2DColWise is only supported between " + "CUDAs"); +#endif +} + +void TensorUtils::ConcatMatrix2DColWiseBatchedRawPtr( + AsTensor& dst, std::vector& src_ptr_arr, DeviceType src_device_type, + std::vector& src_shapes, DataType src_dtype, + const DeviceContext* ctx) { +#ifdef ENABLE_CUDA + const auto& dst_shape = dst.GetShape(); + AS_ENFORCE(dst_shape.Size() == 3); + AS_ENFORCE(dst.GetDeviceType() == DeviceType::CUDA); + auto dst_batch_stride = dst_shape.Count(1) * SizeofType(dst.GetDataType()); + char* dst_ptr = (char*)dst.GetDataPtr(); + auto dpitch = 
dst_shape.Count(2) * SizeofType(dst.GetDataType()); + for (auto i = 0; i < src_ptr_arr.size(); i++) { + void* src_ptr = src_ptr_arr.at(i); + auto& src_shape = src_shapes.at(i); + AS_ENFORCE(src_device_type == DeviceType::CUDA); + cudaMemcpyKind kind = + GetCudaMemcpyKind(src_device_type, dst.GetDeviceType()); + AS_ENFORCE(src_shape.Size() == 3); + auto region_height = src_shape[0] * src_shape[1]; + auto region_width_nbytes = src_shape[-1] * SizeofType(src_dtype); + if (ctx) + AS_CHECK_CUDA( + cudaMemcpy2DAsync(dst_ptr, dpitch, src_ptr, region_width_nbytes, + region_width_nbytes, region_height, kind, + static_cast(ctx)->GetStream())); + else + AS_CHECK_CUDA(cudaMemcpy2D(dst_ptr, dpitch, src_ptr, region_width_nbytes, + region_width_nbytes, region_height, kind)); + dst_ptr += region_width_nbytes; + } +#else + throw AsException( + "Currently, TensorUtils::ConcatMatrix2DColWise is only supported between " + "CUDAs"); +#endif +} + std::shared_ptr TensorUtils::DeepCopyDLTensorMapToTensorMap( std::shared_ptr in_map, const DeviceType target_device_type) { if (!in_map) return nullptr; diff --git a/csrc/device/cpu/cpu_context.h b/csrc/device/cpu/cpu_context.h index 9c9a6936..4cb13487 100644 --- a/csrc/device/cpu/cpu_context.h +++ b/csrc/device/cpu/cpu_context.h @@ -35,7 +35,9 @@ class DNNLEngine { class CPUContext : public DeviceContext { public: - CPUContext() : cpu_id_(0), stream_(DNNLEngine::GetInstance().GetEngine()) { + CPUContext() : cpu_id_(0), stream_(DNNLEngine::GetInstance().GetEngine()) {} + + virtual void Init() override { int nthread = cpu::get_max_threads(); SetNumThreads(nthread); } diff --git a/csrc/device/cuda/cuda_cache_allocator.cpp b/csrc/device/cuda/cuda_cache_allocator.cpp index ccdc221b..d835a58c 100644 --- a/csrc/device/cuda/cuda_cache_allocator.cpp +++ b/csrc/device/cuda/cuda_cache_allocator.cpp @@ -12,7 +12,7 @@ #include "device/bfc_allocator.h" #include "utility/check.h" -#define NVML_USE_V2_API +// #define NVML_USE_V2_API namespace allspark { @@ 
-249,4 +249,4 @@ int64_t CudaCacheAllocator::GetDeviceUsedMemory() const { #undef CHECK_NVML_RET #undef CHECK_NVML -} // namespace allspark +} // namespace allspark \ No newline at end of file diff --git a/csrc/device/cuda/cuda_context.cpp b/csrc/device/cuda/cuda_context.cpp index a1dabba5..ace650eb 100644 --- a/csrc/device/cuda/cuda_context.cpp +++ b/csrc/device/cuda/cuda_context.cpp @@ -21,6 +21,8 @@ namespace allspark { +thread_local int CUDAContext::last_device_id_of_this_thread_ = -1; + CUDAContext::~CUDAContext() { try { if (hiednn_handle_) { @@ -44,11 +46,22 @@ CUDAContext::~CUDAContext() { // avoid the gcc warnning. LOG(ERROR) << "Exception in destroy cuda context."; } + + // reset the thread local value, it's a static value. + last_device_id_of_this_thread_ = -1; } void CUDAContext::SetDeviceId(int device_id) { DLOG(INFO) << "CUDAContext::SetDeviceId()" << device_id << std::endl; + LOG(INFO) << "local id : " << last_device_id_of_this_thread_ + << " setting value: " << device_id; + if (last_device_id_of_this_thread_ == device_id) { + LOG(INFO) << " CUDAContext::by pass thread id setting, since last thread " + "have same value"; + return; + } + // if have old handler, destory the device id and handler. 
if (hiednn_handle_) { AS_CHECK_CUDA(cudaSetDevice(device_id_)); @@ -87,6 +100,7 @@ void CUDAContext::SetDeviceId(int device_id) { cslt_handle_initialized_ = true; } #endif + last_device_id_of_this_thread_ = device_id_; } void CUDAContext::SetSparsityMatmulMode(bool enable_sparsity_matmul) { diff --git a/csrc/device/cuda/cuda_context.h b/csrc/device/cuda/cuda_context.h index 9fb32cd7..57caac85 100644 --- a/csrc/device/cuda/cuda_context.h +++ b/csrc/device/cuda/cuda_context.h @@ -52,6 +52,8 @@ class CUDAContext : public DeviceContext { virtual ~CUDAContext() override; + virtual void Init() override{}; + DeviceType GetDeviceType() const override { return DeviceType::CUDA; } void SetNumThreads(int num_threads) override { @@ -115,6 +117,7 @@ class CUDAContext : public DeviceContext { private: int device_id_ = 0; + thread_local static int last_device_id_of_this_thread_; cudaStream_t stream_; cublasHandle_t cublas_handle_; cublasLtHandle_t cublaslt_handle_; diff --git a/csrc/interface/allspark.h b/csrc/interface/allspark.h index 448b1456..b1bf8d64 100644 --- a/csrc/interface/allspark.h +++ b/csrc/interface/allspark.h @@ -148,7 +148,9 @@ struct GenerateConfig { std::map vocab; /// model tokenizer vocab VocabType vocab_type; /// model tokenizer type // deprecated options - + std::string uuid = + "Default-UUID(deprecated)"; ///< deprecated option, will filled by engine + ///< internally. std::string user_request_id = "Default-User-UUID"; /// This uuid should only use for logging. 
@@ -157,21 +159,25 @@ struct GenerateConfig { class AsModelConfig { public: - static const int default_engine_max_length = 2048; - static const int default_engine_max_batch = 32; - static const int default_engine_max_prefill_length = 0; - static const int default_swap_threshold = -1; - static const int default_num_thread = 0; + static constexpr int default_engine_max_length = 2048; + static constexpr int default_engine_max_batch = 32; + static constexpr int default_engine_max_prefill_length = 0; + static constexpr int default_swap_threshold = -1; + static constexpr int default_num_thread = 0; static constexpr const char* default_matmul_precision = "highest"; static constexpr const char* default_compute_unit = "CUDA:0"; - static const int default_span_size = 32; - static const AsMHAPrefill default_prefill_mode = + static constexpr int default_span_size = 32; + static constexpr AsMHAPrefill default_prefill_mode = AsMHAPrefill::AsPrefillDefault; - static const AsCacheMode default_kv_cache_mode = AsCacheMode::AsCacheDefault; - static const AsEvictionStrategy default_eviction_strategy = + static constexpr AsCacheMode default_kv_cache_mode = + AsCacheMode::AsCacheDefault; + static constexpr AsEvictionStrategy default_eviction_strategy = AsEvictionStrategy::MaxLength; - static const AsSchedulingStrategy default_scheduling_strategy = + static constexpr AsSchedulingStrategy default_scheduling_strategy = AsSchedulingStrategy::ContextPriority; + static constexpr int default_lora_max_rank = 64; + static constexpr int default_lora_max_num = 5; + AsModelConfig(); AsModelConfig( std::string in_model_name, std::string in_model_path, @@ -190,7 +196,9 @@ class AsModelConfig { AsCacheMode in_cache_mode = default_kv_cache_mode, AsEvictionStrategy in_eviction_strategy = default_eviction_strategy, AsSchedulingStrategy in_scheduling_strategy = default_scheduling_strategy, - bool enable_sparsity_matmul = false); + bool enable_sparsity_matmul = false, + int lora_max_rank = 
default_lora_max_rank, + int lora_max_num = default_lora_max_num); bool operator==(const AsModelConfig& other) const { return model_name == other.model_name && model_path == other.model_path && @@ -202,8 +210,7 @@ class AsModelConfig { num_threads == other.num_threads && matmul_precision == other.matmul_precision && swap_threshold == other.swap_threshold && - text_graph == other.text_graph && - is_lora == other.is_lora && // TODO: compare lora_names ??? + text_graph == other.text_graph && is_lora_cfg == other.is_lora_cfg && cache_span_size == other.cache_span_size && cache_span_num_init == other.cache_span_num_init && cache_span_num_grow == other.cache_span_num_grow && @@ -212,7 +219,9 @@ class AsModelConfig { prefix_cache_ttl == other.prefix_cache_ttl && prefill_mode == other.prefill_mode && eviction_strategy == other.eviction_strategy && - enable_sparsity_matmul == other.enable_sparsity_matmul; + enable_sparsity_matmul == other.enable_sparsity_matmul && + lora_max_rank == other.lora_max_rank && + lora_max_num == other.lora_max_num; } std::string ToString() const; // detail see as_engine.cpp @@ -224,7 +233,7 @@ class AsModelConfig { std::string compute_unit = default_compute_unit; std::string matmul_precision = default_matmul_precision; int num_threads = default_num_thread; - bool is_lora = false; + bool is_lora_cfg = false; // 表示该配置的类型,是for基模的,还是for LoRA的 int64_t swap_threshold = default_swap_threshold; // -1: 不swap 0: swap全部tensors @@ -232,10 +241,9 @@ class AsModelConfig { int engine_max_length = default_engine_max_length; int engine_max_batch = default_engine_max_batch; int engine_max_prefill_length = default_engine_max_prefill_length; - std::vector lora_names; - int cache_span_size = default_span_size; // deprecated - int cache_span_num_init = 0; // deprecated - int cache_span_num_grow = 0; // deprecated + int cache_span_size = default_span_size; + int cache_span_num_init = 0; // deprecated + int cache_span_num_grow = 0; // deprecated bool enable_prefix_cache 
= true; int prefix_cache_ttl = 300; AsCacheMode cache_mode = default_kv_cache_mode; @@ -244,6 +252,9 @@ class AsModelConfig { AsSchedulingStrategy scheduling_strategy = default_scheduling_strategy; bool text_graph = false; bool enable_sparsity_matmul = false; + std::vector lora_names; // deprecated, not used any longer + int lora_max_rank = default_lora_max_rank; + int lora_max_num = default_lora_max_num; }; /** diff --git a/csrc/interface/allspark_check.h b/csrc/interface/allspark_check.h index 90b328d6..9d393f42 100644 --- a/csrc/interface/allspark_check.h +++ b/csrc/interface/allspark_check.h @@ -71,11 +71,17 @@ enum class AsStatus : int { ALLSPARK_EMPTY_REQUEST = 9, // 无有效请求 ALLSPARK_ILLEGAL_REQUEST_ID = 10, // 没有找到有效的request_id ALLSPARK_CACHE_MEMORY_OUT = 11, - ALLSPARK_REQUEST_DENIED = 12, // 停服务状态,拒绝服务 - ALLSPARK_LORA_NOT_LOADED = 13, // 用户指定的lora没加载 - ALLSPARK_CHUNK_PREFILL = 14, // 长请求首包分割进行 - ALLSPARK_DEPRECATED = 20, // 触发过时接口 - ALLSPARK_STREAMING = 200, // 流式返回 + ALLSPARK_REQUEST_DENIED = 12, // 停服务状态,拒绝服务 + // ALLSPARK_LORA_NOT_LOADED = 13, // 用户指定的lora没加载. 
+ // 已经停用,ALLSPARK_LORA_NOT_FOUND 代替 + ALLSPARK_CHUNK_PREFILL = 14, // 长请求首包分割进行 + ALLSPARK_DEPRECATED = 20, // 触发过时接口 + ALLSPARK_LORA_NUM_EXCEED_LIMIT_ERROR = 21, // lora数量超限 + ALLSPARK_LORA_RANK_EXCEED_LIMIT_ERROR = 22, // lora rank超限 + ALLSPARK_LORA_NOT_FOUND = 23, // lora 没找到 + ALLSPARK_LORA_ALREADY_LOADED = 24, // lora 已加载 + ALLSPARK_LORA_IN_USE = 25, // lora 使用中 + ALLSPARK_STREAMING = 200, // 流式返回 }; const std::string AsGetErrorByCode(AsStatus error_code); diff --git a/csrc/proto/allspark.proto b/csrc/proto/allspark.proto index effab414..0fd2976b 100755 --- a/csrc/proto/allspark.proto +++ b/csrc/proto/allspark.proto @@ -104,6 +104,7 @@ message ConfigProto { int32 hidden_size = 19; int32 num_experts = 20; int32 num_experts_per_tok = 21; + int32 intermediate_size = 22; } message BuildVersion { diff --git a/csrc/proto/allspark_service.proto b/csrc/proto/allspark_service.proto index 2d8352e8..74588992 100644 --- a/csrc/proto/allspark_service.proto +++ b/csrc/proto/allspark_service.proto @@ -90,7 +90,7 @@ enum AS_STATUS { ALLSPARK_ILLEGAL_REQUEST_ID = 10; //没有找到有效的request_id ALLSPARK_CACHE_MEMORY_OUT = 11; ALLSPARK_REQUEST_DENIED = 12; // 停服务状态,拒绝服务 - ALLSPARK_LORA_NOT_LOADED = 13; // 用户指定的lora没加载 + //ALLSPARK_LORA_NOT_LOADED = 13; // 用户指定的lora没加载, 已经停用 ALLSPARK_STREAMING = 200; // 流式返回 } @@ -213,7 +213,7 @@ message ModelStructConfig { string weights_path = 3; string compute_unit = 4; string matmul_precision = 5; - bool is_lora = 6; + bool is_lora_cfg = 6; int64 swap_threshold = 7; int32 engine_max_length = 8; int32 engine_max_batch = 9; diff --git a/csrc/runtime/cache/prefix_cache_manager_private.cpp b/csrc/runtime/cache/prefix_cache_manager_private.cpp index 6fac2a88..bf8117f2 100644 --- a/csrc/runtime/cache/prefix_cache_manager_private.cpp +++ b/csrc/runtime/cache/prefix_cache_manager_private.cpp @@ -47,8 +47,8 @@ void PrefixCacheManager::init_cpu_union(const int nranks, size_t cache_mem_bytes = (size_t)(available_memory * ratio); size_t frame_size = 
gpu_union_->span_manager->GetSpanSize(); size_t frame_per_node = - kv_cnt * - layer_num_; // number of frames needed per FIXED_SPAN_SIZE tokens + kv_cnt * layer_num_; // number of frames needed per + // SpanCacheConfig::span_size tokens size_t max_frame_num = 0; int capacity = 0; diff --git a/csrc/runtime/state/model_control_state.cpp b/csrc/runtime/state/model_control_state.cpp index 64b0d323..3489ff4d 100644 --- a/csrc/runtime/state/model_control_state.cpp +++ b/csrc/runtime/state/model_control_state.cpp @@ -2,10 +2,6 @@ * Copyright (c) Alibaba, Inc. and its affiliates. * @file model_control_state.cpp */ -// -// Created by jiejing.zjj on 4/28/24. -// - #include #include @@ -13,11 +9,10 @@ namespace allspark { void ModelControlState::StopLoop() { LOG(INFO) << "[" << model_name << " ] " << " Model Loop going to stop..."; - std::unique_lock lock(*(this->lock)); if (loop_thread_) { loop_thread_->join(); loop_thread_.reset(); model_stopped = true; } } -} // namespace allspark \ No newline at end of file +} // namespace allspark diff --git a/csrc/runtime/weight/weight_manager.cpp b/csrc/runtime/weight/weight_manager.cpp index 403f7dec..b17f87c3 100644 --- a/csrc/runtime/weight/weight_manager.cpp +++ b/csrc/runtime/weight/weight_manager.cpp @@ -159,6 +159,16 @@ AsStatus WeightManagerImpl::LoadWeightForModel( weight_info.size_bytes = SizeofType(info.dtype) * info.shape.Count(); weight_info.weight_offset = ftell(fp); weight_info.info = info; + + // before proceeding, we need validate the weight according some rules + AsStatus ret = + ValidateWeight(weight_handler, weight_info, target_device_ctx); + if (ret != AsStatus::ALLSPARK_SUCCESS) { + LOG(ERROR) << "ValidateWeight failed for tensor " << weight_info.name + << ", fail reason=" << int(ret); + return ret; + } + int fd = fileno(fp); struct stat sb; fstat(fd, &sb); @@ -275,7 +285,7 @@ AsStatus WeightManagerImpl::LoadWeightForModel( // Note: because some aglinment op will modify weight's size, so we can // only store the 
tensor tensor after load, otherwise some shape logic // will be wrong. - if (weight_handler->GetModelConfig().is_lora) { + if (weight_handler->GetModelConfig().is_lora_cfg) { swap_status_[weight_handler][rank_info] = SwapStatus::SwapInit; LOG(INFO) << "finish load lora " << weight_handler->GetModelConfig().model_name << " for rank " @@ -323,7 +333,6 @@ std::shared_ptr WeightManagerImpl::GetWeightTensor( std::shared_ptr& handler, RankInfo& rank_info, const std::string& name) { rw_read_lock lk(lock_, "GetWeightTensor"); - if (!handler_is_avalibile(handler) || !weight_on_rank_is_avalibile(handler, rank_info)) { LOG(ERROR) << "Try to find weight for non exist rank or handler " @@ -353,7 +362,6 @@ std::shared_ptr WeightManagerImpl::GetWeightTensor( // DLOG(INFO) << "Weight MD5: " << name << " " // << weight_map->at(name)->ToString(); #endif - return weight_map->at(name); } diff --git a/csrc/runtime/weight/weight_manager.h b/csrc/runtime/weight/weight_manager.h index 27ceae76..1ed59688 100644 --- a/csrc/runtime/weight/weight_manager.h +++ b/csrc/runtime/weight/weight_manager.h @@ -20,7 +20,6 @@ namespace allspark { class RankInfo; class TransformerProto; -// class ModelWeightHandler; enum class SwapStatus { SwapInit = 0, // initial state. 
@@ -185,6 +184,12 @@ class WeightManager { std::shared_ptr weight_handler); virtual void RegisterWeightEventListener(WeightEventCallback callback); + virtual AsStatus ValidateWeight( + std::shared_ptr& weight_handler, + const ModelWeightAccessInfo& weight_info, + const DeviceContext& device_ctx) { + return AsStatus::ALLSPARK_SUCCESS; + } protected: WeightManager(); diff --git a/csrc/runtime/weight/weight_manager_lora.cpp b/csrc/runtime/weight/weight_manager_lora.cpp index 2208adc9..7bf10861 100644 --- a/csrc/runtime/weight/weight_manager_lora.cpp +++ b/csrc/runtime/weight/weight_manager_lora.cpp @@ -5,53 +5,54 @@ #include "weight_manager_lora.h" +#include + namespace allspark { -std::shared_ptr LoraManager::Create(const RankInfo& rank_info) { - return std::make_shared(rank_info); +std::shared_ptr LoraManager::Create(const int lora_max_num, + const RankInfo& rank_info) { + return std::make_shared(lora_max_num, rank_info); } // currently, only support 1 base-model for each LoraManager std::shared_ptr& LoraManager::RegisterLora( const AsModelConfig& lora_config) { - DLOG(INFO) << "enter LoraManager::RegisterLora " << lora_config.model_name; rw_write_lock lk(lora_lock_, "RegisterLora"); // 每个AsModel有自己的LoraManager, // 无需锁 - size_t new_id = weight_handler_store_.size(); + // 寻找空闲slot + size_t i = 0; + for (i = 0; i < lora_max_num_; i++) { + if (weight_handler_store_[i] == nullptr) break; + } + size_t new_id = i; + AS_ENFORCE(new_id < lora_max_num_); auto fake_proto = std::make_shared(); - weight_handler_store_.emplace_back(std::make_shared( + auto weight_handle_ptr = std::make_shared( new_id, lora_config, - fake_proto)); // nullptr: lora只有权重,没有对应的子图 + fake_proto); // nullptr: lora只有权重,没有对应的子图 + weight_handler_store_[new_id] = weight_handle_ptr; // 复用 handle slot lora_name_idx_map_[lora_config.model_name] = new_id; - LOG(INFO) << "LoraManager::RegisterLora " << lora_config.model_name << " " - << weight_handler_store_.back() << " done!"; - return 
weight_handler_store_.back(); + DLOG(INFO) << "LoraManager::RegisterLora " << lora_config.model_name + << ", handle ptr=" << weight_handler_store_[new_id].get() + << " done!"; + return weight_handler_store_[new_id]; } void LoraManager::UnRegisterLora(const std::string& lora_name) { rw_write_lock lk(lora_lock_, "UnRegisterLora"); // 每个AsModel有自己的LoraManager, // 无需锁 - DLOG(INFO) << "enter LoraManager::UnRegisterLora " << lora_name; auto idx = lora_name_idx_map_.at(lora_name); auto& lora_weight_handle = weight_handler_store_.at(idx); - SwapOutWeight(lora_weight_handle, rank_info_); // free cuda mem + AS_ENFORCE(handler_is_avalibile(lora_weight_handle)); + LOG(INFO) << "UnRegisterLora " << lora_name + << " handle id=" << lora_weight_handle->GetId(); + weight_storage_[lora_weight_handle].erase(rank_info_); // 释放cuda mem + // 让该lora handle彻底消失 + weight_storage_.erase(lora_weight_handle); weight_handler_store_[idx] = nullptr; lora_name_idx_map_.erase(lora_name); - - // 因为weight_handler_store_设计成vector, 删掉某个lora后, - // weight_handler_store_概念上会产生"空洞",需要重新compact: compact - // weight_handler_store_ - std::vector> tmp_weight_handler_store; - for (auto& item : lora_name_idx_map_) { - auto& lora_name = item.first; - auto& idx = item.second; - tmp_weight_handler_store.emplace_back(weight_handler_store_[idx]); - idx = tmp_weight_handler_store.size() - 1; - } - // std::swap(weight_handler_store_, tmp_weight_handler_store); - weight_handler_store_ = tmp_weight_handler_store; LOG(INFO) << "LoraManager::UnRegisterLora " << lora_name << " done!"; } @@ -59,6 +60,7 @@ std::shared_ptr LoraManager::GetLoraTensorByName( const std::string& lora_name, const std::string& tensor_name) { rw_write_lock lk(lora_lock_, "GetLoraTensorByName"); // 每个AsModel有自己的LoraManager, + AS_ENFORCE(lora_name_idx_map_.count(lora_name) > 0); auto& lora_weight_handle = weight_handler_store_.at(lora_name_idx_map_.at(lora_name)); return GetWeightTensor(lora_weight_handle, rank_info_, tensor_name); @@ -87,11 +89,9 @@ 
bool LoraManager::HasLoraBias(const std::string& lora_name, const std::string& lora_weight_name) { rw_write_lock lk(lora_lock_, "HasLoraBias"); // 每个AsModel有自己的LoraManager, - DLOG(INFO) << "enter LoraManager::HasLoraBias " << lora_name << " " - << lora_weight_name; + AS_ENFORCE(lora_name_idx_map_.count(lora_name) > 0); auto& lora_weight_handle = weight_handler_store_.at(lora_name_idx_map_.at(lora_name)); - DLOG(INFO) << "gethandler:" << lora_weight_handle; std::string lora_bias_name = lora_weight_name; std::string weight_suffix = ".weight"; std::string bias_suffix = ".bias"; @@ -127,18 +127,48 @@ std::shared_ptr LoraManager::GetHandleByName( const std::string& lora_name) { rw_write_lock lk(lora_lock_, "GetHandleByName"); // 每个AsModel有自己的LoraManager, + AS_ENFORCE(lora_name_idx_map_.count(lora_name) > 0); return weight_handler_store_.at(lora_name_idx_map_.at(lora_name)); } +AsStatus LoraManager::ValidateWeight( + std::shared_ptr& weight_handler, + const ModelWeightAccessInfo& weight_info, const DeviceContext& device_ctx) { + auto inference_dtype = device_ctx.GetDtype(); + auto lora_dtype = weight_info.info.dtype; + if (lora_dtype != inference_dtype) { + LOG(ERROR) << "lora " << weight_handler->GetModelConfig().model_name + << " dtype mismatch!" 
+ << " dtype for inference is " << DataType_Name(inference_dtype) + << " lora's dtype is " << DataType_Name(lora_dtype); + return AsStatus::ALLSPARK_PARAM_ERROR; + } + if (weight_info.name.find(".lora_B") != std::string::npos && + weight_info.info.shape[0] > + weight_handler->GetModelConfig().lora_max_rank) + return AsStatus::ALLSPARK_LORA_RANK_EXCEED_LIMIT_ERROR; + return AsStatus::ALLSPARK_SUCCESS; +} + void LoraManager::PrintLoras() { LOG(INFO) << "loraStorage size=" << weight_handler_store_.size() << " " << lora_name_idx_map_.size(); + LOG(INFO) << "loras in map:"; for (auto& item : lora_name_idx_map_) { LOG(INFO) << item.first << " : " << item.second << ", " - << weight_handler_store_.at(item.second)->GetModelConfig().model_name; + << weight_handler_store_.at(item.second)->GetModelConfig().model_name + << ", addr=" << &(weight_handler_store_.at(item.second)) + << ", ptr=" << weight_handler_store_.at(item.second).get(); } + LOG(INFO) << "loras in vector:"; + for (int i = 0; i < weight_handler_store_.size(); i++) { + if (weight_handler_store_.at(i) == nullptr) continue; + LOG(INFO) << i << " : " + << weight_handler_store_.at(i)->GetModelConfig().model_name; + } + LOG(INFO) << "-----lora print done"; } -} // namespace allspark \ No newline at end of file +} // namespace allspark diff --git a/csrc/runtime/weight/weight_manager_lora.h b/csrc/runtime/weight/weight_manager_lora.h index c942c8e2..ea40db84 100644 --- a/csrc/runtime/weight/weight_manager_lora.h +++ b/csrc/runtime/weight/weight_manager_lora.h @@ -11,8 +11,24 @@ namespace allspark { class LoraManager : public WeightManagerImpl { public: LoraManager() = delete; - LoraManager(const RankInfo& rank_info) : rank_info_(rank_info) {} - static std::shared_ptr Create(const RankInfo& rank_info); + LoraManager(const int lora_max_num, const RankInfo& rank_info) + : lora_max_num_(lora_max_num), rank_info_(rank_info) { + weight_handler_store_.resize(lora_max_num_, nullptr); + } + + // 为了安全,lora + // 
禁止使用swap功能。必须由外部调用者来显式地load_lora和unload_lora + void SwapInWeight(std::shared_ptr& handler, + RankInfo info) { + throw AsException("ALLSPARK_INVALID_CALL"); + } + void SwapOutWeight(std::shared_ptr& handler, + RankInfo info) { + throw AsException("ALLSPARK_INVALID_CALL"); + } + + static std::shared_ptr Create(const int lora_max_num, + const RankInfo& rank_info); std::shared_ptr& RegisterLora( const AsModelConfig& lora_config); void UnRegisterLora(const std::string& lora_name); @@ -26,6 +42,7 @@ class LoraManager : public WeightManagerImpl { const std::string& lora_weight_name); std::shared_ptr GetHandleByName( const std::string& lora_name); + int GetNumLoras() const { return lora_name_idx_map_.size(); } bool IsEmpty() const { return lora_name_idx_map_.size() == 0; } bool IsLoraExists(const std::string& lora_name) const { return lora_name_idx_map_.count(lora_name) != 0; @@ -34,13 +51,17 @@ class LoraManager : public WeightManagerImpl { const std::unordered_map& GetLoraNameIdxMap() { return lora_name_idx_map_; } - + virtual AsStatus ValidateWeight( + std::shared_ptr& weight_handler, + const ModelWeightAccessInfo& weight_info, + const DeviceContext& device_ctx); void PrintLoras(); private: std::unordered_map lora_name_idx_map_; mutable std::shared_timed_mutex lora_lock_; RankInfo rank_info_; // 与AsModel的RankInfo保持一致 + int lora_max_num_; }; } // namespace allspark diff --git a/csrc/service/allspark_client.cpp b/csrc/service/allspark_client.cpp index d96acb75..9b2dbd63 100644 --- a/csrc/service/allspark_client.cpp +++ b/csrc/service/allspark_client.cpp @@ -27,7 +27,8 @@ AsModelConfig::AsModelConfig( bool enable_prefix_cache, int prefix_cache_ttl, AsMHAPrefill in_prefill_mode, AsCacheMode in_cache_mode, AsEvictionStrategy in_eviction_strategy, - AsSchedulingStrategy in_scheduling_strategy, bool enable_sparsity_matmul) + AsSchedulingStrategy in_scheduling_strategy, bool enable_sparsity_matmul, + int lora_max_rank, int lora_max_num) : model_name(std::move(in_model_name)), 
model_path(std::move(in_model_path)), weights_path(std::move(in_weights_path)), @@ -48,6 +49,8 @@ AsModelConfig::AsModelConfig( prefill_mode(in_prefill_mode), eviction_strategy(in_eviction_strategy), enable_sparsity_matmul(enable_sparsity_matmul), + lora_max_rank(lora_max_rank), + lora_max_num(lora_max_num), text_graph(in_text_graph) {} static std::vector g_errors; @@ -176,12 +179,12 @@ AsStatus AsClientEngine::SyncRequest(const char* model_name, AsStatus AsClientEngine::LoadLoraByName(const char* model_name, const char* lora_name) { - return AsStatus::ALLSPARK_LORA_NOT_LOADED; + return AsStatus::ALLSPARK_LORA_NOT_FOUND; } AsStatus AsClientEngine::UnloadLoraByName(const char* model_name, const char* lora_name) { - return AsStatus::ALLSPARK_LORA_NOT_LOADED; + return AsStatus::ALLSPARK_LORA_NOT_FOUND; } AsEngineStat AsClientEngine::GetAsEngineStat(const char* model_name) { diff --git a/csrc/service/allspark_client_impl.cpp b/csrc/service/allspark_client_impl.cpp index c5f41db4..9e1e7368 100644 --- a/csrc/service/allspark_client_impl.cpp +++ b/csrc/service/allspark_client_impl.cpp @@ -6,13 +6,13 @@ #include "allspark_client_impl.h" #include +#include #include #include #include #include #include -#include #include "allspark_service_helper.h" #include "allspark_service_parallel.h" @@ -578,4 +578,4 @@ AsClientContext::AsClientContext() : context_size_(0) { lauch_success_ = LaunchService() == AsStatus::ALLSPARK_SUCCESS; } AsClientContext::~AsClientContext() { ShutdownService(); } -} // namespace allspark +} // namespace allspark \ No newline at end of file diff --git a/csrc/service/allspark_service_helper.h b/csrc/service/allspark_service_helper.h index 48c5cacb..7867040f 100644 --- a/csrc/service/allspark_service_helper.h +++ b/csrc/service/allspark_service_helper.h @@ -292,9 +292,7 @@ void makeInputCfgAsFromProto( PROTO_CONFIG(gen_cfg_proto, max_length, gen_cfg); PROTO_CONFIG(gen_cfg_proto, no_repeat_ngram_size, gen_cfg); PROTO_CONFIG(gen_cfg_proto, eos_token_id, 
gen_cfg); - gen_cfg.user_request_id = gen_cfg_proto.uuid(); - PROTO_CONFIG(gen_cfg_proto, presence_penalty, gen_cfg); PROTO_CONFIG(gen_cfg_proto, suppress_repetition_in_generation, gen_cfg); PROTO_CONFIG(gen_cfg_proto, input_len, gen_cfg); @@ -350,7 +348,7 @@ void makeModelStructConfigProtoFromAs( model_struct_proto.set_weights_path(model_config.weights_path); model_struct_proto.set_compute_unit(model_config.compute_unit); model_struct_proto.set_matmul_precision(model_config.matmul_precision); - model_struct_proto.set_is_lora(model_config.is_lora); + model_struct_proto.set_is_lora_cfg(model_config.is_lora_cfg); model_struct_proto.set_swap_threshold(model_config.swap_threshold); model_struct_proto.set_engine_max_length(model_config.engine_max_length); model_struct_proto.set_engine_max_batch(model_config.engine_max_batch); @@ -440,10 +438,7 @@ void makeGenCfgProtoFromAs(allspark_service::GenerateConfig& gen_cfg_proto, PROTO_CONFIG(gen_cfg_proto, max_length, gen_cfg); PROTO_CONFIG(gen_cfg_proto, no_repeat_ngram_size, gen_cfg); PROTO_CONFIG(gen_cfg_proto, eos_token_id, gen_cfg); - gen_cfg_proto.set_uuid(gen_cfg.user_request_id); - -// PROTO_CONFIG(gen_cfg_proto, uuid, gen_cfg); PROTO_CONFIG(gen_cfg_proto, presence_penalty, gen_cfg); PROTO_CONFIG(gen_cfg_proto, suppress_repetition_in_generation, gen_cfg); PROTO_CONFIG(gen_cfg_proto, input_len, gen_cfg); @@ -666,4 +661,4 @@ void makeAsEngineStatProtoFromAs( } } // namespace allspark_service -} // namespace allspark \ No newline at end of file +} // namespace allspark diff --git a/csrc/utility/blockingconcurrentqueue.h b/csrc/utility/blockingconcurrentqueue.h new file mode 100644 index 00000000..761f4dae --- /dev/null +++ b/csrc/utility/blockingconcurrentqueue.h @@ -0,0 +1,586 @@ +// Provides an efficient blocking version of moodycamel::ConcurrentQueue. +// ©2015-2020 Cameron Desrochers. Distributed under the terms of the simplified +// BSD license, available at the top of concurrentqueue.h. 
+// Also dual-licensed under the Boost Software License (see LICENSE.md) +// Uses Jeff Preshing's semaphore implementation (under the terms of its +// separate zlib license, see lightweightsemaphore.h). + +// clang-format off + +#pragma once + +#include "concurrentqueue.h" +#include "lightweightsemaphore.h" + +#include +#include +#include +#include +#include + +namespace moodycamel +{ +// This is a blocking version of the queue. It has an almost identical interface to +// the normal non-blocking version, with the addition of various wait_dequeue() methods +// and the removal of producer-specific dequeue methods. +template +class BlockingConcurrentQueue +{ +private: + typedef ::moodycamel::ConcurrentQueue ConcurrentQueue; + typedef ::moodycamel::LightweightSemaphore LightweightSemaphore; + +public: + typedef typename ConcurrentQueue::producer_token_t producer_token_t; + typedef typename ConcurrentQueue::consumer_token_t consumer_token_t; + + typedef typename ConcurrentQueue::index_t index_t; + typedef typename ConcurrentQueue::size_t size_t; + typedef typename std::make_signed::type ssize_t; + + static const size_t BLOCK_SIZE = ConcurrentQueue::BLOCK_SIZE; + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = ConcurrentQueue::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD; + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::EXPLICIT_INITIAL_INDEX_SIZE; + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::IMPLICIT_INITIAL_INDEX_SIZE; + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = ConcurrentQueue::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = ConcurrentQueue::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE; + static const size_t MAX_SUBQUEUE_SIZE = ConcurrentQueue::MAX_SUBQUEUE_SIZE; + +public: + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // 
allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be allocated + // up-front, which means only a single producer will be able to enqueue elements + // without an extra allocation -- blocks aren't shared between producers). + // This method is not thread safe -- it is up to the user to ensure that the + // queue is fully constructed before it starts being used by other threads (this + // includes making the memory effects of construction visible, possibly with a + // memory barrier). + explicit BlockingConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE) + : inner(capacity), sema(create(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy) + { + assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); + if (!sema) { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + BlockingConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) + : inner(minCapacity, maxExplicitProducers, maxImplicitProducers), sema(create(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy) + { + assert(reinterpret_cast((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member"); + if (!sema) { + MOODYCAMEL_THROW(std::bad_alloc()); + } + } + + // Disable copying and copy assignment + BlockingConcurrentQueue(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + BlockingConcurrentQueue& operator=(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. 
+ // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). + BlockingConcurrentQueue(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + : inner(std::move(other.inner)), sema(std::move(other.sema)) + { } + + inline BlockingConcurrentQueue& operator=(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT + { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). + inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT + { + swap_internal(other); + } + +private: + BlockingConcurrentQueue& swap_internal(BlockingConcurrentQueue& other) + { + if (this == &other) { + return *this; + } + + inner.swap(other.inner); + sema.swap(other.sema); + return *this; + } + +public: + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const& item) + { + if ((details::likely)(inner.enqueue(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, + // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. 
+ inline bool enqueue(T&& item) + { + if ((details::likely)(inner.enqueue(std::move(item)))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T const& item) + { + if ((details::likely)(inner.enqueue(token, item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const& token, T&& item) + { + if ((details::likely)(inner.enqueue(token, std::move(item)))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved instead of copied. + // Thread-safe. + template + inline bool enqueue_bulk(It itemFirst, size_t count) + { + if ((details::likely)(inner.enqueue_bulk(std::forward(itemFirst), count))) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. 
+ template + inline bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + if ((details::likely)(inner.enqueue_bulk(token, std::forward(itemFirst), count))) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. + inline bool try_enqueue(T const& item) + { + if (inner.try_enqueue(item)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T&& item) + { + if (inner.try_enqueue(std::move(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T const& item) + { + if (inner.try_enqueue(token, item)) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues a single item (by moving it, if possible) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const& token, T&& item) + { + if (inner.try_enqueue(token, std::move(item))) { + sema->signal(); + return true; + } + return false; + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). 
+ // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(std::forward(itemFirst), count)) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + inline bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) + { + if (inner.try_enqueue_bulk(token, std::forward(itemFirst), count)) { + sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count); + return true; + } + return false; + } + + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue(U& item) + { + if (sema->tryWait()) { + while (!inner.try_dequeue(item)) { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue(consumer_token_t& token, U& item) + { + if (sema->tryWait()) { + while (!inner.try_dequeue(token, item)) { + continue; + } + return true; + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. 
+ // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + + + // Blocks the current thread until there's something to dequeue, then + // dequeues it. + // Never allocates. Thread-safe. + template + inline void wait_dequeue(U& item) + { + while (!sema->wait()) { + continue; + } + while (!inner.try_dequeue(item)) { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. 
+ template + inline bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) { + return false; + } + while (!inner.try_dequeue(item)) { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed(item, std::chrono::duration_cast(timeout).count()); + } + + // Blocks the current thread until there's something to dequeue, then + // dequeues it using an explicit consumer token. + // Never allocates. Thread-safe. + template + inline void wait_dequeue(consumer_token_t& token, U& item) + { + while (!sema->wait()) { + continue; + } + while (!inner.try_dequeue(token, item)) { + continue; + } + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout (specified in microseconds) expires. Returns false + // without setting `item` if the timeout expires, otherwise assigns + // to `item` and returns true. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue. + // Never allocates. Thread-safe. + template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::int64_t timeout_usecs) + { + if (!sema->wait(timeout_usecs)) { + return false; + } + while (!inner.try_dequeue(token, item)) { + continue; + } + return true; + } + + // Blocks the current thread until either there's something to dequeue + // or the timeout expires. Returns false without setting `item` if the + // timeout expires, otherwise assigns to `item` and returns true. + // Never allocates. Thread-safe. 
+ template + inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::chrono::duration const& timeout) + { + return wait_dequeue_timed(token, item, std::chrono::duration_cast(timeout).count()); + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk(It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::int64_t timeout_usecs) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); + while (count != max) { + count += inner.template try_dequeue_bulk(itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. 
+ template + inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed(itemFirst, max, std::chrono::duration_cast(timeout).count()); + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which will + // always be at least one (this method blocks until the queue + // is non-empty) and at most max. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Using a negative timeout indicates an indefinite timeout, + // and is thus functionally equivalent to calling wait_dequeue_bulk. + // Never allocates. Thread-safe. + template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::int64_t timeout_usecs) + { + size_t count = 0; + max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs); + while (count != max) { + count += inner.template try_dequeue_bulk(token, itemFirst, max - count); + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit consumer token. + // Returns the number of items actually dequeued, which can + // be 0 if the timeout expires while waiting for elements, + // and at most max. + // Never allocates. Thread-safe. 
+ template + inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::chrono::duration const& timeout) + { + return wait_dequeue_bulk_timed(token, itemFirst, max, std::chrono::duration_cast(timeout).count()); + } + + + // Returns an estimate of the total number of elements currently in the queue. This + // estimate is only accurate if the queue has completely stabilized before it is called + // (i.e. all enqueue and dequeue operations have completed and their memory effects are + // visible on the calling thread, and no further operations start while this method is + // being called). + // Thread-safe. + inline size_t size_approx() const + { + return (size_t)sema->availableApprox(); + } + + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static constexpr bool is_lock_free() + { + return ConcurrentQueue::is_lock_free(); + } + + +private: + template + static inline U* create(A1&& a1, A2&& a2) + { + void* p = (Traits::malloc)(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1), std::forward(a2)) : nullptr; + } + + template + static inline void destroy(U* p) + { + if (p != nullptr) { + p->~U(); + } + (Traits::free)(p); + } + +private: + ConcurrentQueue inner; + std::unique_ptr sema; +}; + + +template +inline void swap(BlockingConcurrentQueue& a, BlockingConcurrentQueue& b) MOODYCAMEL_NOEXCEPT +{ + a.swap(b); +} + +} // end namespace moodycamel + +// clang-format on diff --git a/csrc/utility/lightweightsemaphore.h b/csrc/utility/lightweightsemaphore.h new file mode 100644 index 00000000..15266410 --- /dev/null +++ b/csrc/utility/lightweightsemaphore.h @@ -0,0 +1,432 @@ +// clang-format off + +// Provides an efficient implementation of a semaphore (LightweightSemaphore). 
+// This is an extension of Jeff Preshing's semaphore implementation (licensed +// under the terms of its separate zlib license) that has been adapted and +// extended by Cameron Desrochers. + + +#pragma once + +#include // For std::size_t +#include +#include // For std::make_signed + +#if defined(_WIN32) +// Avoid including windows.h in a header; we only need a handful of +// items, so we'll redeclare them here (this is relatively safe since +// the API generally has to remain stable between Windows versions). +// I know this is an ugly hack but it still beats polluting the global +// namespace with thousands of generic names or adding a .cpp for nothing. +extern "C" { + struct _SECURITY_ATTRIBUTES; + __declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName); + __declspec(dllimport) int __stdcall CloseHandle(void* hObject); + __declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds); + __declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount); +} +#elif defined(__MACH__) +#include +#elif defined(__MVS__) +#include +#elif defined(__unix__) +#include + +#if defined(__GLIBC_PREREQ) && defined(_GNU_SOURCE) +#if __GLIBC_PREREQ(2,30) +#define MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC +#endif +#endif +#endif + +namespace moodycamel +{ +namespace details +{ + +// Code in the mpmc_sema namespace below is an adaptation of Jeff Preshing's +// portable + lightweight semaphore implementations, originally from +// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h +// LICENSE: +// Copyright (c) 2015 Jeff Preshing +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software.
+// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgement in the product documentation would be +// appreciated but is not required. +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original software. +// 3. This notice may not be removed or altered from any source distribution. +#if defined(_WIN32) +class Semaphore +{ +private: + void* m_hSema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + const long maxLong = 0x7fffffff; + m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr); + assert(m_hSema); + } + + ~Semaphore() + { + CloseHandle(m_hSema); + } + + bool wait() + { + const unsigned long infinite = 0xffffffff; + return WaitForSingleObject(m_hSema, infinite) == 0; + } + + bool try_wait() + { + return WaitForSingleObject(m_hSema, 0) == 0; + } + + bool timed_wait(std::uint64_t usecs) + { + return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) == 0; + } + + void signal(int count = 1) + { + while (!ReleaseSemaphore(m_hSema, count, nullptr)); + } +}; +#elif defined(__MACH__) +//--------------------------------------------------------- +// Semaphore (Apple iOS and OSX) +// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html +//--------------------------------------------------------- +class Semaphore +{ +private: + semaphore_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const 
Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + kern_return_t rc = semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount); + assert(rc == KERN_SUCCESS); + (void)rc; + } + + ~Semaphore() + { + semaphore_destroy(mach_task_self(), m_sema); + } + + bool wait() + { + return semaphore_wait(m_sema) == KERN_SUCCESS; + } + + bool try_wait() + { + return timed_wait(0); + } + + bool timed_wait(std::uint64_t timeout_usecs) + { + mach_timespec_t ts; + ts.tv_sec = static_cast(timeout_usecs / 1000000); + ts.tv_nsec = static_cast((timeout_usecs % 1000000) * 1000); + + // added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html + kern_return_t rc = semaphore_timedwait(m_sema, ts); + return rc == KERN_SUCCESS; + } + + void signal() + { + while (semaphore_signal(m_sema) != KERN_SUCCESS); + } + + void signal(int count) + { + while (count-- > 0) + { + while (semaphore_signal(m_sema) != KERN_SUCCESS); + } + } +}; +#elif defined(__unix__) || defined(__MVS__) +//--------------------------------------------------------- +// Semaphore (POSIX, Linux, zOS) +//--------------------------------------------------------- +class Semaphore +{ +private: + sem_t m_sema; + + Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION; + +public: + Semaphore(int initialCount = 0) + { + assert(initialCount >= 0); + int rc = sem_init(&m_sema, 0, static_cast(initialCount)); + assert(rc == 0); + (void)rc; + } + + ~Semaphore() + { + sem_destroy(&m_sema); + } + + bool wait() + { + // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error + int rc; + do { + rc = sem_wait(&m_sema); + } while (rc == -1 && errno == EINTR); + return rc == 0; + } + + bool try_wait() + { + int rc; + do { + rc = sem_trywait(&m_sema); + } 
while (rc == -1 && errno == EINTR); + return rc == 0; + } + + bool timed_wait(std::uint64_t usecs) + { + struct timespec ts; + const int usecs_in_1_sec = 1000000; + const int nsecs_in_1_sec = 1000000000; +#ifdef MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC + clock_gettime(CLOCK_MONOTONIC, &ts); +#else + clock_gettime(CLOCK_REALTIME, &ts); +#endif + ts.tv_sec += (time_t)(usecs / usecs_in_1_sec); + ts.tv_nsec += (long)(usecs % usecs_in_1_sec) * 1000; + // sem_timedwait bombs if you have more than 1e9 in tv_nsec + // so we have to clean things up before passing it in + if (ts.tv_nsec >= nsecs_in_1_sec) { + ts.tv_nsec -= nsecs_in_1_sec; + ++ts.tv_sec; + } + + int rc; + do { +#ifdef MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC + rc = sem_clockwait(&m_sema, CLOCK_MONOTONIC, &ts); +#else + rc = sem_timedwait(&m_sema, &ts); +#endif + } while (rc == -1 && errno == EINTR); + return rc == 0; + } + + void signal() + { + while (sem_post(&m_sema) == -1); + } + + void signal(int count) + { + while (count-- > 0) + { + while (sem_post(&m_sema) == -1); + } + } +}; +#else +#error Unsupported platform! (No semaphore wrapper available) +#endif + +} // end namespace details + + +//--------------------------------------------------------- +// LightweightSemaphore +//--------------------------------------------------------- +class LightweightSemaphore +{ +public: + typedef std::make_signed::type ssize_t; + +private: + std::atomic m_count; + details::Semaphore m_sema; + int m_maxSpins; + + bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) + { + ssize_t oldCount; + int spin = m_maxSpins; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) + return true; + std::atomic_signal_fence(std::memory_order_acquire); // Prevent the compiler from collapsing the loop. 
+ } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount > 0) + return true; + if (timeout_usecs < 0) + { + if (m_sema.wait()) + return true; + } + if (timeout_usecs > 0 && m_sema.timed_wait((std::uint64_t)timeout_usecs)) + return true; + // At this point, we've timed out waiting for the semaphore, but the + // count is still decremented indicating we may still be waiting on + // it. So we have to re-adjust the count, but only if the semaphore + // wasn't signaled enough times for us too since then. If it was, we + // need to release the semaphore too. + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + return true; + if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) + return false; + } + } + + ssize_t waitManyWithPartialSpinning(ssize_t max, std::int64_t timeout_usecs = -1) + { + assert(max > 0); + ssize_t oldCount; + int spin = m_maxSpins; + while (--spin >= 0) + { + oldCount = m_count.load(std::memory_order_relaxed); + if (oldCount > 0) + { + ssize_t newCount = oldCount > max ? 
oldCount - max : 0; + if (m_count.compare_exchange_strong(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) + return oldCount - newCount; + } + std::atomic_signal_fence(std::memory_order_acquire); + } + oldCount = m_count.fetch_sub(1, std::memory_order_acquire); + if (oldCount <= 0) + { + if ((timeout_usecs == 0) || (timeout_usecs < 0 && !m_sema.wait()) || (timeout_usecs > 0 && !m_sema.timed_wait((std::uint64_t)timeout_usecs))) + { + while (true) + { + oldCount = m_count.load(std::memory_order_acquire); + if (oldCount >= 0 && m_sema.try_wait()) + break; + if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed)) + return 0; + } + } + } + if (max > 1) + return 1 + tryWaitMany(max - 1); + return 1; + } + +public: + LightweightSemaphore(ssize_t initialCount = 0, int maxSpins = 10000) : m_count(initialCount), m_maxSpins(maxSpins) + { + assert(initialCount >= 0); + assert(maxSpins >= 0); + } + + bool tryWait() + { + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + if (m_count.compare_exchange_weak(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed)) + return true; + } + return false; + } + + bool wait() + { + return tryWait() || waitWithPartialSpinning(); + } + + bool wait(std::int64_t timeout_usecs) + { + return tryWait() || waitWithPartialSpinning(timeout_usecs); + } + + // Acquires between 0 and (greedily) max, inclusive + ssize_t tryWaitMany(ssize_t max) + { + assert(max >= 0); + ssize_t oldCount = m_count.load(std::memory_order_relaxed); + while (oldCount > 0) + { + ssize_t newCount = oldCount > max ? 
oldCount - max : 0; + if (m_count.compare_exchange_weak(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed)) + return oldCount - newCount; + } + return 0; + } + + // Acquires at least one, and (greedily) at most max + ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs) + { + assert(max >= 0); + ssize_t result = tryWaitMany(max); + if (result == 0 && max > 0) + result = waitManyWithPartialSpinning(max, timeout_usecs); + return result; + } + + ssize_t waitMany(ssize_t max) + { + ssize_t result = waitMany(max, -1); + assert(result > 0); + return result; + } + + void signal(ssize_t count = 1) + { + assert(count >= 0); + ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release); + ssize_t toRelease = -oldCount < count ? -oldCount : count; + if (toRelease > 0) + { + m_sema.signal((int)toRelease); + } + } + + std::size_t availableApprox() const + { + ssize_t count = m_count.load(std::memory_order_relaxed); + return count > 0 ? static_cast(count) : 0; + } +}; + +} // end namespace moodycamel + +// clang-format on diff --git a/csrc/utility/string_util.cpp b/csrc/utility/string_util.cpp index adc22055..bb147d85 100644 --- a/csrc/utility/string_util.cpp +++ b/csrc/utility/string_util.cpp @@ -483,5 +483,30 @@ std::string StringUtil::ToUpper(const std::string& s) { return rc; } +std::string StringUtil::RemoveLayerNumber(const std::string& input) { + std::string result; + const std::string pattern = ".layer."; + std::size_t pos = input.find(pattern); + + if (pos != std::string::npos) { + // 复制pattern前部分 + result = input.substr(0, pos); + // 找到 pattern 后的部分 + std::size_t start = pos + std::string(pattern).length(); + std::size_t end = input.find('.', start); + + // 复制 pattern后 部分 + if (end != std::string::npos) { + result += input.substr(end); + } else { + result += input.substr(start); // 如果没有点,复制到末尾 + } + } else { + // 如果没有pattern,返回原始字符串 + result = input; + } + return result; +} + } // namespace util } // namespace allspark diff --git 
a/csrc/utility/string_util.h b/csrc/utility/string_util.h index 94a76742..269bbd51 100644 --- a/csrc/utility/string_util.h +++ b/csrc/utility/string_util.h @@ -93,6 +93,8 @@ class StringUtil { static std::string ToLower(const std::string& s); static std::string ToUpper(const std::string& s); + + static std::string RemoveLayerNumber(const std::string& s); }; namespace notstd { diff --git a/docs/sphinx/get_started/quick_start_api_server_en.md b/docs/sphinx/get_started/quick_start_api_server_en.md index 63a065d2..8639d926 100644 --- a/docs/sphinx/get_started/quick_start_api_server_en.md +++ b/docs/sphinx/get_started/quick_start_api_server_en.md @@ -24,9 +24,9 @@ docker run \ - `-m`: Use Modelscope to download the model - `--model-path`: Path for loading or downloading the model - `--device-list`: List of CUDA devices used to run the model - + For example: - + ```shell docker run \ --shm-size=8g \ diff --git a/examples/benchmark/benchmark_throughput.py b/examples/benchmark/benchmark_throughput.py index aa8b8604..2c4d7ef5 100644 --- a/examples/benchmark/benchmark_throughput.py +++ b/examples/benchmark/benchmark_throughput.py @@ -15,6 +15,12 @@ import re from tabulate import tabulate +from queue import Queue, Empty + + +import uuid +import logging + import signal import sys @@ -69,6 +75,7 @@ class Request: class ProfilingData(): def __init__(self, model_name, config): self.device_type = config.device_type + self.batch_size = config.engine_max_batch self.device_num_col_name = "NUMA_num" if self.device_type == "CPU" else "Device_num" self.num_dev = len(config.device_ids) self.model_name = model_name.replace(" ", "_") @@ -99,6 +106,7 @@ def reset(self, max_batch): self.avg_throughput = 0.0 self.request_num = 0 self.qps = 0.0 + self.batch_size = 0 self.df = self.df.iloc[0:0] self.detail_df = self.detail_df.iloc[0:0] @@ -366,8 +374,11 @@ def __init__(self, args): self.test_random_input = args.test_random_input self.test_dataset_id = args.test_dataset_id self.prefix_cache_list 
= args.prefix_cache_rate_list + self.skip_unfinished_task = args.skip_unfinished + self.verbose = args.verbose self.guided_decode = args.guided_decode + self.dtype = args.dtype if self.engine_max_length - self.test_max_output <= 0: raise ValueError("engine max length too small, should at least largeer than test max output") @@ -388,9 +399,23 @@ def __init__(self, args): if rate < 0 or rate > 1.0: raise ValueError("prefix cache rate must between (0.0, 1.0]") -def one_request(request, engine, model_loader, stream_mode): + import json + + self.quant_config =json.loads(args.quant_config) + + if len(self.quant_config) > 0: + self.enable_quant = True + else: + self.enable_quant = False + + if self.was_weight_quant: + self.enable_quant = True + + + +def one_request(request, engine, model_loader, stream_mode, uuid, config): torch_input = request.torch_input - print("one request\n") + print(f"one request: start {uuid}\n") print(f"### generation_config: {request.gen_cfg}") output_ids = [] @@ -419,9 +444,11 @@ def one_request(request, engine, model_loader, stream_mode): request.status = int(status) if status == allspark.GenerateRequestStatus.Init: + if config.verbose: + print(f"request in init state: {uuid}") pass elif status == allspark.GenerateRequestStatus.Generating or status == allspark.GenerateRequestStatus.GenerateFinished: - # new_ids = self.engine.get_wait(self.model_name, request.queue) + #generated_elem = request.queue.GetWithTimeout(10) generated_elem = request.queue.Get() if generated_elem is not None: new_ids = generated_elem.ids_from_generate @@ -442,6 +469,8 @@ def one_request(request, engine, model_loader, stream_mode): request.out_text = model_loader.get_tokenizer().decode(request.out_tokens, skip_special_tokens=True) #if stream_mode: #yield request.out_text + else: + print("get empty output id.") if status == allspark.GenerateRequestStatus.GenerateFinished: request.generate_time = time.time() - time_after_ctx @@ -458,6 +487,10 @@ def one_request(request, 
engine, model_loader, stream_mode): engine.stop_request("model", request_handle) engine.release_request("model", request_handle=request.handle) + + if config.verbose: + print(f"one request: finish {uuid}\n") + return request def request_generator(freq, task_queue, request_list): @@ -466,42 +499,150 @@ def request_generator(freq, task_queue, request_list): task_queue.put(request) time.sleep(time_interleave) + +def setup_logging(log_file_path=None): + """ + Sets up the logging configuration. + + Args: + log_file_path (str, optional): Path to the log file. If None, logs to console. + """ + logger = logging.getLogger() + logger.setLevel(logging.INFO) + + formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') + + if log_file_path: + file_handler = logging.FileHandler(log_file_path) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + else: + # Log to console + console_handler = logging.StreamHandler() + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + def request_processor(engine, config, model_loader, task_queue, future_list, progress_bar, task_generator_thread): - def done_callback(f): - request = f.argument - f.result() - progress_bar.update(1) - # engine_helper.print_inference_result(request) + """ + Processes requests using a thread pool executor. Assigns a unique UUID to each request, + logs the start and completion of each request, and monitors active tasks to continue + the stress test based on task stability. + + Args: + engine: The engine to process requests. + config: Configuration object containing settings like max_batch. + model_loader: Loader for the model. + task_queue (Queue): Queue containing incoming tasks. + future_list (list): List to keep track of submitted futures. + progress_bar: Progress bar object to update progress. + task_generator_thread (Thread): Thread responsible for generating tasks. 
+ """ - # 创建线程池 + def done_callback(future): + """ + Callback function executed when a future is done. Logs completion and updates progress. + + Args: + future: The future object that has completed. + """ + try: + # Retrieve the result to catch any exceptions + result = future.result() + request_id = future.uuid + if config.verbose: + logging.info(f"[ReleaseRequest] Request {request_id} completed successfully.") + progress_bar.update(1) + # Optional: Uncomment to log the inference result + # logging.info(f"Inference result for {request_id}: {result}") + except Exception as e: + request_id = getattr(future, 'uuid', 'Unknown') + logging.error(f"[StopRequest] Request {request_id} failed with error: {e}") + progress_bar.update(1) + + # Create a thread pool with a maximum number of workers specified in config executor = ThreadPoolExecutor(max_workers=config.engine_max_batch) - print(f"current level : {config.engine_max_batch}") + logging.info(f"Current engine_max_batch level: {config.engine_max_batch}") + + # Variables to monitor task progression + last_active_count = 0 # Last recorded number of active tasks + last_change_time = time.time() # Last time the active task count changed + stability_threshold = 5 # Seconds to wait before considering the task count stable - # 使用线程池处理任务 while True: - if not task_queue.empty(): - # 从队列获取任务,如果队列为空,会阻塞直到队列中有新的任务 - # print("task not empty") - request = task_queue.get() - elif task_generator_thread.is_alive(): - # 如果队列为空,判断是否还在生成任务,如果还在生成任务就继续 - # print("task queue alive.") - continue + # Count the number of active (not yet completed) tasks + active_tasks = sum(1 for future in future_list if not future.done()) + + # Check if the number of active tasks has changed since the last check + if active_tasks != last_active_count: + # If change detected, update the count and reset the change timer + last_active_count = active_tasks + last_change_time = time.time() else: - # 如果队列为空,并且已经不在生成任务了,就跳出 - # print("task empty") - break - - def 
test_fun(one): - print("test fun", one) - return 1 - # 将任务提交到线程池 - kwargs = {'request': request, 'engine': engine, 'model_loader': model_loader, 'stream_mode': False} - - f = executor.submit(one_request, **kwargs) - f.argument = request - f.add_done_callback(done_callback) - future_list.append(f) + # If no change, check if the stability threshold has been reached + elapsed_time = time.time() - last_change_time + if elapsed_time >= stability_threshold: + logging.info(f"No change in active tasks for {stability_threshold} seconds. Continuing stress test.") + # Reset the timer to allow for future stability detections + last_change_time = time.time() + + # **Proceed with the stress test** + # Implement the logic to continue the stress test here. + # For example, add more tasks to the queue if needed + # new_tasks = generate_additional_tasks() + # for task in new_tasks: + # task_queue.put(task) + + if config.skip_unfinished_task: + logging.info("!!!!! Skip unifinished task ") + + executor.shutdown(wait=False) + return + else: + logging.info("!!!!! 
unifinished task deteched, add --skip_unfinished to args to continue the test.") + continue + + + try: + # Attempt to retrieve a task from the queue without blocking + request = task_queue.get_nowait() + except Empty: + if task_generator_thread.is_alive(): + # If the queue is empty but the task generator is still running, wait briefly + time.sleep(1) + continue + else: + # If the queue is empty and the generator has finished, check active tasks + if active_tasks == 0: + # All tasks have been processed; exit the loop + break + else: + # Wait for remaining active tasks to complete + time.sleep(1) + continue + + if request is not None: + # Assign a unique UUID to the request + request_uuid = str(uuid.uuid4()) + if config.verbose: + logging.info(f"[StartRequest] Starting request {request_uuid}.") + + # Submit the task to the executor for processing + kwargs = { + 'request': request, + 'engine': engine, + 'uuid': request_uuid, + 'model_loader': model_loader, + 'config' : config, + 'stream_mode': False + } + future = executor.submit(one_request, **kwargs) + future.uuid = request_uuid # Attach UUID to the future for reference in callback + future.add_done_callback(done_callback) + future_list.append(future) + + # Shutdown the executor and wait for all tasks to complete + executor.shutdown(wait=True) + logging.info("All tasks have been processed. Exiting request processor.") def test_model_stress(config: BenchmarkConfig): @@ -548,8 +689,7 @@ def test_model_stress(config: BenchmarkConfig): ## start engine. - in_memory = False - # 示例函数,用来运行模型 + in_memory = True safe_model_name = str(config.model_path).replace("/", "_") model_real_path = config.model_path @@ -560,13 +700,17 @@ def test_model_stress(config: BenchmarkConfig): # replace mmodel with path. 
model_loader = allspark.HuggingFaceModel(model_real_path, safe_model_name, in_memory_serialize=in_memory, + user_set_data_type=config.dtype, trust_remote_code=True) engine = allspark.Engine() (model_loader.load_model() .read_model_config() - .serialize(engine, model_output_dir=".", enable_quant=config.was_weight_quant, weight_only_quant=config.weight_only_quant) + .serialize(engine, model_output_dir=".", + enable_quant=config.enable_quant, + weight_only_quant=config.weight_only_quant, + customized_quant_config=config.quant_config) .free_model()) runtime_cfg_builder = model_loader.create_reference_runtime_config_builder("model", config.device_type, @@ -583,6 +727,7 @@ def test_model_stress(config: BenchmarkConfig): engine.start_model('model') + # like change to engine max length tokenizer = model_loader.read_model_config().init_tokenizer().get_tokenizer() @@ -676,7 +821,7 @@ def test_model_stress(config: BenchmarkConfig): engine.stop_model('model') engine.release_model('model') - + print("benchmark done.") progress_bar.close() # engine_helper.uninit_allspark_engine() @@ -713,7 +858,11 @@ def parse_float_list(value): def signal_handler(sig, frame): global running - logging.info("signal received, exiting") + print("signal received, exiting") + + pid = os.getpid() + os.kill(pid, signal.SIGKILL) + running = False if __name__ == '__main__': @@ -722,18 +871,19 @@ def list_of_ints(arg): return list(map(int, arg.split(','))) parser = argparse.ArgumentParser(description='Benchmark model with random data or provided data list.') parser.add_argument('--model_path', type=str, required=False, help='The name of the model to run', default="qwen/Qwen2-0.5B-Instruct") - parser.add_argument("--modelscope", type=bool, required=False, default=True, help="use modelscope download model") + parser.add_argument("--modelscope", action='store_true', required=False, default=True, help="use modelscope download model") parser.add_argument("--weight_quant", type=bool, required=False, 
default=False, help="use weight quant") parser.add_argument("--weight_only_quant", type=bool, required=False, default=False, help="do weight only quant") parser.add_argument("--cache_mode", type=str, required=False, default="default", help="kv cache mode : [defualt,8bit,4bit]") - parser.add_argument("--device_type", type=str, required=False, default="CUDA", help="device tyep [CUDA,CPU]") + parser.add_argument("--device_type", type=str, required=False, default="CUDA", help="device type [CUDA,CPU]") parser.add_argument("--device_ids", type=list_of_ints, required=False, default="0", help="device ids like 0,1") - parser.add_argument("--verbose", type=bool, required=False, default=False, help="verbose logging") - + parser.add_argument("--verbose", action='store_true', required=False, help="verbose logging") + parser.add_argument("--dtype", type=str, required=False, default="bfloat16", help="data type of model [bfloat16, float16, float32]") + parser.add_argument("--quant_config", type=str, required=False, default="{}", help="customized quant config for model.") parser.add_argument("--engine_max_length", type=int, required=False, default=8192, help="engine max length, dataset will be filtered by this length.") parser.add_argument("--engine_max_batch", type=int, required=False, default=32, help="engine max batch, this value same as test concurrency.") - parser.add_argument("--engine_enable_prefix_cache", type=bool, required=False, default=False, help="enable prefix cache.") + parser.add_argument("--engine_enable_prefix_cache", action='store_true', required=False, help="enable prefix cache.") parser.add_argument("--test_qps", type=float, required=True, help="send test request by seconds.") parser.add_argument("--test_sample_size", type=int, required=False, default=100, help="how many sample data should be tested.") @@ -741,7 +891,8 @@ def list_of_ints(arg): parser.add_argument("--test_dataset_path", type=str, required=False, default=None, help="data set used in benchmark.") 
parser.add_argument("--test_dataset_id", type=int, required=False, default=None, help="dataset id choose between [0,1,2,3]") parser.add_argument("--test_random_input", action="store_true", default=False, help="use random data to benchmark") - parser.add_argument("--guided_decode", type=bool, required=False, default=False, help="enable guided decode for json object.") + parser.add_argument("--guided_decode", action="store_true", default=False, help="enable guided decode for json object.") + parser.add_argument("--skip_unfinished", action="store_true", default=False, help="skip unfinished task, processing the test.") parser.add_argument("--prefix_cache_rate_list", type=parse_float_list, required=False, default=[], help="add one cache running list, for benchmark different cache hit rate result, must be in decending order, like 0.99, 0.9, 0.6, 0.3, benchmark result will in multiple line, first line was total cache miss") parser.add_argument( @@ -754,6 +905,9 @@ def list_of_ints(arg): args = parser.parse_args() print(f"test start with {args}") + + setup_logging() + # 设置信号处理 signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) diff --git a/examples/python/01_sync_example_bf16_gpu.py b/examples/python/01_sync_example_bf16_gpu.py index 9559d299..91149d6b 100644 --- a/examples/python/01_sync_example_bf16_gpu.py +++ b/examples/python/01_sync_example_bf16_gpu.py @@ -53,6 +53,9 @@ # like change to engine max length to a smaller value runtime_cfg_builder.max_length(2048) +# if you want to change kv cache span size, valid value is 16, 32, 64, 128; default is 32 +# runtime_cfg_builder.kv_cache_span_size(128) + # like enable int8 kv-cache or int4 kv cache rather than fp16 kv-cache # runtime_cfg_builder.kv_cache_mode(AsCacheMode.AsCacheQuantI8) diff --git a/python/allspark_binding.cpp b/python/allspark_binding.cpp index 90b49360..94370cb3 100644 --- a/python/allspark_binding.cpp +++ b/python/allspark_binding.cpp @@ -353,7 +353,9 @@ 
PYBIND11_MODULE(_allspark, m) { .def_property("num_experts", &ConfigProto::num_experts, &ConfigProto::set_num_experts) .def_property("num_experts_per_tok", &ConfigProto::num_experts_per_tok, - &ConfigProto::set_num_experts_per_tok); + &ConfigProto::set_num_experts_per_tok) + .def_property("intermediate_size", &ConfigProto::intermediate_size, + &ConfigProto::set_intermediate_size); // major, minor is polluted by macro introduced by sys/sysmacros.h #ifdef major @@ -577,7 +579,8 @@ PYBIND11_MODULE(_allspark, m) { py::arg("swap_threshold") = -1, py::arg("text_graph") = false, py::arg("num_threads") = 0, py::arg("matmul_precision") = "highest", py::arg("lora_names") = std::vector(), - py::arg("cache_span_size") = 16, py::arg("cache_span_num_init") = 0, + py::arg("cache_span_size") = AsModelConfig::default_span_size, + py::arg("cache_span_num_init") = 0, py::arg("cache_span_num_grow") = 0, py::arg("enable_prefix_cache") = true, py::arg("prefix_cache_ttl") = 300, @@ -618,7 +621,9 @@ PYBIND11_MODULE(_allspark, m) { .def_readwrite("eviction_strategy", &AsModelConfig::eviction_strategy) .def_readwrite("scheduling_strategy", &AsModelConfig::scheduling_strategy) .def_readwrite("enable_sparsity_matmul", - &AsModelConfig::enable_sparsity_matmul); + &AsModelConfig::enable_sparsity_matmul) + .def_readwrite("lora_max_rank", &AsModelConfig::lora_max_rank) + .def_readwrite("lora_max_num", &AsModelConfig::lora_max_num); py::class_(m, "AsEngineStat") .def(py::init<>()) diff --git a/python/allspark_binding_common.h b/python/allspark_binding_common.h index 069d24f5..32901df3 100644 --- a/python/allspark_binding_common.h +++ b/python/allspark_binding_common.h @@ -105,6 +105,19 @@ void bindAsStatus(py::module& m) { "throw if meets error.") .value("ALLSPARK_SUCCESS", AsStatus::ALLSPARK_SUCCESS) .value("ALLSPARK_PARAM_ERROR", AsStatus::ALLSPARK_PARAM_ERROR) + .value("ALLSPARK_EXCEED_LIMIT_ERROR", + AsStatus::ALLSPARK_EXCEED_LIMIT_ERROR) + .value("ALLSPARK_ALLSPARK_INVALID_CALL_ERROR", + 
AsStatus::ALLSPARK_INVALID_CALL_ERROR) + .value("ALLSPARK_REQUEST_DENIED", AsStatus::ALLSPARK_REQUEST_DENIED) + .value("ALLSPARK_LORA_NUM_EXCEED_LIMIT_ERROR", + AsStatus::ALLSPARK_LORA_NUM_EXCEED_LIMIT_ERROR) + .value("ALLSPARK_LORA_RANK_EXCEED_LIMIT_ERROR", + AsStatus::ALLSPARK_LORA_RANK_EXCEED_LIMIT_ERROR) + .value("ALLSPARK_LORA_NOT_FOUND", AsStatus::ALLSPARK_LORA_NOT_FOUND) + .value("ALLSPARK_LORA_ALREADY_LOADED", + AsStatus::ALLSPARK_LORA_ALREADY_LOADED) + .value("ALLSPARK_LORA_IN_USE", AsStatus::ALLSPARK_LORA_IN_USE) .value("ALLSPARK_STREAMING", AsStatus::ALLSPARK_STREAMING); } diff --git a/python/pyhie/allspark/__init__.py b/python/pyhie/allspark/__init__.py index fc1f8b13..0aa463c7 100644 --- a/python/pyhie/allspark/__init__.py +++ b/python/pyhie/allspark/__init__.py @@ -15,7 +15,6 @@ __all__ = [ "AsStatus", "AsModelConfig", - "ModelRuntimeConfig", "Engine", "GenerateRequestStatus", "MultiMediaInfo", diff --git a/python/pyhie/allspark/config/diconfig.py b/python/pyhie/allspark/config/diconfig.py index 06a35c1c..103bac2a 100644 --- a/python/pyhie/allspark/config/diconfig.py +++ b/python/pyhie/allspark/config/diconfig.py @@ -43,6 +43,8 @@ - AsCacheQuantI8 - int8 KV-Cache - AsCacheQuantU4 - uint4 KV-Cache""", + "kv_cache_span_size": "number of tokens that a KV cache span (i.e., page) can contain, valid value is 16, 32, 64, and 128; default: 32", + "eviction_strategy": "how to choose eviction request when kv-cache is full for GPU choose between(default : MaxLength): [MaxLength, Random]", "enable_prefix_cache": "prefill prefix caching function related settings, if you have lots of common prefix in prompts, this function is strongly suggested, default : TRUE, [TRUE, FALSE] ", @@ -161,6 +163,7 @@ def to_eviction_strategy_str(mode : AsEvictionStrategy): self.runtime_config_dict['compute_unit']['compute_thread_in_device'] = as_runtime_cfg.num_threads self.runtime_config_dict['kv_cache_mode'] = to_cache_mode_str(as_runtime_cfg.cache_mode) + 
self.runtime_config_dict['kv_cache_span_size'] = as_runtime_cfg.cache_span_size self.runtime_config_dict['enable_prefix_cache'] = as_runtime_cfg.enable_prefix_cache self.runtime_config_dict['prefix_cache_ttl'] = as_runtime_cfg.prefix_cache_ttl self.runtime_config_dict['cuda_mem'] = {} diff --git a/python/pyhie/allspark/model/qwen_v15.py b/python/pyhie/allspark/model/qwen_v15.py index 55e5681f..14c434b0 100644 --- a/python/pyhie/allspark/model/qwen_v15.py +++ b/python/pyhie/allspark/model/qwen_v15.py @@ -44,6 +44,7 @@ def _build_graph(self, torch_cfg, derive_type): cfg.kv_channels = int(hidden_size_ / cfg.num_heads) cfg.activation = get_activation(torch_cfg.get('hidden_act', "silu")) cfg.size_per_head = torch_cfg.get('size_per_head', 128) + cfg.intermediate_size = torch_cfg.get('intermediate_size', 0) cfg.is_generate = self.is_generate rope_scaling = torch_cfg.get('rope_scaling',{}) if rope_scaling is None: diff --git a/python/pyhie/allspark/model/qwen_v20_moe.py b/python/pyhie/allspark/model/qwen_v20_moe.py index d8b74cce..6e99d1cb 100644 --- a/python/pyhie/allspark/model/qwen_v20_moe.py +++ b/python/pyhie/allspark/model/qwen_v20_moe.py @@ -64,6 +64,7 @@ def _build_graph(self, torch_cfg, derive_type): cfg.kv_channels = int(cfg.hidden_size / cfg.num_heads) cfg.activation = get_activation(torch_cfg.get('hidden_act', "silu")) cfg.size_per_head = torch_cfg.get('size_per_head', 128) + cfg.intermediate_size = torch_cfg.get('intermediate_size', 0) cfg.is_generate = self.is_generate # daoxian added for span version diff --git a/python/pyhie/allspark/quantization.py b/python/pyhie/allspark/quantization.py index 343b5be8..59c34798 100644 --- a/python/pyhie/allspark/quantization.py +++ b/python/pyhie/allspark/quantization.py @@ -95,7 +95,7 @@ def __init__( ####################################### if quant_settings is None: - self.init_with_config_derecated(quan_json, quantize_op_type) + self.init_with_config_deprecated(quan_json, quantize_op_type) return # use new setting to 
init. @@ -135,7 +135,7 @@ def __init__( # TODO: if some sub-channel kernel not supported, convert them into per-channel with TP-Split.from # weight process will be in model's init function - def init_with_config_derecated(self, quan_json, quantize_op_type): + def init_with_config_deprecated(self, quan_json, quantize_op_type): valid_extra_option = { "SubChannel": False, "GroupSize": 512, diff --git a/python/pyhie/allspark/runtime_config.py b/python/pyhie/allspark/runtime_config.py index 8509d11e..65bfa281 100644 --- a/python/pyhie/allspark/runtime_config.py +++ b/python/pyhie/allspark/runtime_config.py @@ -28,6 +28,8 @@ def __init__(self): self.engine_max_length = 2048 self.new_runtime_cfg = AsModelConfig() + self.new_runtime_cfg.lora_max_rank = 64 + self.new_runtime_cfg.lora_max_num = 5 """ The Runtime config, such as max batch, max length, and runtime feature like kv-cache quantization, etc. @@ -133,8 +135,24 @@ def max_batch(self, batch: int) -> 'AsModelRuntimeConfigBuilder': self.new_runtime_cfg.engine_max_batch = batch elif isinstance(batch, str): self.new_runtime_cfg.engine_max_batch = int(batch) + return self + def lora_max_rank(self, rank: int) -> 'AsModelRuntimeConfigBuilder': + """Sets the maximum LoRA rank for the engine.""" + if isinstance(rank, int): + self.new_runtime_cfg.lora_max_rank = rank + elif isinstance(rank, str): + self.new_runtime_cfg.lora_max_rank = int(rank) + return self + + def lora_max_num(self, num: int) -> 'AsModelRuntimeConfigBuilder': + """Sets the maximum number of LoRAs for the engine.""" + if isinstance(num, int): + self.new_runtime_cfg.lora_max_num = num + elif isinstance(num, str): + self.new_runtime_cfg.lora_max_num = int(num) return self + def max_prefill_length(self, length: int) -> 'AsModelRuntimeConfigBuilder': self.new_runtime_cfg.engine_max_prefill_length = length @@ -162,6 +180,13 @@ def enable_sparsity_matmul(self, enable=False) -> 'AsModelRuntimeConfigBuilder': def kv_cache_mode(self, cache_mode: AsCacheMode):
self.new_runtime_cfg.cache_mode = cache_mode return self + + def kv_cache_span_size(self, span_size: int): + """ + Valid span_size is 16, 32, 64, and 128. Default is 32. + """ + self.new_runtime_cfg.cache_span_size = span_size + return self @staticmethod def parse_device_type(compute_unit): @@ -205,9 +230,11 @@ def update_from_dict(self, rfield): if "enable_prefix_cache" in rfield: self.prefill_cache(bool(rfield['enable_prefix_cache'])) if "prefix_cache_ttl" in rfield: - self.prefix_cache(rfield['prefix_cache_ttl']) + self.prefix_cache_ttl(int(rfield['prefix_cache_ttl'])) if "kv_cache_mode" in rfield: self.kv_cache_mode(get_cache_mode_from_str(rfield['kv_cache_mode'])) + if "kv_cache_span_size" in rfield: + self.kv_cache_span_size(int(rfield['kv_cache_span_size'])) if "enable_sparsity_matmul" in rfield: self.enable_sparsity_matmul(bool(rfield['enable_sparsity_matmul'])) if "cache_span_num_init" in rfield: diff --git a/python/setup.py b/python/setup.py index e8dd9adb..630a3305 100644 --- a/python/setup.py +++ b/python/setup.py @@ -64,7 +64,6 @@ def build_extension(self, ext): as_build_hiednn = os.getenv("AS_BUILD_HIEDNN", "ON") as_utest = os.getenv("BUILD_UTEST", "OFF") as_span_attn = os.getenv("ENABLE_SPAN_ATTENTION", "ON") - as_flash_attn = os.getenv("FLASHATTN_BUILD_FROM_SOURCE", "ON") py_pkg_name_prefix = py_pkg_name.split('-')[0] enable_glibcxx11_abi = os.getenv("AS_CXX11_ABI", "OFF") @@ -92,7 +91,6 @@ def build_extension(self, ext): "-DALWAYS_READ_LOAD_MODEL=OFF", "-DBUILD_HIEDNN=" + as_build_hiednn, "-DENABLE_SPAN_ATTENTION=" + as_span_attn, - "-DFLASHATTN_BUILD_FROM_SOURCE=" + as_flash_attn, "-DENABLE_JSON_MODE=ON", "-DENABLE_GLIBCXX11_ABI=" + enable_glibcxx11_abi, "-DENABLE_MULTINUMA=OFF", @@ -175,6 +173,7 @@ def os_script_exec(cmd: str): third_party_subfolder = os.path.join(cwd.parent.absolute(), "third_party", "from_source") + if as_build_hiednn == "ON": print(f"setup.py: build hiednn from source.") hiednn_src_folder = os.path.join(cwd.parent.absolute(), 
"HIE-DNN") @@ -185,7 +184,6 @@ def os_script_exec(cmd: str): else: raise ValueError("not found HIE-DNN source code.") - os.chdir(self.build_temp) # prepare for connan env. @@ -203,7 +201,6 @@ def os_script_exec(cmd: str): conanfile += "_arm" conanfile += ".txt" - conan_install_arm = Template( "conan profile new dashinfer_compiler_profile --detect --force\n" + "cp -f {{cwd_parent}}/conan/conanprofile_armclang.aarch64 ~/.conan/profiles/dashinfer_compiler_profile\n" + diff --git a/scripts/clang-format/clang-format-apply-all.sh b/scripts/clang-format/clang-format-apply-all.sh index 5c558383..739140b3 100755 --- a/scripts/clang-format/clang-format-apply-all.sh +++ b/scripts/clang-format/clang-format-apply-all.sh @@ -9,7 +9,7 @@ SCRIPT=$(readlink -f "$0") SCRIPTPATH=$(dirname "$SCRIPT") -# check that we are in a clean state in order to prevent accidential +# check that we are in a clean state in order to prevent accidental # changes if [ ! -z "$(git status --untracked-files=no --porcelain)" ]; then echo "Script must be applied on a clean git state" @@ -110,7 +110,7 @@ done notcorrectlist=`git status --porcelain | grep '^ M' | cut -c4-` # if nothing changed ok if [[ -z $notcorrectlist ]]; then - # send a negative message to gitlab + # send a negative message to git echo "Excellent. **VERY GOOD FORMATTING!** :thumbsup:" exit 0; else diff --git a/scripts/clang-format/clang-format-apply.sh b/scripts/clang-format/clang-format-apply.sh index c8c3f014..1fd8d751 100755 --- a/scripts/clang-format/clang-format-apply.sh +++ b/scripts/clang-format/clang-format-apply.sh @@ -10,7 +10,7 @@ SCRIPT=$(readlink -f "$0") SCRIPTPATH=$(dirname "$SCRIPT") TARGET_BRANCH=$1 -# check that we are in a clean state in order to prevent accidential +# check that we are in a clean state in order to prevent accidental # changes if [ ! 
-z "$(git status --untracked-files=no --porcelain)" ]; then echo "Script must be applied on a clean git state" @@ -111,7 +111,7 @@ done notcorrectlist=`git status --porcelain | grep '^ M' | cut -c4-` # if nothing changed ok if [[ -z $notcorrectlist ]]; then - # send a negative message to gitlab + # send a negative message to git echo "Excellent. **VERY GOOD FORMATTING!** :thumbsup:" exit 0; else diff --git a/scripts/copyright/add_copyright.py b/scripts/copyright/add_copyright.py deleted file mode 100644 index 8c5c0e4e..00000000 --- a/scripts/copyright/add_copyright.py +++ /dev/null @@ -1,93 +0,0 @@ -import os -import re - -def add_copyright_statement(file_path, file_name, new_copyright, old_copyright_pattern=None): - try: - with open(file_path, 'r', encoding='utf-8') as file: - content = file.read() - except FileNotFoundError as e: - print("File not found:", e) - return - - # 如果存在旧的版权声明,则替换为新的版权声明 - if old_copyright_pattern is not None and old_copyright_pattern.search(content): - updated_content = old_copyright_pattern.sub(new_copyright, content, count=1) - print(f"Replaced old copyright statement with new copyright statement in {file_name}") - else: - # 否则,在文件开头添加新的版权声明 - updated_content = new_copyright + content - print(f"Added new copyright statement to {file_name}") - - # 将更新后的内容写回文件 - with open(file_path, 'w', encoding='utf-8') as file: - file.write(updated_content) - - -def traverse_all_files(target_path, target_extensions, ignore_list, new_copyright_format, copyright_holder, old_copyright_pattern=None): - # 遍历文件夹 - for root, dirs, files in os.walk(target_path): - for file in files: - # 获取文件的扩展名 - _, extension = os.path.splitext(file) - if extension in target_extensions: - # 构建完整的文件路径 - file_path = os.path.join(root, file) - - if any(ignore_word in file_path for ignore_word in ignore_list): - # 跳过不加版权声明的路径和文件 - continue - - if os.path.islink(file_path): - # 如果是软链接则跳过 - continue - - # 替换旧的版权声明或添加新的版权声明 - new_copyright = 
new_copyright_format.format(copyright_holder, file) - add_copyright_statement(file_path, file, new_copyright, old_copyright_pattern) - # print(f"file_path: {file_path}, file: {file}, extension: {extension}") - - -if __name__ == '__main__': - # 设置文件夹路径 - target_path = '/root/workspace/DashInfer' - - # 以下路径的文件不加版权声明 - ignore_list = [ - 'HIE-DNN/doc/Doxyfile.in', - 'csrc/interface/dlpack.h', - 'csrc/utility/cnpy.cpp', - 'csrc/utility/cnpy.h', - 'csrc/utility/concurrentqueue.h', - 'third_party', - 'thirdparty', - 'add_copyright.py', - 'auto-format-python.py', - 'csrc/core/kernel/cuda/xformer_mha/kernel_forward.h', - 'examples/cpp/tokenizer', - 'examples/cpp/utils/CLI11.hpp', - 'examples/python/2_evaluation', - 'multimodal/dashinfer_vlm/vl_inference/utils/trt', - 'scripts/clang-format', - 'build' - ] - - # 设置版权持有者名称 - copyright_holder = 'Alibaba, Inc.' - - # 需要加入版权声明的文件后缀 - target_extensions_cpp = {'.c', '.cpp', '.h', '.hpp', '.cu', '.cuh', '.proto', '.in'} - target_extensions_py = {'.py'} - - # 创建新的版权声明 - new_copyright_format_cpp = "/*!\n * Copyright (c) {} and its affiliates.\n * @file {}\n */\n" - # new_copyright_format_py = "#\n# Copyright (c) {} and its affiliates.\n# @file {}\n#\n" - new_copyright_format_py = "'''\n Copyright (c) {} and its affiliates.\n @file {}\n'''\n" - - # 正则表达式匹配旧的版权声明 - old_copyright_pattern_cpp = re.compile(r"/\*!\n \* Copyright[\s\S]*?\*/\n", re.MULTILINE) - # old_copyright_pattern_py = re.compile(r"#\n# Copyright[\s\S]*?\n#\n", re.MULTILINE) - old_copyright_pattern_py = re.compile(r"'''\n Copyright[\s\S]*?\n'''\n", re.MULTILINE) - - # 遍历文件夹,更新版权声明 - traverse_all_files(target_path, target_extensions_cpp, ignore_list, new_copyright_format_cpp, copyright_holder, old_copyright_pattern_cpp) - traverse_all_files(target_path, target_extensions_py, ignore_list, new_copyright_format_py, copyright_holder, old_copyright_pattern_py) diff --git a/scripts/docker/build_cu124.sh b/scripts/docker/build_cu124.sh deleted file mode 100755 index 
49a6fc30..00000000 --- a/scripts/docker/build_cu124.sh +++ /dev/null @@ -1,4 +0,0 @@ -cp ../../python/requirements_dev.txt . - -docker_ver=v1_py310 -docker build -f dev_cuda_124.Dockerfile . --build-arg PY_VER=3.10 -t dashinfer/dev-cu124:${docker_ver} diff --git a/scripts/docker/build_docker.sh b/scripts/docker/build_docker.sh deleted file mode 100755 index 6809b8a4..00000000 --- a/scripts/docker/build_docker.sh +++ /dev/null @@ -1,47 +0,0 @@ -set -x - -# yum install -y yum-utils -# yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo -# yum install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin -y -# systemctl start docker - -################### -# docker build -f dev_x86_ubuntu.Dockerfile --build-arg PY_VER=3.8 --format=docker -t dashinfer/dev-ubuntu-22.04-x86:v1 -# docker build -f dev_x86_ubuntu.Dockerfile --build-arg PY_VER=3.10 --format=docker -t dashinfer/dev-ubuntu-22.04-x86:v1_py310 - -# docker build -f dev_x86_centos7.Dockerfile --build-arg PY_VER=3.8 --format=docker -t dashinfer/dev-centos7-x86:v2 -# docker build -f dev_x86_centos7.Dockerfile --build-arg PY_VER=3.10 --format=docker -t dashinfer/dev-centos7-x86:v2_py310 - -# docker build -f dev_arm_alinux.Dockerfile --build-arg PY_VER=3.8 --format=docker -t dashinfer/dev-alinux-arm:v1 -# docker build -f dev_arm_alinux.Dockerfile --build-arg PY_VER=3.10 --format=docker -t dashinfer/dev-alinux-arm:v1_py310 - -# docker build -f dev_arm_centos8.Dockerfile --build-arg PY_VER=3.8 --format=docker -t dashinfer/dev-centos8-arm:v2 -# docker build -f dev_arm_centos8.Dockerfile --build-arg PY_VER=3.10 --format=docker -t dashinfer/dev-centos8-arm:v2_py310 -################### - -# docker build -f release_x86_manylinux2.Dockerfile --build-arg PY_VER=3.8 --format=docker -t localhost/dashinfer/dev-manylinux-x86:v2 -# docker build -f test_x86_ubuntu.Dockerfile --build-arg PY_VER=3.8 --format=docker -t localhost/dashinfer/test-ubuntu-x86:v1 - -# docker build 
-f release_aarch64_manylinux2.Dockerfile --build-arg PY_VER=3.8 --format=docker -t localhost/dashinfer/dev-manylinux-arm:v2 -# docker build -f test_aarch64_centos.Dockerfile --build-arg PY_VER=3.8 --format=docker -t localhost/dashinfer/test-centos-arm:v1 - -# docker push dashinfer/dev-ubuntu-22.04-x86:v1 -# docker push dashinfer/dev-ubuntu-22.04-x86:v1_py310 -# docker push dashinfer/dev-centos7-x86:v2 -# docker push dashinfer/dev-centos7-x86:v2_py310 -# docker push dashinfer/dev-manylinux-x86:v2 -# docker push dashinfer/dev-alinux-arm:v1 -# docker push dashinfer/dev-alinux-arm:v1_py310 -# docker push dashinfer/dev-centos8-arm:v2 -# docker push dashinfer/dev-centos8-arm:v2_py310 -# docker push dashinfer/dev-manylinux-arm:v2 - -# docker pull registry-1.docker.io/dashinfer/dev-ubuntu-22.04-x86:v1 -# docker pull registry-1.docker.io/dashinfer/dev-ubuntu-22.04-x86:v1_py310 -# docker pull registry-1.docker.io/dashinfer/dev-centos7-x86:v2 -# docker pull registry-1.docker.io/dashinfer/dev-centos7-x86:v2_py310 -# docker pull registry-1.docker.io/dashinfer/dev-manylinux-x86:v2 -# docker pull registry-1.docker.io/dashinfer/dev-alinux-arm:v1 -# docker pull registry-1.docker.io/dashinfer/dev-alinux-arm:v1_py310 - -set +x diff --git a/scripts/docker/build_fschat_ubuntu_cuda.sh b/scripts/docker/build_fschat_ubuntu_cuda.sh deleted file mode 100755 index f4d01b61..00000000 --- a/scripts/docker/build_fschat_ubuntu_cuda.sh +++ /dev/null @@ -1,9 +0,0 @@ -set -x - -cp ../../examples/python/4_fastchat/cuda/allspark_worker.py ./ - -dashinfer_version=v2.0 -docker build -f fschat_ubuntu_cuda.Dockerfile . 
-t dashinfer/fschat_ubuntu_cuda:${dashinfer_version} - -rm ./allspark_worker.py -set +x diff --git a/scripts/docker/build_fschat_ubuntu_x86.sh b/scripts/docker/build_fschat_ubuntu_x86.sh deleted file mode 100755 index 20f40eef..00000000 --- a/scripts/docker/build_fschat_ubuntu_x86.sh +++ /dev/null @@ -1,9 +0,0 @@ -set -x - -cp ../../examples/python/4_fastchat/cpu/allspark_worker.py ./ - -dashinfer_version=v1.2.1 -docker build -f fschat_ubuntu_x86.Dockerfile . -t dashinfer/fschat_ubuntu_x86:${dashinfer_version} - -rm ./allspark_worker.py -set +x diff --git a/scripts/docker/dev_arm_centos8.Dockerfile b/scripts/docker/dev_arm_centos8.Dockerfile index 04a0f749..c20d0ea5 100644 --- a/scripts/docker/dev_arm_centos8.Dockerfile +++ b/scripts/docker/dev_arm_centos8.Dockerfile @@ -75,9 +75,5 @@ RUN curl -LO https://github.com/NixOS/patchelf/archive/refs/tags/0.14.5.tar.gz & cd .. && rm -rf patchelf-0.14.5 0.14.5.tar.gz RUN pip3 install auditwheel==6.1.0 -RUN wget "https://xxxxxx/conan_allspark_source_arm_20241119.tar" && \ - tar -xvf conan_allspark_source_arm_20241119.tar && \ - mv conan_allspark_source_arm_20241119 /root/.conan && \ - rm -rf conan_allspark_source_arm_20241119.tar WORKDIR /root/ diff --git a/scripts/docker/dev_cuda_124.Dockerfile b/scripts/docker/dev_cuda_124.Dockerfile index e624fda9..7b085ab5 100644 --- a/scripts/docker/dev_cuda_124.Dockerfile +++ b/scripts/docker/dev_cuda_124.Dockerfile @@ -112,9 +112,5 @@ RUN curl -LO https://github.com/NixOS/patchelf/archive/refs/tags/0.14.5.tar.gz & cd .. 
&& rm -rf patchelf-0.14.5 0.14.5.tar.gz RUN pip3 install auditwheel==6.1.0 -RUN wget "https://xxxxxx/conan_allspark_source_cuda124_20241203_verbose.tar" && \ - tar -xvf conan_allspark_source_cuda124_20241203_verbose.tar && \ - mv conan_allspark_source_cuda124_20241203_verbose /root/.conan && \ - rm -rf conan_allspark_source_cuda124_20241203_verbose.tar WORKDIR /root/ diff --git a/scripts/docker/dev_x86_centos7.Dockerfile b/scripts/docker/dev_x86_centos7.Dockerfile index 74bccdd1..8be6cbe4 100644 --- a/scripts/docker/dev_x86_centos7.Dockerfile +++ b/scripts/docker/dev_x86_centos7.Dockerfile @@ -100,9 +100,5 @@ RUN curl -LO https://github.com/NixOS/patchelf/archive/refs/tags/0.14.5.tar.gz & cd .. && rm -rf patchelf-0.14.5 0.14.5.tar.gz RUN pip3 install auditwheel==6.1.0 -RUN wget "https://xxxxxx/conan_allspark_source_x86_20241119.tar" && \ - tar -xvf conan_allspark_source_x86_20241119.tar && \ - mv conan_allspark_source_x86_20241119 /root/.conan && \ - rm -rf conan_allspark_source_x86_20241119.tar WORKDIR /root/ diff --git a/scripts/docker/fschat_ubuntu_cuda.Dockerfile b/scripts/docker/fschat_ubuntu_cuda.Dockerfile index a869d270..2f5b3576 100644 --- a/scripts/docker/fschat_ubuntu_cuda.Dockerfile +++ b/scripts/docker/fschat_ubuntu_cuda.Dockerfile @@ -1,4 +1,4 @@ -FROM dashinfer/dev-ubuntu-22.04-x86:v1_py310 +FROM dashinfer/test-ubuntu-cu124:v1 WORKDIR /workspace @@ -15,7 +15,8 @@ RUN pip install \ RUN pip uninstall pydantic -y \ && pip install -i https://mirrors.aliyun.com/pypi/simple pydantic==1.10.13 -RUN pip install 'dashinfer==2.0.0' +RUN wget https://github.com/modelscope/dash-infer/releases/download/v2.0.0-rc2/dashinfer-2.0.0rc2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl \ + && pip install dashinfer-2.0.0rc2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl RUN chmod +x ./fschat_entrypoint.sh diff --git a/scripts/docker/release_aarch64_manylinux2.Dockerfile b/scripts/docker/release_aarch64_manylinux2.Dockerfile index 1fcc01bc..323be02d 
100644 --- a/scripts/docker/release_aarch64_manylinux2.Dockerfile +++ b/scripts/docker/release_aarch64_manylinux2.Dockerfile @@ -56,8 +56,5 @@ RUN wget "ftp://ftp.gnu.org/gnu/automake/automake-1.15.1.tar.gz" && \ cd automake-1.15.1 && ./configure && make -j && make install && \ cd .. && rm -rf automake-1.15.1.tar.gz automake-1.15.1 -RUN wget "https://xxxxxx/conan_allspark_source_arm_20241119.tar" && \ - tar -xvf conan_allspark_source_arm_20241119.tar && \ - mv conan_allspark_source_arm_20241119 /root/.conan && \ - rm -rf conan_allspark_source_arm_20241119.tar + WORKDIR /root/ diff --git a/scripts/docker/release_x86_manylinux2.Dockerfile b/scripts/docker/release_x86_manylinux2.Dockerfile index 2c241322..fe0131d1 100644 --- a/scripts/docker/release_x86_manylinux2.Dockerfile +++ b/scripts/docker/release_x86_manylinux2.Dockerfile @@ -61,9 +61,5 @@ RUN wget "ftp://ftp.gnu.org/gnu/automake/automake-1.15.1.tar.gz" && \ cd automake-1.15.1 && ./configure --prefix=/usr/local/ && make -j && make install && \ cd .. 
&& rm -rf automake-1.15.1.tar.gz automake-1.15.1 -RUN wget "https://xxxxxx/conan_allspark_source_x86_20241119.tar" && \ - tar -xvf conan_allspark_source_x86_20241119.tar && \ - mv conan_allspark_source_x86_20241119 /root/.conan && \ - rm -rf conan_allspark_source_x86_20241119.tar WORKDIR /root/ diff --git a/scripts/release/python_manylinux_build_cuda.sh b/scripts/release/python_manylinux_build_cuda.sh index 661f226b..f2d8566e 100755 --- a/scripts/release/python_manylinux_build_cuda.sh +++ b/scripts/release/python_manylinux_build_cuda.sh @@ -38,7 +38,6 @@ function repair_wheel { else # TODO: add lib path to build lib path auditwheel repair "$wheel" --plat "$PLAT" -w ${REPO_ROOT}/python/wheelhouse/ --exclude libcublas.so.12 --exclude libcublasLt.so.12 --exclude libcudart.so.12 --exclude libcusparse.so.12 --exclude libnvJitLink.so.12 --exclude libcuda.so.1 --exclude libnccl.so.2 --exclude libnvidia-ml.so.1 --exclude libcudart.so.12 - fi } diff --git a/scripts/yapf-format/auto-format-python.py b/scripts/yapf-format/auto-format-python.py deleted file mode 100644 index 6aa0c557..00000000 --- a/scripts/yapf-format/auto-format-python.py +++ /dev/null @@ -1,40 +0,0 @@ -import os - -def traverse_all_files(target_path, target_extensions, ignore_list): - # 遍历文件夹 - for root, dirs, files in os.walk(target_path): - for file in files: - _, extension = os.path.splitext(file) - if extension in target_extensions: - # 构建完整的文件路径 - file_path = os.path.join(root, file) - - if any(ignore_word in file_path for ignore_word in ignore_list): - # 跳过忽略的文件和路径 - continue - - if os.path.islink(file_path): - # 跳过软链接 - continue - - print(f"auto-format {file_path}") - cmd = "yapf --style pep8 --in-place " + file_path - os.system(cmd) - - -if __name__ == '__main__': - # 设置文件夹路径 - target_path = '/root/workspace/DashInfer' - - ignore_list = [ - 'third_party', - 'thirdparty', - 'add_copyright.py', - 'auto-format-python.py', - 'build' - ] - - target_extensions = {'.py'} - - # 遍历文件夹,对python文件进行格式化 - 
traverse_all_files(target_path, target_extensions, ignore_list) diff --git a/tests/cpp/kernel/cuda/kernel_mhaprefill_test.cpp b/tests/cpp/kernel/cuda/kernel_mhaprefill_test.cpp index 64cd5011..f3470bbf 100644 --- a/tests/cpp/kernel/cuda/kernel_mhaprefill_test.cpp +++ b/tests/cpp/kernel/cuda/kernel_mhaprefill_test.cpp @@ -5,7 +5,6 @@ #include #include -#include #include #include @@ -13,6 +12,8 @@ #include #include #include +#include + #if 0 void reference_prefill_attention( const allspark::cuda::trivial_t& param, diff --git a/tests/cpp/model/cuda/model_cuda_test.cpp b/tests/cpp/model/cuda/model_cuda_test.cpp index e215491a..7ec71e24 100644 --- a/tests/cpp/model/cuda/model_cuda_test.cpp +++ b/tests/cpp/model/cuda/model_cuda_test.cpp @@ -241,7 +241,7 @@ TEST_F(AsModelCUDA, M6_7B_CacheDefault_Interrupted) { as_model_config.engine_max_batch = max_batch_in_test; as_model_config.engine_max_length = qwen1_7b_test1_input_tokens_long.size() + 1024; - as_model_config.cache_span_size = 32; + // as_model_config.cache_span_size = 32; as_model_config.cache_mode = AsCacheMode::AsCacheDefault; as_model_config.enable_prefix_cache = false; // disable cache to make more interrupt. 
@@ -412,7 +412,7 @@ TEST_F(AsModelCUDA, M6_7B_BS100) { as_model_config.engine_max_batch = max_batch_in_test; as_model_config.engine_max_length = 1024; - as_model_config.cache_span_size = 32; + // as_model_config.cache_span_size = 32; as_model_config.cache_mode = AsCacheMode::AsCacheDefault; as_model_config.prefill_mode = AsMHAPrefill::AsPrefillXformer; as_model_config.enable_prefix_cache = false; @@ -573,7 +573,7 @@ TEST_F(AsModelCUDA, M6_7B_CacheDefault) { as_model_config.engine_max_batch = max_batch_in_test; as_model_config.engine_max_length = 1024; - as_model_config.cache_span_size = 32; + // as_model_config.cache_span_size = 32; as_model_config.cache_mode = AsCacheMode::AsCacheDefault; as_model_config.prefill_mode = AsMHAPrefill::AsPrefillXformer; as_model_config.enable_prefix_cache = false; @@ -749,7 +749,7 @@ TEST_F(AsModelCUDA, M6_7B_CacheI8) { model_path + "/" + model_name + ".asparam", CUDA_DEVICE); as_model_config.engine_max_batch = max_batch_in_test; as_model_config.engine_max_length = 1024; - as_model_config.cache_span_size = 32; + // as_model_config.cache_span_size = 32; as_model_config.cache_mode = AsCacheMode::AsCacheQuantI8; as_model_config.prefill_mode = AsMHAPrefill::AsPrefillXformer; as_model_config.enable_prefix_cache = false; @@ -894,7 +894,7 @@ TEST_F(AsModelCUDA, M6_7B_CacheU4) { model_path + "/" + model_name + ".asparam", CUDA_DEVICE); as_model_config.engine_max_batch = max_batch_in_test; as_model_config.engine_max_length = 1024; - as_model_config.cache_span_size = 32; + // as_model_config.cache_span_size = 32; as_model_config.cache_mode = AsCacheMode::AsCacheQuantU4; as_model_config.prefill_mode = AsMHAPrefill::AsPrefillXformer; @@ -1161,7 +1161,7 @@ TEST_F(AsModelCUDA, M6_7B_RichEmbedding) { model_path + "/" + model_name + ".asparam", CUDA_DEVICE); as_model_config.engine_max_batch = max_batch_in_test; as_model_config.engine_max_length = 1024; - as_model_config.cache_span_size = 32; + // as_model_config.cache_span_size = 32; 
as_model_config.enable_prefix_cache = false; std::vector> gen_config_vec; @@ -1274,612 +1274,6 @@ TEST_F(AsModelCUDA, M6_7B_RichEmbedding) { EXPECT_EQ(in1.Free(), allspark::AsStatus::ALLSPARK_SUCCESS); } -TEST_F(AsModelCUDA, CodeQwen15_7B_GQA_CacheDefault) { - const std::string model_name = "CodeQwen15_7B_Chat_fp16_A16W8_perc"; - const std::string model_path = std::string(getenv("ALLSPARK_TESTCASE_PATH")) + - "testcase/" + (model_name + "/"); - - constexpr int num_waves = 1; - constexpr int max_batch_in_test = 2; - constexpr int batch_size = 1; - /* - def quick_sort(arr): - """Implement a simple quick sort.""" - if len(arr) <= 1: - return arr - pivot = arr[len(arr) // 2] - left = [x for x in arr if x < pivot] - middle = [x for x in arr if x == pivot] - return quick_sort(left) + middle + quick_sort(right) - */ - const std::vector in0_data = { - 92295, 3138, 3904, 35356, 17278, 35354, 5025, 3405, 1396, 32, - 30737, 5173, 4571, 1643, 3998, 3904, 4843, 30224, 1396, 32, - 1740, 9527, 35354, 5025, 35353, 14202, 35321, 35381, 35371, 1396, - 33, 3166, 5586, 1396, 32, 35336, 1808, 1701, 1813, 5586, - 35395, 6371, 35354, 5025, 35353, 2367, 35321, 35385, 35396, 92297, - 1396, 32, 2520, 1813, 2212, 35359, 1713, 2775, 1672, 5586, - 1963, 2775, 2100, 21328, 1701, 35396, 1396, 32, 23996, 1813, - 2212, 35359, 1713, 2775, 1672, 5586, 1963, 2775, 2706, 21328, - 1701, 35396, 1396, 32, 3166, 3904, 35356, 17278, 35354, 2520, - 35353, 2279, 5803, 2279, 3904, 35356, 17278, 35354, 2256, 35353, - 92296}; - const int seq_len = static_cast(in0_data.size()); - - const std::vector in1_data(batch_size * seq_len, 1); - - /* - def quick_sort(arr): - """Implement a simple quick sort.""" - if len(arr) <= 1: - return arr - pivot = arr[len(arr) // 2] - left = [x for x in arr if x < pivot] - middle = [x for x in arr if x == pivot] - return quick_sort(left) + middle + quick_sort(right) - right = [x for x in arr if x > pivot]<|endoftext|> - */ - const std::vector out1_data = { - 92295, 3138, 3904, 
35356, 17278, 35354, 5025, 3405, 1396, 32, - 30737, 5173, 4571, 1643, 3998, 3904, 4843, 30224, 1396, 32, - 1740, 9527, 35354, 5025, 35353, 14202, 35321, 35381, 35371, 1396, - 33, 3166, 5586, 1396, 32, 35336, 1808, 1701, 1813, 5586, - 35395, 6371, 35354, 5025, 35353, 2367, 35321, 35385, 35396, 92297, - 1396, 32, 2520, 1813, 2212, 35359, 1713, 2775, 1672, 5586, - 1963, 2775, 2100, 21328, 1701, 35396, 1396, 32, 23996, 1813, - 2212, 35359, 1713, 2775, 1672, 5586, 1963, 2775, 2706, 21328, - 1701, 35396, 1396, 32, 3166, 3904, 35356, 17278, 35354, 2520, - 35353, 2279, 5803, 2279, 3904, 35356, 17278, 35354, 2256, 35353, - 92296, 1396, 32, 2256, 1813, 2212, 35359, 1713, 2775, 1672, - 5586, 1963, 2775, 3215, 21328, 1701, 35396, 2}; - - const int out_len = static_cast(out1_data.size()); - - allspark::AsTensor in0("input_ids", allspark::CPU, allspark::INT64, - allspark::DataMode::DENSE, - allspark::Shape({batch_size, seq_len})); - allspark::AsTensor in1("attention_mask", allspark::CPU, allspark::INT64, - allspark::DataMode::DENSE, - allspark::Shape({batch_size, seq_len})); - - in0.CopyDataFrom(in0_data.data(), batch_size * seq_len * sizeof(int64_t), - allspark::CPU); - in1.CopyDataFrom(in1_data.data(), batch_size * seq_len * sizeof(int64_t), - allspark::CPU); - const DLTensorMap inputs = { - {"input_ids", in0.ToDLPack(device_context.get())}, - {"attention_mask", in1.ToDLPack(device_context.get())}}; - - std::string graph_path = model_path + "/" + model_name + ".asgraph"; - std::string weight_path = model_path + "/" + model_name + ".asparam"; - - AsModelConfig as_model_config = AsModelConfig( - model_name, graph_path.c_str(), weight_path.c_str(), CUDA_DEVICE); - - as_model_config.engine_max_batch = max_batch_in_test; - as_model_config.engine_max_length = 1024; - as_model_config.cache_span_size = 32; - as_model_config.cache_mode = AsCacheMode::AsCacheDefault; - as_model_config.prefill_mode = AsMHAPrefill::AsPrefillDefault; - as_model_config.enable_prefix_cache = true; - - 
std::vector> gen_config_vec; - for (int i = 0; i < max_batch_in_test; ++i) { - auto cfg = std::make_unique(); - cfg->max_length = out_len; - cfg->early_stopping = false; - cfg->top_k = 1; - cfg->top_p = 0; - gen_config_vec.emplace_back(std::move(cfg)); - } - - allspark::AsEngine as_engine; - - auto file_version_info = - as_engine.GetFileInformation(graph_path.c_str(), weight_path.c_str()); - - ASSERT_EQ(file_version_info.create_version_graph, "2.0.0"); - ASSERT_EQ(file_version_info.create_version_param, "2.0.0"); - - ASSERT_EQ(as_engine.BuildModelFromConfigStruct(as_model_config), - allspark::AsStatus::ALLSPARK_SUCCESS); - ASSERT_EQ(as_engine.StartModel(model_name.c_str()), - allspark::AsStatus::ALLSPARK_SUCCESS); - - std::vector> reqs( - max_batch_in_test); - std::vector pending_handles(max_batch_in_test); - std::vector pending_queue(max_batch_in_test); - - for (int i = 0; i < max_batch_in_test; ++i) { - std::shared_ptr req = - std::make_shared(); - req->config = *(gen_config_vec[i]); - req->infer_type = AsEngine::RequestInferType::Generate; - req->inputs = std::make_shared(inputs); - req->mm_type = AsEngine::RequestMMType::TextInput; - reqs[i] = std::move(req); - } - - util::Timer timer; - std::vector> result(max_batch_in_test); - - for (int wave = 0; wave < num_waves; ++wave) { - LOG(INFO) << "=================================================" - << "Wave " << wave - << "=================================================" - << std::endl; - - auto time_start = timer.elapsed(); - - // request wave 1 - for (int i = 0; i < max_batch_in_test; ++i) { - LOG(INFO) << "Wave " << wave << " test.start request: " << i; - - result[i] = std::async( - std::launch::async, [&, i, model_name]() -> allspark::AsStatus { - return as_engine.StartRequest(model_name.c_str(), reqs[i], - &(pending_handles[i]), - &(pending_queue[i])); - }); - } - - for (int i = 0; i < max_batch_in_test; ++i) { - EXPECT_EQ(result[i].get(), allspark::AsStatus::ALLSPARK_SUCCESS); - LOG(INFO) << "Wave " << wave 
<< " test.start, finish request: " << i; - } - - // sync all - ASSERT_EQ(as_engine.SyncRequest(model_name.c_str(), nullptr), - allspark::AsStatus::ALLSPARK_SUCCESS); - - auto time_end = timer.elapsed(); - auto duration = time_end - time_start; - - size_t sum = 0; - for (auto q_ptr : pending_queue) { - if (!q_ptr) continue; - auto ele = q_ptr->Get(); - if (!ele) continue; - - sum += ele->ids_from_generate.size(); - std::vector result = in0_data; - result.insert(result.end(), - std::move_iterator(ele->ids_from_generate.begin()), - std::move_iterator(ele->ids_from_generate.end())); - - int64_t eps1 = check_equal(result.data(), out1_data.data(), - batch_size * out_len); - EXPECT_LE(eps1, MODEL_EPS); - } - int total_count = sum; - LOG(INFO) << "Wave " << wave << " Total Tokens " << total_count - << " ms: " << duration - << " throughput: " << (total_count) / (duration / 1000.0f); - - for (auto& handle : pending_handles) { - ASSERT_EQ(as_engine.ReleaseRequest(model_name.c_str(), handle), - allspark::AsStatus::ALLSPARK_SUCCESS); - static int idx = 0; - LOG(INFO) << "Wave " << wave << " test.release: " << idx++; - } - } - - LOG(INFO) << "=================================================" - << "Stop Model" - << "=================================================" << std::endl; - - // this is required to release the model loop thread - ASSERT_EQ(as_engine.StopModel(model_name.c_str()), - allspark::AsStatus::ALLSPARK_SUCCESS); - ASSERT_EQ(in0.Free(), allspark::AsStatus::ALLSPARK_SUCCESS); - ASSERT_EQ(in1.Free(), allspark::AsStatus::ALLSPARK_SUCCESS); -} - -TEST_F(AsModelCUDA, CodeQwen15_7B_GQA_CacheI8) { - const std::string model_name = "CodeQwen15_7B_Chat_fp16_A16W8_perc"; - const std::string model_path = std::string(getenv("ALLSPARK_TESTCASE_PATH")) + - "testcase/" + (model_name + "/"); - - constexpr int num_waves = 1; - constexpr int max_batch_in_test = 2; - constexpr int batch_size = 1; - /* - def quick_sort(arr): - """Implement a simple quick sort.""" - if len(arr) <= 1: - 
return arr - pivot = arr[len(arr) // 2] - left = [x for x in arr if x < pivot] - middle = [x for x in arr if x == pivot] - return quick_sort(left) + middle + quick_sort(right) - */ - const std::vector in0_data = { - 92295, 3138, 3904, 35356, 17278, 35354, 5025, 3405, 1396, 32, - 30737, 5173, 4571, 1643, 3998, 3904, 4843, 30224, 1396, 32, - 1740, 9527, 35354, 5025, 35353, 14202, 35321, 35381, 35371, 1396, - 33, 3166, 5586, 1396, 32, 35336, 1808, 1701, 1813, 5586, - 35395, 6371, 35354, 5025, 35353, 2367, 35321, 35385, 35396, 92297, - 1396, 32, 2520, 1813, 2212, 35359, 1713, 2775, 1672, 5586, - 1963, 2775, 2100, 21328, 1701, 35396, 1396, 32, 23996, 1813, - 2212, 35359, 1713, 2775, 1672, 5586, 1963, 2775, 2706, 21328, - 1701, 35396, 1396, 32, 3166, 3904, 35356, 17278, 35354, 2520, - 35353, 2279, 5803, 2279, 3904, 35356, 17278, 35354, 2256, 35353, - 92296}; - const int seq_len = static_cast(in0_data.size()); - - const std::vector in1_data(batch_size * seq_len, 1); - - /* - def quick_sort(arr): - """Implement a simple quick sort.""" - if len(arr) <= 1: - return arr - pivot = arr[len(arr) // 2] - left = [x for x in arr if x < pivot] - middle = [x for x in arr if x == pivot] - return quick_sort(left) + middle + quick_sort(right) - right = [x for x in arr if x > pivot]<|endoftext|> - */ - const std::vector out1_data = { - 92295, 3138, 3904, 35356, 17278, 35354, 5025, 3405, 1396, 32, - 30737, 5173, 4571, 1643, 3998, 3904, 4843, 30224, 1396, 32, - 1740, 9527, 35354, 5025, 35353, 14202, 35321, 35381, 35371, 1396, - 33, 3166, 5586, 1396, 32, 35336, 1808, 1701, 1813, 5586, - 35395, 6371, 35354, 5025, 35353, 2367, 35321, 35385, 35396, 92297, - 1396, 32, 2520, 1813, 2212, 35359, 1713, 2775, 1672, 5586, - 1963, 2775, 2100, 21328, 1701, 35396, 1396, 32, 23996, 1813, - 2212, 35359, 1713, 2775, 1672, 5586, 1963, 2775, 2706, 21328, - 1701, 35396, 1396, 32, 3166, 3904, 35356, 17278, 35354, 2520, - 35353, 2279, 5803, 2279, 3904, 35356, 17278, 35354, 2256, 35353, - 92296, 1396, 32, 2256, 
1813, 2212, 35359, 1713, 2775, 1672, - 5586, 1963, 2775, 3215, 21328, 1701, 35396, 2}; - - const int out_len = static_cast(out1_data.size()); - - allspark::AsTensor in0("input_ids", allspark::CPU, allspark::INT64, - allspark::DataMode::DENSE, - allspark::Shape({batch_size, seq_len})); - allspark::AsTensor in1("attention_mask", allspark::CPU, allspark::INT64, - allspark::DataMode::DENSE, - allspark::Shape({batch_size, seq_len})); - - in0.CopyDataFrom(in0_data.data(), batch_size * seq_len * sizeof(int64_t), - allspark::CPU); - in1.CopyDataFrom(in1_data.data(), batch_size * seq_len * sizeof(int64_t), - allspark::CPU); - const DLTensorMap inputs = { - {"input_ids", in0.ToDLPack(device_context.get())}, - {"attention_mask", in1.ToDLPack(device_context.get())}}; - - std::string graph_path = model_path + "/" + model_name + ".asgraph"; - std::string weight_path = model_path + "/" + model_name + ".asparam"; - - AsModelConfig as_model_config = AsModelConfig( - model_name, graph_path.c_str(), weight_path.c_str(), CUDA_DEVICE); - - as_model_config.engine_max_batch = max_batch_in_test; - as_model_config.engine_max_length = 1024; - as_model_config.cache_span_size = 32; - as_model_config.cache_mode = AsCacheMode::AsCacheQuantI8; - as_model_config.prefill_mode = AsMHAPrefill::AsPrefillDefault; - as_model_config.enable_prefix_cache = true; - - std::vector> gen_config_vec; - for (int i = 0; i < max_batch_in_test; ++i) { - auto cfg = std::make_unique(); - cfg->max_length = out_len; - cfg->early_stopping = false; - cfg->top_k = 1; - cfg->top_p = 0; - gen_config_vec.emplace_back(std::move(cfg)); - } - - allspark::AsEngine as_engine; - - auto file_version_info = - as_engine.GetFileInformation(graph_path.c_str(), weight_path.c_str()); - - ASSERT_EQ(file_version_info.create_version_graph, "2.0.0"); - ASSERT_EQ(file_version_info.create_version_param, "2.0.0"); - - ASSERT_EQ(as_engine.BuildModelFromConfigStruct(as_model_config), - allspark::AsStatus::ALLSPARK_SUCCESS); - 
ASSERT_EQ(as_engine.StartModel(model_name.c_str()), - allspark::AsStatus::ALLSPARK_SUCCESS); - - std::vector> reqs( - max_batch_in_test); - std::vector pending_handles(max_batch_in_test); - std::vector pending_queue(max_batch_in_test); - - for (int i = 0; i < max_batch_in_test; ++i) { - std::shared_ptr req = - std::make_shared(); - req->config = *(gen_config_vec[i]); - req->infer_type = AsEngine::RequestInferType::Generate; - req->inputs = std::make_shared(inputs); - req->mm_type = AsEngine::RequestMMType::TextInput; - reqs[i] = std::move(req); - } - - util::Timer timer; - std::vector> result(max_batch_in_test); - - for (int wave = 0; wave < num_waves; ++wave) { - LOG(INFO) << "=================================================" - << "Wave " << wave - << "=================================================" - << std::endl; - - auto time_start = timer.elapsed(); - - // request wave 1 - for (int i = 0; i < max_batch_in_test; ++i) { - LOG(INFO) << "Wave " << wave << " test.start request: " << i; - - result[i] = std::async( - std::launch::async, [&, i, model_name]() -> allspark::AsStatus { - return as_engine.StartRequest(model_name.c_str(), reqs[i], - &(pending_handles[i]), - &(pending_queue[i])); - }); - } - - for (int i = 0; i < max_batch_in_test; ++i) { - EXPECT_EQ(result[i].get(), allspark::AsStatus::ALLSPARK_SUCCESS); - LOG(INFO) << "Wave " << wave << " test.start, finish request: " << i; - } - - // sync all - ASSERT_EQ(as_engine.SyncRequest(model_name.c_str(), nullptr), - allspark::AsStatus::ALLSPARK_SUCCESS); - - auto time_end = timer.elapsed(); - auto duration = time_end - time_start; - - size_t sum = 0; - for (auto q_ptr : pending_queue) { - if (!q_ptr) continue; - auto ele = q_ptr->Get(); - if (!ele) continue; - - sum += ele->ids_from_generate.size(); - std::vector result = in0_data; - result.insert(result.end(), - std::move_iterator(ele->ids_from_generate.begin()), - std::move_iterator(ele->ids_from_generate.end())); - - int64_t eps1 = 
check_equal(result.data(), out1_data.data(), - batch_size * out_len); - EXPECT_LE(eps1, MODEL_EPS); - } - int total_count = sum; - LOG(INFO) << "Wave " << wave << " Total Tokens " << total_count - << " ms: " << duration - << " throughput: " << (total_count) / (duration / 1000.0f); - - for (auto& handle : pending_handles) { - ASSERT_EQ(as_engine.ReleaseRequest(model_name.c_str(), handle), - allspark::AsStatus::ALLSPARK_SUCCESS); - static int idx = 0; - LOG(INFO) << "Wave " << wave << " test.release: " << idx++; - } - } - - LOG(INFO) << "=================================================" - << "Stop Model" - << "=================================================" << std::endl; - - // this is required to release the model loop thread - ASSERT_EQ(as_engine.StopModel(model_name.c_str()), - allspark::AsStatus::ALLSPARK_SUCCESS); - ASSERT_EQ(in0.Free(), allspark::AsStatus::ALLSPARK_SUCCESS); - ASSERT_EQ(in1.Free(), allspark::AsStatus::ALLSPARK_SUCCESS); -} - -TEST_F(AsModelCUDA, CodeQwen15_7B_GQA_CacheU4) { - const std::string model_name = "CodeQwen15_7B_Chat_fp16_A16W8_perc"; - const std::string model_path = std::string(getenv("ALLSPARK_TESTCASE_PATH")) + - "testcase/" + (model_name + "/"); - - constexpr int num_waves = 1; - constexpr int max_batch_in_test = 2; - constexpr int batch_size = 1; - /* - def quick_sort(arr): - """Implement a simple quick sort.""" - if len(arr) <= 1: - return arr - pivot = arr[len(arr) // 2] - left = [x for x in arr if x < pivot] - middle = [x for x in arr if x == pivot] - return quick_sort(left) + middle + quick_sort(right) - */ - const std::vector in0_data = { - 92295, 3138, 3904, 35356, 17278, 35354, 5025, 3405, 1396, 32, - 30737, 5173, 4571, 1643, 3998, 3904, 4843, 30224, 1396, 32, - 1740, 9527, 35354, 5025, 35353, 14202, 35321, 35381, 35371, 1396, - 33, 3166, 5586, 1396, 32, 35336, 1808, 1701, 1813, 5586, - 35395, 6371, 35354, 5025, 35353, 2367, 35321, 35385, 35396, 92297, - 1396, 32, 2520, 1813, 2212, 35359, 1713, 2775, 1672, 5586, - 1963, 
2775, 2100, 21328, 1701, 35396, 1396, 32, 23996, 1813, - 2212, 35359, 1713, 2775, 1672, 5586, 1963, 2775, 2706, 21328, - 1701, 35396, 1396, 32, 3166, 3904, 35356, 17278, 35354, 2520, - 35353, 2279, 5803, 2279, 3904, 35356, 17278, 35354, 2256, 35353, - 92296}; - const int seq_len = static_cast(in0_data.size()); - - const std::vector in1_data(batch_size * seq_len, 1); - - /* - def quick_sort(arr): - """Implement a simple quick sort.""" - if len(arr) <= 1: - return arr - pivot = arr[len(arr) // 2] - left = [x for x in arr if x < pivot] - middle = [x for x in arr if x == pivot] - return quick_sort(left) + middle + quick_sort(right) - right = [x for x in arr if x > pivot]<|endoftext|> - */ - const std::vector out1_data = { - 92295, 3138, 3904, 35356, 17278, 35354, 5025, 3405, 1396, 32, - 30737, 5173, 4571, 1643, 3998, 3904, 4843, 30224, 1396, 32, - 1740, 9527, 35354, 5025, 35353, 14202, 35321, 35381, 35371, 1396, - 33, 3166, 5586, 1396, 32, 35336, 1808, 1701, 1813, 5586, - 35395, 6371, 35354, 5025, 35353, 2367, 35321, 35385, 35396, 92297, - 1396, 32, 2520, 1813, 2212, 35359, 1713, 2775, 1672, 5586, - 1963, 2775, 2100, 21328, 1701, 35396, 1396, 32, 23996, 1813, - 2212, 35359, 1713, 2775, 1672, 5586, 1963, 2775, 2706, 21328, - 1701, 35396, 1396, 32, 3166, 3904, 35356, 17278, 35354, 2520, - 35353, 2279, 5803, 2279, 3904, 35356, 17278, 35354, 2256, 35353, - 92296, 1396, 32, 2256, 1813, 2212, 35359, 1713, 2775, 1672, - 5586, 1963, 2775, 3215, 21328, 1701, 35396, 2}; - - const int out_len = static_cast(out1_data.size()); - - allspark::AsTensor in0("input_ids", allspark::CPU, allspark::INT64, - allspark::DataMode::DENSE, - allspark::Shape({batch_size, seq_len})); - allspark::AsTensor in1("attention_mask", allspark::CPU, allspark::INT64, - allspark::DataMode::DENSE, - allspark::Shape({batch_size, seq_len})); - - in0.CopyDataFrom(in0_data.data(), batch_size * seq_len * sizeof(int64_t), - allspark::CPU); - in1.CopyDataFrom(in1_data.data(), batch_size * seq_len * sizeof(int64_t), 
- allspark::CPU); - const DLTensorMap inputs = { - {"input_ids", in0.ToDLPack(device_context.get())}, - {"attention_mask", in1.ToDLPack(device_context.get())}}; - - std::string graph_path = model_path + "/" + model_name + ".asgraph"; - std::string weight_path = model_path + "/" + model_name + ".asparam"; - - AsModelConfig as_model_config = AsModelConfig( - model_name, graph_path.c_str(), weight_path.c_str(), CUDA_DEVICE); - - as_model_config.engine_max_batch = max_batch_in_test; - as_model_config.engine_max_length = 1024; - as_model_config.cache_span_size = 32; - as_model_config.cache_mode = AsCacheMode::AsCacheQuantU4; - as_model_config.prefill_mode = AsMHAPrefill::AsPrefillDefault; - as_model_config.enable_prefix_cache = true; - - std::vector> gen_config_vec; - for (int i = 0; i < max_batch_in_test; ++i) { - auto cfg = std::make_unique(); - cfg->max_length = out_len; - cfg->early_stopping = false; - cfg->top_k = 1; - cfg->top_p = 0; - gen_config_vec.emplace_back(std::move(cfg)); - } - - allspark::AsEngine as_engine; - - auto file_version_info = - as_engine.GetFileInformation(graph_path.c_str(), weight_path.c_str()); - - ASSERT_EQ(file_version_info.create_version_graph, "2.0.0"); - ASSERT_EQ(file_version_info.create_version_param, "2.0.0"); - - ASSERT_EQ(as_engine.BuildModelFromConfigStruct(as_model_config), - allspark::AsStatus::ALLSPARK_SUCCESS); - ASSERT_EQ(as_engine.StartModel(model_name.c_str()), - allspark::AsStatus::ALLSPARK_SUCCESS); - - std::vector> reqs( - max_batch_in_test); - std::vector pending_handles(max_batch_in_test); - std::vector pending_queue(max_batch_in_test); - - for (int i = 0; i < max_batch_in_test; ++i) { - std::shared_ptr req = - std::make_shared(); - req->config = *(gen_config_vec[i]); - req->infer_type = AsEngine::RequestInferType::Generate; - req->inputs = std::make_shared(inputs); - req->mm_type = AsEngine::RequestMMType::TextInput; - reqs[i] = std::move(req); - } - - util::Timer timer; - std::vector> result(max_batch_in_test); - - 
for (int wave = 0; wave < num_waves; ++wave) { - LOG(INFO) << "=================================================" - << "Wave " << wave - << "=================================================" - << std::endl; - - auto time_start = timer.elapsed(); - - // request wave 1 - for (int i = 0; i < max_batch_in_test; ++i) { - LOG(INFO) << "Wave " << wave << " test.start request: " << i; - - result[i] = std::async( - std::launch::async, [&, i, model_name]() -> allspark::AsStatus { - return as_engine.StartRequest(model_name.c_str(), reqs[i], - &(pending_handles[i]), - &(pending_queue[i])); - }); - } - - for (int i = 0; i < max_batch_in_test; ++i) { - EXPECT_EQ(result[i].get(), allspark::AsStatus::ALLSPARK_SUCCESS); - LOG(INFO) << "Wave " << wave << " test.start, finish request: " << i; - } - - // sync all - ASSERT_EQ(as_engine.SyncRequest(model_name.c_str(), nullptr), - allspark::AsStatus::ALLSPARK_SUCCESS); - - auto time_end = timer.elapsed(); - auto duration = time_end - time_start; - - size_t sum = 0; - for (auto q_ptr : pending_queue) { - if (!q_ptr) continue; - auto ele = q_ptr->Get(); - if (!ele) continue; - - sum += ele->ids_from_generate.size(); - std::vector result = in0_data; - result.insert(result.end(), - std::move_iterator(ele->ids_from_generate.begin()), - std::move_iterator(ele->ids_from_generate.end())); - - int64_t eps1 = check_equal(result.data(), out1_data.data(), - batch_size * out_len); - EXPECT_LE(eps1, MODEL_EPS); - } - int total_count = sum; - LOG(INFO) << "Wave " << wave << " Total Tokens " << total_count - << " ms: " << duration - << " throughput: " << (total_count) / (duration / 1000.0f); - - for (auto& handle : pending_handles) { - ASSERT_EQ(as_engine.ReleaseRequest(model_name.c_str(), handle), - allspark::AsStatus::ALLSPARK_SUCCESS); - static int idx = 0; - LOG(INFO) << "Wave " << wave << " test.release: " << idx++; - } - } - - LOG(INFO) << "=================================================" - << "Stop Model" - << 
"=================================================" << std::endl; - - // this is required to release the model loop thread - ASSERT_EQ(as_engine.StopModel(model_name.c_str()), - allspark::AsStatus::ALLSPARK_SUCCESS); - ASSERT_EQ(in0.Free(), allspark::AsStatus::ALLSPARK_SUCCESS); - ASSERT_EQ(in1.Free(), allspark::AsStatus::ALLSPARK_SUCCESS); -} - TEST_F(AsModelCUDA, LLAMA_7B_ContinuousBatch) { std::string model_name = "llama2_7b"; const std::string model_path = std::string(getenv("ALLSPARK_TESTCASE_PATH")) + @@ -1942,7 +1336,7 @@ TEST_F(AsModelCUDA, LLAMA_7B_ContinuousBatch) { as_model_config.engine_max_batch = max_batch_in_test; as_model_config.engine_max_length = 1024; - as_model_config.cache_span_size = 32; + // as_model_config.cache_span_size = 32; as_model_config.cache_mode = AsCacheMode::AsCacheDefault; as_model_config.prefill_mode = AsMHAPrefill::AsPrefillXformer; as_model_config.enable_prefix_cache = false; diff --git a/tests/cpp/model/stresstest/model_stress_test.cpp b/tests/cpp/model/stresstest/model_stress_test.cpp index 60cb2b37..8d182c8a 100644 --- a/tests/cpp/model/stresstest/model_stress_test.cpp +++ b/tests/cpp/model/stresstest/model_stress_test.cpp @@ -462,6 +462,7 @@ int main(int argc, char** argv) { std::string model_type = std::string("Qwen_v20"); AsMHAPrefill prefill_mode = AsMHAPrefill::AsPrefillDefault; AsCacheMode kv_cache_mode = AsCacheMode::AsCacheDefault; + int kv_cache_span_size = AsModelConfig::default_span_size; int enable_flash_attention = 1; int device_num = 8; float top_k = 0; @@ -473,7 +474,7 @@ int main(int argc, char** argv) { std::string device_type = "CUDA"; std::string matmul_precision = "highest"; while ((opt = getopt(argc, argv, - "ht:f:r:b:d:l:m:M:F:P:N:k:p:w:C:a:c:o:s:n:")) != -1) { + "ht:f:r:b:d:l:m:M:F:P:N:k:p:w:C:a:c:z:o:s:n:")) != -1) { switch (opt) { case 'h': std::cout << "\nDESCRIPTION:\n" @@ -497,6 +498,8 @@ int main(int argc, char** argv) { << "-------------------\n" << " -C <0/1> run model on cpu, default 0\n" << 
" -c set span kv cache mode\n" + << " -z set kv cache span size, valid value is 16, " + "32, 64, 128; default: 32\n" << " -M model type. e.g. [m6_7b, m6_14b, " " m6_50b, m6_72b, m6_200b], only support m6_7b and " " m6_200b for now, default is m6_200b \n" @@ -560,6 +563,9 @@ int main(int argc, char** argv) { case 'c': kv_cache_mode = GetCacheMode(std::string(optarg)); break; + case 'z': + kv_cache_span_size = atoi(optarg); + break; case 'a': if (atoi(optarg) == 1) matmul_precision = "high"; @@ -617,7 +623,7 @@ int main(int argc, char** argv) { .withPrefillMode(prefill_mode) .withCacheSpanNumGrow(0) .withCacheSpanNumInit(0) - .withCacheSpanSize(16) + .withCacheSpanSize(kv_cache_span_size) .withCacheMode(kv_cache_mode) .build(); diff --git a/tests/cpp/operator/cuda/operator_gemm_lowp_test.cpp b/tests/cpp/operator/cuda/operator_gemm_lowp_test.cpp index c4e98d70..64084a23 100644 --- a/tests/cpp/operator/cuda/operator_gemm_lowp_test.cpp +++ b/tests/cpp/operator/cuda/operator_gemm_lowp_test.cpp @@ -924,6 +924,7 @@ TEST(GEMM_LOWP, FP16W8_NEW) { const int N_Range[2] = {8, 8192}; const int K_Range[3] = {8, 2048, 8192}; const int GS_Range[4] = {64, 128, 256, 512}; + // Test Ampere+ Fused PerC GEMV kernel and SubC GEMV kernel if (sm_version >= 0x0800) { float ave_diff_perc = 0.0f, ave_diff_perc_bf = 0.0f, ave_diff_subc = 0.0f; diff --git a/tests/python/arm/test_01_m6_7b_master.py b/tests/python/arm/test_01_m6_7b_master.py deleted file mode 100644 index 5fa18eba..00000000 --- a/tests/python/arm/test_01_m6_7b_master.py +++ /dev/null @@ -1,109 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. 
- @file test_01_m6_7b_master.py -''' -import unittest -from dashinfer import allspark -from dashinfer.allspark.quantization import QuantizeConfig -import torch.utils.dlpack -import numpy as np -import os -import shutil -import subprocess - -CURRENT_PATH = os.path.split(__file__)[0:-1][0] - - -def process_7b_torch_model(pth_path): - model = torch.load(pth_path, map_location=lambda storage, loc: storage) - module = model['model']["language_model"]["encoder"] - module["word_embeddings"] = model['model']["language_model"]["embedding"][ - "word_embeddings"]["weight"] - module["lm_head.weight"] = model["model"]["language_model"][ - "output_layer"]['weight'] - for key in module.keys(): - module[key] = module[key].float() - torch_model = module - return torch_model - - -def build_a16w8_model(torch_model, - model_name, - model_type, - model_config, - device_type, - device_ids, - FLOAT_TYPE="float32"): - FLOAT_TYPE = FLOAT_TYPE.lower() - ACT_TYPE = "bfloat16" - INT_TYPE = "uint8" - - engine = allspark.Engine() - engine.set_device_type(device_type) - engine.set_device_ids(device_ids) - engine.build_model_from_torch( - model_name=model_name, - model_type=model_type, - torch_model=torch_model, - data_type=FLOAT_TYPE, - multigpu_mode=1, - model_config=model_config, - is_generate=True, - derive_type="lmhead", - do_dynamic_quantize_convert=True, # a16w8 - quant_config=QuantizeConfig(activation_type=ACT_TYPE, weight_type=INT_TYPE, extra_option={ - "SubChannel": True, - "GroupSize": 64 - }), - save_dir=os.path.join(CURRENT_PATH, model_name)) - return engine - - -class M6_7B_A16W8_ARM_TestCase(unittest.TestCase): - - def setUp(self): - self.models_root_path = os.environ.get("ALLSPARK_TESTCASE_PATH") - self.m6_7b_pt_model_path = os.path.join( - self.models_root_path, "testcase", - "m6_7b_a16w8/m6_7b_0802_chat.pt") - - # ModelType - self.model_type = "M6v3" - # Config - self.build_model_config = { - "layer_norm_eps": 1e-5, - "layernorm_epsilon": 1e-5, - "num_attention_heads": 32, - 
"num_hidden_layers": 32, - } - # TorchModel - self.torch_model = process_7b_torch_model(self.m6_7b_pt_model_path) - - def test_m6_7b_a16w8_arm(self): - model_name = "m6_7b_a16w8" - device_type = "CPU" - device_ids = [0] - - out_ids_ref = [[ - 101211, 9370, 65770, 105542, 101314, 11319, 151645, 198, 105678, - 9370, 65770, 36993, 20412, 104130, 1773, 151645, 198, 151643 - ]] - - engine = build_a16w8_model(self.torch_model, model_name, - self.model_type, self.build_model_config, - device_type, device_ids) - - return_code = subprocess.call( - "OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 ./mpirun --map-by numa:pe=64 -np 2 sh -c 'python3 test_01_m6_7b_worker.py'", - shell=True) - - out_sync = np.load(os.path.join(model_name, "out_sync.npy")).tolist() - - self.assertEqual(out_ids_ref, out_sync) - - # Clean - shutil.rmtree(os.path.join(CURRENT_PATH, model_name)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/python/arm/test_01_m6_7b_worker.py b/tests/python/arm/test_01_m6_7b_worker.py deleted file mode 100644 index 6d69e835..00000000 --- a/tests/python/arm/test_01_m6_7b_worker.py +++ /dev/null @@ -1,88 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. 
- @file test_01_m6_7b_worker.py -''' -from dashinfer import allspark -import torch.utils.dlpack -import numpy as np -import os - -CURRENT_PATH = os.path.split(__file__)[0:-1][0] - -MAX_LENGTH = 8192 -MAX_BATCHSIZE = 1 - - -def build_as_model_config(model_name, device_type, device_ids): - as_model_path = os.path.join(CURRENT_PATH, model_name) - as_model_config = allspark.AsModelConfig( - model_name=model_name, - model_path=os.path.join(as_model_path, model_name + ".asgraph"), - weights_path=os.path.join(as_model_path, model_name + ".asparam"), - device_type=device_type, - device_ids=device_ids, - engine_max_length=MAX_LENGTH, - engine_max_batch=MAX_BATCHSIZE, - ) - return as_model_config - - -def run_model_sync(engine, model_name, in_ids, generate_config): - in_mask = (np.array(in_ids) != 0).astype(np.int64) - torch_input = { - "input_ids": torch.Tensor(in_ids).to(torch.int64), - "attention_mask": torch.Tensor(in_mask).to(torch.int64), - } - - generate_config["async"] = False - - out_ids = engine.run_text_generation(model_name, { - "input_ids": - torch.utils.dlpack.to_dlpack(torch_input["input_ids"]), - "attention_mask": - torch.utils.dlpack.to_dlpack(torch_input["attention_mask"]), - }, - generate_config=generate_config) - - if "generated_ids" in out_ids: - out_ids = torch.utils.dlpack.from_dlpack(out_ids["generated_ids"]) - out_list = out_ids.cpu().numpy().tolist() - np.save(os.path.join(model_name, "out_sync.npy"), - out_ids.cpu().numpy()) - return out_list - - -def test_model(): - model_name = "m6_7b_a16w8" - device_type = "CPU" - device_ids = [0] - - generate_config = { - 'num_beams': 1, - 'num_return_sequences': 1, - 'temperature': 1.0, - 'do_sample': True, - 'early_stopping': True, - 'top_k': 1, - 'top_p': 0.5, - 'max_length': MAX_LENGTH, - # 'stop_words_ids': [[151643], [151644], [151645]], # qwen_15w - 'eos_token_id': 151643, - 'seed': 1234, - 'loop_context': True - } - - in_ids = [[101211, 9370, 65770, 105542, 101314, 11319]] - - as_model_config = 
build_as_model_config(model_name, device_type, - device_ids) - - engine = allspark.Engine() - engine.build_model_from_config_struct(as_model_config) - engine.set_matmul_precision("medium") # highest/high/medium - - out_sync = run_model_sync(engine, model_name, in_ids, generate_config) - - -if __name__ == "__main__": - test_model() diff --git a/tests/python/arm/test_02_m6_13b_master.py b/tests/python/arm/test_02_m6_13b_master.py deleted file mode 100644 index 368e0976..00000000 --- a/tests/python/arm/test_02_m6_13b_master.py +++ /dev/null @@ -1,107 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. - @file test_02_m6_13b_master.py -''' -import unittest -from dashinfer import allspark -from dashinfer.allspark.quantization import QuantizeConfig -import torch.utils.dlpack -import numpy as np -import os -import shutil -import subprocess - -CURRENT_PATH = os.path.split(__file__)[0:-1][0] - - -def process_13b_torch_model(pth_path): - model = torch.load(pth_path, map_location=lambda storage, loc: storage) - module = model["model"]["language_model"]["encoder"] - module["word_embeddings"] = model["model"]["language_model"]["embedding"][ - "word_embeddings"]["weight"] - for key in module.keys(): - module[key] = module[key].float() - torch_model = module - return torch_model - - -def build_a16w8_model(torch_model, - model_name, - model_type, - model_config, - device_type, - device_ids, - FLOAT_TYPE="float32"): - FLOAT_TYPE = FLOAT_TYPE.lower() - ACT_TYPE = "bfloat16" - INT_TYPE = "uint8" - - engine = allspark.Engine() - engine.set_device_type(device_type) - engine.set_device_ids(device_ids) - engine.build_model_from_torch( - model_name=model_name, - model_type=model_type, - torch_model=torch_model, - data_type=FLOAT_TYPE, - multigpu_mode=1, - model_config=model_config, - is_generate=True, - derive_type="lmhead", - do_dynamic_quantize_convert=True, # a16w8 - quant_config=QuantizeConfig(activation_type=ACT_TYPE, weight_type=INT_TYPE, extra_option={ - "SubChannel": 
True, - "GroupSize": 64 - }), - save_dir=os.path.join(CURRENT_PATH, model_name)) - return engine - - -class M6_13B_A16W8_ARM_TestCase(unittest.TestCase): - - def setUp(self): - self.models_root_path = os.environ.get("ALLSPARK_TESTCASE_PATH") - self.m6_13b_pt_model_path = os.path.join(self.models_root_path, - "testcase", - "m6_13b_8192/m6_13b_8192.pt") - - # ModelType - self.model_type = "M6v2SplitFFN" - # Config - self.build_model_config = { - "layer_norm_eps": 1e-5, - "layernorm_epsilon": 1e-5, - "num_attention_heads": 40, - "num_hidden_layers": 40, - } - # TorchModel - self.torch_model = process_13b_torch_model(self.m6_13b_pt_model_path) - - def test_m6_13b_a16w8_arm(self): - model_name = "m6_13b_a16w8" - device_type = "CPU" - device_ids = [0] - - out_ids_ref = [[ - 628, 20490, 25, 734, 5556, 1936, 4961, 628, 48902, 25, 4930, 5556, - 1936, 21767, 3598, 357, 22, 737, 628, 50256 - ]] - - engine = build_a16w8_model(self.torch_model, model_name, - self.model_type, self.build_model_config, - device_type, device_ids) - - return_code = subprocess.call( - "OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 ./mpirun --map-by numa:pe=64 -np 2 sh -c 'python3 test_02_m6_13b_worker.py'", - shell=True) - - out_sync = np.load(os.path.join(model_name, "out_sync.npy")).tolist() - - self.assertEqual(out_ids_ref, out_sync) - - # Clean - shutil.rmtree(os.path.join(CURRENT_PATH, model_name)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/python/arm/test_02_m6_13b_worker.py b/tests/python/arm/test_02_m6_13b_worker.py deleted file mode 100644 index 4f215091..00000000 --- a/tests/python/arm/test_02_m6_13b_worker.py +++ /dev/null @@ -1,87 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. 
- @file test_02_m6_13b_worker.py -''' -from dashinfer import allspark -import torch.utils.dlpack -import numpy as np -import os - -CURRENT_PATH = os.path.split(__file__)[0:-1][0] - -MAX_LENGTH = 8192 -MAX_BATCHSIZE = 1 - - -def build_as_model_config(model_name, device_type, device_ids): - as_model_path = os.path.join(CURRENT_PATH, model_name) - as_model_config = allspark.AsModelConfig( - model_name=model_name, - model_path=os.path.join(as_model_path, model_name + ".asgraph"), - weights_path=os.path.join(as_model_path, model_name + ".asparam"), - device_type=device_type, - device_ids=device_ids, - engine_max_length=MAX_LENGTH, - engine_max_batch=MAX_BATCHSIZE, - ) - return as_model_config - - -def run_model_sync(engine, model_name, in_ids, generate_config): - in_mask = (np.array(in_ids) != 0).astype(np.int64) - torch_input = { - "input_ids": torch.Tensor(in_ids).to(torch.int64), - "attention_mask": torch.Tensor(in_mask).to(torch.int64), - } - - generate_config["async"] = False - - out_ids = engine.run_text_generation(model_name, { - "input_ids": - torch.utils.dlpack.to_dlpack(torch_input["input_ids"]), - "attention_mask": - torch.utils.dlpack.to_dlpack(torch_input["attention_mask"]), - }, - generate_config=generate_config) - - if "generated_ids" in out_ids: - out_ids = torch.utils.dlpack.from_dlpack(out_ids["generated_ids"]) - out_list = out_ids.cpu().numpy().tolist() - np.save(os.path.join(model_name, "out_sync.npy"), - out_ids.cpu().numpy()) - return out_list - - -def test_model(): - model_name = "m6_13b_a16w8" - device_type = "CPU" - device_ids = [0] - - generate_config = { - 'num_beams': 1, - 'num_return_sequences': 1, - 'temperature': 1.0, - 'do_sample': True, - 'early_stopping': True, - 'top_k': 1, - 'top_p': 0.5, - 'max_length': MAX_LENGTH, - 'eos_token_id': 50256, - 'seed': 42, - 'loop_context': True - } - - in_ids = [[628, 20490, 25, 734, 5556, 1936, 4961, 628, 48902, 25]] - - as_model_config = build_as_model_config(model_name, device_type, - device_ids) - 
- engine = allspark.Engine() - engine.build_model_from_config_struct(as_model_config) - engine.set_matmul_precision("medium") # highest/high/medium - - out_sync = run_model_sync(engine, model_name, in_ids, generate_config) - - -if __name__ == "__main__": - test_model() diff --git a/tests/python/gpu/test_04_model_serializer.py b/tests/python/gpu/test_04_model_serializer.py deleted file mode 100644 index 47cc6fa6..00000000 --- a/tests/python/gpu/test_04_model_serializer.py +++ /dev/null @@ -1,238 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. - @file test_04_model_serializer.py -''' -import os -import gc -import unittest -import tempfile - -from dashinfer import allspark -from test_utils import LevenshteinCompare, CosineCompare, JaccardCompare, GenerateResultCompare - -from test_util_infer import func_test_model_with_reference, download_folder_from_oss - - -code_qwen_sql= """ -"你是一名PostgreSQL专家,现在需要阅读并理解下面的【数据库schema】描述,以及可能用到的【参考信息】,并运用PostgreSQL知识生成sql语句回答【用户问题】,用Markdown代码段形式输出代码,不要做额外解释。\n【用户问题】\nWhich conference published the most publications in the last 15 years? 
Give the conference name and publication count.\n\n【数据库schema】\n【DB_ID】 sql_eval_academic\n【Schema】\n# Table: author, \n[\n (aid:bigint),\n (homepage:text, Examples: [http://www.larry.com, http://www.ashish.com, http://www.noam.com]),\n (name:text, Examples: [Kempinski, Martin Odersky, Ashish Vaswani]),\n (oid:bigint)\n]\n# Table: cite, \n[\n (cited:bigint),\n (citing:bigint)\n]\n# Table: conference, \n[\n (cid:bigint),\n (homepage:text, Examples: [http://www.icml.com, http://www.aaas.com, http://www.isa.com]),\n (name:text, Examples: [ISA, AAAS, ICML])\n]\n# Table: domain, \n[\n (did:bigint),\n (name:text, Examples: [Natural Sciences, Computer Science, Sociology])\n]\n# Table: domain_author, \n[\n (aid:bigint),\n (did:bigint)\n]\n# Table: domain_conference, \n[\n (cid:bigint),\n (did:bigint)\n]\n# Table: domain_journal, \n[\n (did:bigint),\n (jid:bigint)\n]\n# Table: domain_keyword, \n[\n (did:bigint),\n (kid:bigint)\n]\n# Table: domain_publication, \n[\n (did:bigint),\n (pid:bigint)\n]\n# Table: journal, \n[\n (homepage:text, Examples: [http://www.ml.com, http://www.aijournal.com, http://www.science.com]),\n (jid:bigint),\n (name:text, Examples: [Journal of Artificial Intelligence Research])\n]\n# Table: keyword, \n[\n (keyword:text, Examples: [Neuroscience, Machine Learning, AI]),\n (kid:bigint)\n]\n# Table: organization, \n[\n (continent:text, Examples: [Asia, North America, Europe]),\n (homepage:text, Examples: [http://www.epfl.com]),\n (name:text, Examples: [Organization 2, Organization 1, Organization 5]),\n (oid:bigint)\n]\n# Table: publication, \n[\n (abstract:text, Examples: [Abstract 3, Abstract 4, Abstract 1]),\n (cid:bigint),\n (citation_num:bigint),\n (jid:bigint),\n (pid:bigint),\n (reference_num:bigint),\n (title:text, Examples: [Attention is all you need]),\n (year:bigint)\n]\n# Table: publication_keyword, \n[\n (pid:bigint),\n (kid:bigint)\n]\n# Table: writes, \n[\n (aid:bigint),\n (pid:bigint)\n]\n\n【参考信息】\n\n\n【用户问题】\nWhich conference published 
the most publications in the last 15 years? Give the conference name and publication count.\n\n" -""" -code_qwen_sql_answer="""```sql\nSELECT T3.name, COUNT(*) AS publication_count\nFROM conference AS T1\nJOIN publication AS T2 ON T1.cid = T2.cid\nWHERE T2.year BETWEEN YEAR(CURRENT_DATE) - 15 AND YEAR(CURRENT_DATE)\nGROUP BY T3.name\nORDER BY publication_count DESC\nLIMIT 1;```""" - -similarity_test_cases = { - "qwen/Qwen2-72B-Instruct-GPTQ-Int8-Sparse": - {"model_name": "qwen/Qwen2-72B-Instruct-GPTQ-Int8-Sparse", "input": ["你是谁?"], - "reference": ["我是来自阿里云的大规模语言模型,我叫通义千问。我能够回答各种问题、提供信息和与用户进行对话交流。有什么我可以帮助你的吗?<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.8 - }, - "qwen/Qwen2-72B-Instruct-GPTQ-Int8": - {"model_name": "qwen/Qwen2-72B-Instruct-GPTQ-Int8", "input": ["你是谁?"], - "reference": ["我是阿里云开发的一款超大规模语言模型,我叫通义千问。<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.8 - }, - "qwen/CodeQwen1.5-7B-Chat": - {"model_name": "qwen/CodeQwen1.5-7B-Chat", "input": [code_qwen_sql], - "reference": [code_qwen_sql_answer], - "lang": "en", - "compare": LevenshteinCompare(), "threshold": 0.5, - "generation_params": {"top_k": 50, "top_p": 0.8, "repetition_penalty": 1.1, "temperature": 1.0, "seed": 1234} - }, - "qwen/Qwen2-7B-Instruct-GPTQ-Int8": - {"model_name": "qwen/Qwen2-7B-Instruct-GPTQ-Int8", "input": ["你是谁?"], - "reference": ["我是阿里云开发的一款超大规模语言模型,我叫通义千问。<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.5 - }, - "qwen/Qwen2-7B-Instruct": - {"model_name": "qwen/Qwen2-7B-Instruct", "input": ["静夜思这首诗是谁写的?只回答作者名字。"], - "reference": ["李白<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.8 - }, - "qwen/Qwen-14B-Chat": - {"model_name": "qwen/Qwen-14B-Chat", "input": ["你是谁?"], - "reference": ["我是来自阿里云的大规模语言模型,我叫通义千问。<|im_end|>\n<|endoftext|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.8 - }, - "qwen/Qwen1.5-14B-Chat": - {"model_name": 
"qwen/Qwen1.5-14B-Chat", "input": ["静夜思这首诗是谁写的?只回答名字。"], - "reference": ["李白<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.5 - }, - "qwen/Qwen-7B-Chat": - {"model_name": "qwen/Qwen-7B-Chat", "input": ["帮我用华丽的词藻润色夸奖文案,一定要非常浮夸,天花乱坠一点,可以简短一些。下面是我的原始文本:没有你的参与项目就失败了"], - "reference": [ - "你犹如璀璨星辰,照亮了我们的项目之路;你的存在,如同瑰宝般珍贵,让我们的项目熠熠生辉。没有你的参与,我们的项目就如同失去灵魂的躯壳,注定走向失败。你的贡献,是我们成功的关键,你的智慧和才华,是我们前进的动力。感谢你的付出,让我们能够在这个项目中取得如此辉煌的成就。<|im_end|>"], - "lang": "zh", - "compare": CosineCompare(), "threshold": 0.5 - }, - - "qwen/Qwen1.5-7B-Chat-GPTQ-Int4": - {"model_name": "qwen/Qwen1.5-7B-Chat-GPTQ-Int4", "input": ["你是谁?"], - "reference": ["我是来自阿里云的大规模语言模型,我叫通义千问。<|im_end|>\n<|endoftext|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.8 - }, - "qwen/Qwen1.5-4B-Chat": - {"model_name": "qwen/Qwen1.5-4B-Chat", "input": ["你是谁?"], - "reference": [ - "我是来自阿里云的大规模语言模型,我叫通义千问。<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.8 - }, - "qwen/Qwen1.5-14B-Chat-GPTQ-Int4": - {"model_name": "qwen/Qwen1.5-14B-Chat-GPTQ-Int4", "input": ["你是谁?"], - "reference": [ - "我是通义千问,由阿里云开发的AI助手。我被设计用来回答各种问题、提供信息和进行对话。有什么我可以帮助你的吗?<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.8 - }, - "qwen/Qwen1.5-4B-Chat-GPTQ-Int8": - {"model_name": "qwen/Qwen1.5-4B-Chat-GPTQ-Int8", "input": ["你是谁?"], - "reference": [ - "我是来自阿里云的大规模语言模型,我叫通义千问。<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.8 - }, - - "qwen/Qwen2-7B-Chat": - {"model_name": "qwen/Qwen2-7B-Chat", "input": ["你是谁?"], - "reference": [ - "我是通义千问,由阿里云开发的人工智能助手。我可以回答各种问题、提供信息和与用户进行对话等。如果您有任何问题或需要帮助,请随时告诉我,我会尽力为您提供支持。<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.8 - }, - - "qwen/Qwen-7B-Chat-Int8": - {"model_name": "qwen/Qwen-7B-Chat-Int8", "input": ["你是谁?"], - "reference": [ - "我是通义千问,由阿里云开发的人工智能助手。我可以回答各种问题、提供信息和与用户进行对话。有什么我可以帮助你的吗?<|im_end|>\n<|endoftext|>"], - "lang": "zh", - "compare": 
JaccardCompare(), "threshold": 0.3, - "generation_params": {"top_k": 20, "top_p": 0.8, "repetition_penalty": 1.05, "temperature": 0.7} - }, - "LLM-Research/Meta-Llama-3-8B": - {"model_name": "LLM-Research/Meta-Llama-3-8B", "input": ["你是谁?"], - "reference": [ - "我是LLama3-Chinese,一个由ShareAI训练的大型语言模型。我的目的是协助您完成各种任务。您需要我帮您做什么?<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.2 # FIXME: llama3 template needs update. - }, - "ZhipuAI/chatglm3-6b": - {"model_name": "ZhipuAI/chatglm3-6b", "input": ["你是谁?"], - "reference": ["我是你的助手,有什么我可以帮助你的吗?<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.2 # FIXME: chatglm template needs update. - }, - "qwen/Qwen2-72B-A8W8-PerChannel": - {"model_name": "qwen/Qwen2-72B-A8W8-PerChannel", "input": ["你是谁?"], - "reference": ["我是阿里云开发的一款超大规模语言模型,我叫通义千问。<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.8, - }, -} - -class ModelSimilarityTest(unittest.TestCase): - def setUp(self): - self.similarity_test_cases = similarity_test_cases - self.engine = allspark.Engine() - # 创建模型实例 - - def tearDown(self): - self.engine = None - gc.collect() - - def func_test_model_with_reference(self, test_dict, init_quant=False, test=None, weight_only_quant=True) -> float: - # self.engine = None - # let engine destroy, free all resources. - # gc.collect() - # full gc, make engine destroy called. 
- return func_test_model_with_reference(test_dict, self.engine, init_quant, test, - weight_only_quant=weight_only_quant) - - def test_inference_qwen1_models_fp16(self): - func_test_model_with_reference(self.similarity_test_cases["qwen/Qwen-7B-Chat"], test=self) - - def test_inference_qwen2_with_fp16(self): - func_test_model_with_reference(self.similarity_test_cases["qwen/Qwen1.5-4B-Chat"], test=self) - - def disabled_test_inference_qwen1_5_with_gptq_int8_weight_only(self): - func_test_model_with_reference(self.similarity_test_cases["qwen/Qwen1.5-4B-Chat-GPTQ-Int8"], - init_quant=True, weight_only_quant=True, test=self) - - # this case, the default model enable exllama, but failed to install depends: - # (21:29:22) ValueError: Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU.You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object - # need modify the model's config to disable exllama to load: config.json: "disable_exllama": true, - def disabled_test_inference_qwen2_with_gptq_int4(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen1.5-14B-Chat-GPTQ-Int4"], init_quant=True, - weight_only_quant=True, test=self) - def disable_test_inference_qwen1_5_14b_fp16(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen1.5-14B-Chat"], init_quant=False, - weight_only_quant=False, test=self) - def disable_test_inference_qwen1_models_int8_weight_only(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen-7B-Chat-Int8"], init_quant=True, - weight_only_quant=True, test=self) - def disable_test_inference_qwen1_models_int8(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen-7B-Chat-Int8"], init_quant=True, test=self) - - def test_inference_qwen2_models_int8(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen2-7B-Instruct-GPTQ-Int8"], init_quant=True, - weight_only_quant=False, test=self) - - 
def test_inference_qwen2_models_int8_sparse(self): - import torch - if not (8 <= torch.cuda.get_device_capability()[0] < 9): - return - folder_key = 'xchen/70b-wanda_mlp_0618_seqlen-32768-sft2000_dpo2800_megatron2hf-smoothquant-ns2560-sl2048-int8-perchannel-base/' - target_path = "/root/.cache/modelscope/hub/qwen/Qwen2-72B-Instruct-GPTQ-Int8-Sparse" - if not os.path.exists(target_path): - os.mkdir(target_path) - download_folder_from_oss(folder_key, target_path, max_workers=162) - - enable_sparsity_matmul = True - tp = 2 - device_list = [i for i in range(tp)] - func_test_model_with_reference(similarity_test_cases["qwen/Qwen2-72B-Instruct-GPTQ-Int8-Sparse"], init_quant=True, - ms_download=False, model_local_path=target_path, direct_load=True, load_format="auto", weight_only_quant=False, test=self, user_set_data_type="float16", device_list=device_list, enable_sparsity_matmul=enable_sparsity_matmul) - - def test_inference_qwen2_models_int8_weight_only(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen2-7B-Instruct-GPTQ-Int8"], init_quant=True, - weight_only_quant=True, test=self) - - def test_inference_qwen2_models_fp16(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen2-7B-Instruct"], init_quant=False, test=self) - def test_inference_qwen2_models_fp16_cache_off(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen2-7B-Instruct"], init_quant=False, test=self, user_runtime_config_dict={"enable_prefix_cache" : 0}) - - def test_inference_qwen2_models_fp16_in_memory(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen2-7B-Instruct"], init_quant=False, test=self, - in_memory=True) - - def test_inference_qwen2_models_fp16_dynamic_iq_in_memory(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen2-7B-Instruct"], init_quant=False, test=self, - weight_only_quant=True, quant_config={}) - def test_inference_codeqwen_models_fp16(self): - 
func_test_model_with_reference(similarity_test_cases["qwen/CodeQwen1.5-7B-Chat"], init_quant=False, test=self, in_memory=True, device_list=[0]) - - - def disable_test_inference_qwen2_72b_models_int8_no_pack(self): - model_path = "path/to/the/model/provided/by/yaoyang/in/nas" - func_test_model_with_reference(similarity_test_cases["qwen/Qwen2-72B-A8W8-PerChannel"], init_quant=True, ms_download=False, - model_local_path=model_path, direct_load=True, load_format="auto", user_set_data_type="float16") - def disable_test_inference_qwen2_72b_models_int8(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen2-72B-Instruct-GPTQ-Int8"], init_quant=True, - weight_only_quant=False, device_list=[0, 1], test=self) - - def disable_test_inference_llama3_with_fp16(self): - func_test_model_with_reference(self.similarity_test_cases["LLM-Research/Meta-Llama-3-8B"], test=self) - - def disable_test_inference_chatglm3_with_fp16(self): - func_test_model_with_reference(self.similarity_test_cases["ZhipuAI/chatglm3-6b"], test=self) - - -if __name__ == '__main__': - unittest.main() - diff --git a/tests/python/gpu/test_08_long_text.py b/tests/python/gpu/test_08_long_text.py deleted file mode 100644 index 4e781ba6..00000000 --- a/tests/python/gpu/test_08_long_text.py +++ /dev/null @@ -1,62 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. 
- @file test_08_long_text.py -''' -import os -import gc -import time -import unittest - -import modelscope - -from dashinfer import allspark -from dashinfer.allspark.engine import TargetDevice -from dashinfer.allspark.engine import RoPEScaleMethod -from dashinfer.allspark.prompt_utils import PromptTemplate -from test_utils import LevenshteinCompare, GenerateResultCompare -from dashinfer.allspark._allspark import AsStatus, GenerateRequestStatus -from test_util_infer import func_test_model_with_reference - - -input_text = """黄巾起义 群雄逐鹿 东汉末年,皇帝昏聩,宦官专权,民不聊生。爆发了张角领导的大规模农民起义——黄巾起义。乱世之中,一代英雄人物竞相涌现。 大将军何进在与十常侍的斗争中被杀,袁绍、曹操等大臣以平“十常侍之乱”为名冲入皇宫,诛杀宦官。在保护汉少帝的过程中,西凉刺史董卓引兵入关,随即掌控大权,废汉少帝,另立陈留王刘协为汉献帝。生性残暴的董卓倒行逆施,引发多方愤然。曹操假借圣旨之名,召集十八路诸侯联合讨伐董卓。虎牢关前,志在匡扶汉室的刘备、关羽、张飞三兄弟大战吕布。讨董盟军声威浩大,迫使董卓挟献帝逃至长安,最终被司徒王允设连环计除掉。 然而十八路诸侯各怀异心,联盟开始分崩离析。袁绍欲谋取长沙太守孙坚手中的传国玉玺,联合刘表将孙坚杀死。同时,袁绍又在界桥之战中击败公孙瓒,成为北方最强势力。此时的曹操也广招贤才,积聚实力。群雄逐鹿的雏形初成。 衣带秘诏 官渡之战 董卓死后,曹操“挟天子以令诸侯”,迎汉献帝于许昌建都,并运用军事和政治手段除去了袁术、张绣、吕布等人,展现了非凡的治国才能。汉献帝不甘于被曹操胁迫,联合董承、刘备等大臣,秘密颁布“衣带诏”意欲除掉曹操,却因事机不密反遭其害,致使董承等人被杀。刘备侥幸逃脱,前往河北依附袁绍,与关羽张飞兄弟分离。 在江东,孙坚之子孙策多年苦心经营,终于称霸江东六郡八十一州,人称“小霸王”。孙策亡故后,其弟孙权继业。孙权在周瑜等人扶持下,为吴国的建立积聚了强大的实力。 关羽身在曹营却心念故主,斩颜良诛文丑,千里走单骑,最终与刘备张飞在古城相会。 袁绍以“衣带诏”为名,率领大军讨伐曹操,双方于官渡展开艰苦鏖战。袁军虽然势大,但内部勾心斗角,于官渡之战被曹操击败。袁绍气病交加身亡,袁家兄弟又相互猜忌,最终被曹操设计除掉。曹操继而征伐乌桓一统北方,为此后魏国的建立奠定了坚实的基础。 荆襄风云 火烧赤壁 刘备在汝南战败,投奔荆州刘表。刘备三顾茅庐,请得足智多谋又心怀天下的诸葛亮辅佐。曹操统一北方后开始举兵南征,矛头直指荆州。此时刘表亡故,荆州落入曹操手中。刘备遭曹操沿途追杀,命悬一线,幸而有诸葛亮、赵云等人死命保全才得以存身。 面对曹操南征之势,刘备遣诸葛亮往江东与孙权结盟。诸葛亮舌战群儒、智激周瑜,最终促成孙刘联军。在诸葛亮、周瑜、庞统、黄盖等文臣武将的合作下,通过反间计、连环计、苦肉计等一系列有步骤、有计划的行动,孙刘联军在赤壁一战以弱胜强大破曹军,谱写了中国古代战争史上以少胜多的光辉篇章。 赤壁大战过后,刘备孙权转而争夺荆州。孙权遣鲁肃向刘备讨还荆州,刘备则在诸葛亮的劝谕下多次推辞。周瑜献计,骗刘备前往东吴成亲,进而将其扣留以换荆州。不料周瑜的计谋被诸葛亮屡屡识破,致使其“赔了夫人又折兵”。周瑜最终在诸葛亮的讥讽中呕血而亡,留下了“既生瑜,何生亮”的长叹。历经了半生的屈辱磨难之后,刘备真正拥有了自己的领地,为进军西川打下基础。 曹丕篡汉 三国鼎立 赤壁之后,曹操与孙权爆发濡须之战,杀得难分胜负。西凉马超起兵报仇被曹操平定,曹操进爵魏公、魏王。拥有荆州的刘备则历经一番争斗打败刘璋,以“凤雏”庞统之死为代价夺占西川,并趁曹军立足未稳夺得汉中,自封汉中王。后东吴与曹魏修好,孙权受封南昌侯。 坐镇荆州的关羽率军攻打曹魏,于罾口川水淹七军,威震天下。此时荆州防务空虚,被东吴都督吕蒙以白衣渡江之计袭击。进退失据的关羽败走麦城,兵败身亡。此时,曹操去世,其子曹丕继承魏王的爵位,进而逼迫汉献帝退位,自称魏帝。自此,大汉王朝不复存在。 
曹丕篡汉后,刘备以“恢复汉室”为名在益州称帝,建立蜀汉政权;孙权则坐镇江东一方。至此,天下大势底定,三国鼎立局面形成。 称帝后的刘备,不顾劝阻东征吴国。出兵前夕,张飞亦死于非命。刘备痛心疾首,亲自领兵挥师东进,一路所向披靡。情急之中的孙权用人不疑,拜书生陆逊为大都督,终于在夷陵之战中火烧连营,击溃蜀军。刘备率败军撤至白帝城时病倒,并在临终前向诸葛亮托孤。 七擒孟获 六出祁山 蜀国元气大伤之时,曹丕联合东吴与蜀汉降将孟达、南蛮孟获等势力五路发兵进攻蜀国。诸葛亮临危不惧,派出马超、赵云等猛将把守关口,又派出李严、邓芝等人说服孟达和东吴,安居平五路。 为完成刘备匡扶汉室的遗愿,诸葛亮决计征伐曹魏。为保后方太平,诸葛亮亲领大军远征云南,以七擒七纵的大仁大智,平定南蛮孟获之乱。 后曹丕病逝,其子曹睿即位,司马懿从大将军曹爽手中夺得兵权。此后其子司马师、司马昭兄弟把持魏国大权,并另立曹髦为帝,司马家族自此权势滔天。司马昭进而公然弑君,杀死魏帝曹髦。司马昭之子司马炎篡位,改国号为晋,魏国灭亡。吴主孙皓最终也投降西晋,百余年的战乱从此平息,天下一统。为了突出人物形象和性格,作者采用了一些并不可靠的民间传闻野史加以编排发挥,如诸葛亮七擒孟获和空城计,在陈寿《三国志》中并无记载。但在《三国演义》中,这些故事情节的编排,对塑造诸葛亮的形象起到了画龙点睛的作用。有些情节在正史上虽有记载,但小说做了补充和合理想象。如刘备三顾茅庐是《三国演义》中十分重要的一个情节,为诸葛亮出场做了很好的烘托和铺垫。而在《三国志》中,仅有“由是先主遂诣亮,凡三往,乃见”一句。三是史籍中虽有其事,但为了艺术表现的需要,作者“狸猫换太子”,搬家移位。如张飞怒打势利、跋扁的督邮一节,极好地展示了张飞疾恶如仇的火爆性格,然而在《三国志》中,鞭打督邮的不是张飞,而是小说中谦恭忍让的刘备。 中国古代小说塑造人物形象的一般规律,正面人物往往得到美化,负面人物则每每遭到丑化,例如关羽,“身长九尺,髯长二尺;面如重枣,唇若涂脂;丹凤眼,卧蚕眉;相貌堂堂,威风凛凛”。其实,史书并没有关羽相貌的记载,其相貌是后人基于对关羽的高度认同而虚构的;负面人物往往形象举止猥琐、贼眉鼠眼、尖嘴猴腮,例如张松,《三国志》中只记载其“为人短小”,由于其卖主求荣,《三国演义》进行了进一步的丑化,“其人生得额钁头尖,鼻偃齿露,身短不满五尺”。以致曹操初见张松,因相貌的原因,甚为不悦。正面人物在《三国演义》中,可能只有庞统一个特例,魏晋时期的典籍均无关于他相貌的描写,甚至宋元“说话”以及元杂剧“三国戏”中,亦没有关于庞统相貌的描写。《三国演义》中将庞统的相貌描写得相当丑陋,成为他的一个突出的缺点,并一度严重影响到了他的仕途。帮我总结以上内容,不超过100字""" -output_text = """《三国演义》讲述了东汉末年至三国时期的历史故事,主要围绕黄巾起义、群雄逐鹿、官渡之战、赤壁大战、三国鼎立等关键事件展开。故事中,董卓倒行逆施引发诸侯联合讨伐,曹操崛起,刘备、关羽、张飞三兄弟大战吕布,最终曹操除掉董卓,统一北方。随后,刘备、孙权、曹操三分天下,形成三国鼎立的局面。刘备称帝建立蜀汉,孙权坐镇江的政治斗争和英雄豪杰的风采。<|im_end|>""" - -# input_text = """黄巾起义 群雄逐鹿 东汉末年,皇帝昏聩,宦官专权,民不聊生。爆发了张角领导的大规模农民起义——黄巾起义。乱世之中,一代英雄人物竞相涌现。 大将军何进在与十常侍的斗争中被杀,袁绍、曹操等大臣以平“十常侍之乱”为名冲入皇宫,诛杀宦官。在保护汉少帝的过程中,西凉刺史董卓引兵入关,随即掌控大权,废汉少帝,另立陈留王刘协为汉献帝。生性残暴的董卓倒行逆施,引发多方愤然。曹操假借圣旨之名,召集十八路诸侯联合讨伐董卓。虎牢关前,志在匡扶汉室的刘备、关羽、张飞三兄弟大战吕布。讨董盟军声威浩大,迫使董卓挟献帝逃至长安,最终被司徒王允设连环计除掉。 然而十八路诸侯各怀异心,联盟开始分崩离析。袁绍欲谋取长沙太守孙坚手中的传国玉玺,联合刘表将孙坚杀死。同时,袁绍又在界桥之战中击败公孙瓒,成为北方最强势力。此时的曹操也广招贤才,积聚实力。群雄逐鹿的雏形初成。 衣带秘诏 官渡之战 董卓死后,曹操“挟天子以令诸侯”,迎汉献帝于许昌建都,并运用军事和政治手段除去了袁术、张绣、吕布等人,展现了非凡的治国才能。汉献帝不甘于被曹操胁迫,联合董承、刘备等大臣,秘密颁布“衣带诏”意欲除掉曹操,却因事机不密反遭其害,致使董承等人被杀。刘备侥幸逃脱,前往河北依附袁绍,与关羽张飞兄弟分离。 在江东,孙坚之子孙策多年苦心经营,终于称霸江东六郡八十一州,人称“小霸王”。孙策亡故后,其弟孙权继业。孙权在周瑜等人扶持下,为吴国的建立积聚了强大的实力。 
关羽身在曹营却心念故主,斩颜良诛文丑,千里走单骑,最终与刘备张飞在古城相会。 袁绍以“衣带诏”为名,率领大军讨伐曹操,双方于官渡展开艰苦鏖战。袁军虽然势大,但内部勾心斗角,于官渡之战被曹操击败。袁绍气病交加身亡,袁家兄弟又相互猜忌,最终被曹操设计除掉。曹操继而征伐乌桓一统北方,为此后魏国的建立奠定了坚实的基础。 荆襄风云 火烧赤壁 刘备在汝南战败,投奔荆州刘表。刘备三顾茅庐,请得足智多谋又心怀天下的诸葛亮辅佐。曹操统一北方后开始举兵南征,矛头直指荆州。此时刘表亡故,荆州落入曹操手中。刘备遭曹操沿途追杀,命悬一线,幸而有诸葛亮、赵云等人死命保全才得以存身。 面对曹操南征之势,刘备遣诸葛亮往江东与孙权结盟。诸葛亮舌战群儒、智激周瑜,最终促成孙刘联军。在诸葛亮、周瑜、庞统、黄盖等文臣武将的合作下,通过反间计、连环计、苦肉计等一系列有步骤、有计划的行动,孙刘联军在赤壁一战以弱胜强大破曹军,谱写了中国古代战争史上以少胜多的光辉篇章。 赤壁大战过后,刘备孙权转而争夺荆州。孙权遣鲁肃向刘备讨还荆州,刘备则在诸葛亮的劝谕下多次推辞。周瑜献计,骗刘备前往东吴成亲,进而将其扣留以换荆州。不料周瑜的计谋被诸葛亮屡屡识破,致使其“赔了夫人又折兵”。周瑜最终在诸葛亮的讥讽中呕血而亡,留下了“既生瑜,何生亮”的长叹。历经了半生的屈辱磨难之后,刘备真正拥有了自己的领地,为进军西川打下基础。 曹丕篡汉 三国鼎立 赤壁之后,曹操与孙权爆发濡须之战,杀得难分胜负。西凉马超起兵报仇被曹操平定,曹操进爵魏公、魏王。拥有荆州的刘备则历经一番争斗打败刘璋,以“凤雏”庞统之死为代价夺占西川,并趁曹军立足未稳夺得汉中,自封汉中王。后东吴与曹魏修好,孙权受封南昌侯。 坐镇荆州的关羽率军攻打曹魏,于罾口川水淹七军,威震天下。此时荆州防务空虚,被东吴都督吕蒙以白衣渡江之计袭击。进退失据的关羽败走麦城,兵败身亡。此时,曹操去世,其子曹丕继承魏王的爵位,进而逼迫汉献帝退位,自称魏帝。自此,大汉王朝不复存在。 曹丕篡汉后,刘备以“恢复汉室”为名在益州称帝,建立蜀汉政权;孙权则坐镇江东一方。至此,天下大势底定,三国鼎立局面形成。 称帝后的刘备,不顾劝阻东征吴国。出兵前夕,张飞亦死于非命。刘备痛心疾首,亲自领兵挥师东进,一路所向披靡。情急之中的孙权用人不疑,拜书生陆逊为大都督,终于在夷陵之战中火烧连营,击溃蜀军。刘备率败军撤至白帝城时病倒,并在临终前向诸葛亮托孤。 七擒孟获 六出祁山 蜀国元气大伤之时,曹丕联合东吴与蜀汉降将孟达、南蛮孟获等势力五路发兵进攻蜀国。诸葛亮临危不惧,派出马超、赵云等猛将把守关口,又派出李严、邓芝等人说服孟达和东吴,安居平五路。 为完成刘备匡扶汉室的遗愿,诸葛亮决计征伐曹魏。为保后方太平,诸葛亮亲领大军远征云南,以七擒七纵的大仁大智,平定南蛮孟获之乱。 后曹丕病逝,其子曹睿即位,司马懿从大将军曹爽手中夺得兵权。此后其子司马师、司马昭兄弟把持魏国大权,并另立曹髦为帝,司马家族自此权势滔天。司马昭进而公然弑君,杀死魏帝曹髦。司马昭之子司马炎篡位,改国号为晋,魏国灭亡。吴主孙皓最终也投降西晋,百余年的战乱从此平息,天下一统。""" -# output_text = """三国时期是中国历史上的一个动荡时期,也是英雄辈出的时代。这一时期,中国经历了从汉朝衰落到三国鼎立再到西晋统一的历史变迁。 - -# 三国时期的主要特点是: - -# 1. **政治动荡**:汉朝末年,宦官专权,外戚干政,导致政治腐败和社会动荡。最终,汉朝灭亡,三国鼎立的局面形成。 - -# 2. **军事冲突**:三国时期,各政权之间频繁发生军事冲突,如赤壁之战、夷陵之战等,这些战役不仅影响了三国的政治格局,也深刻地影响了中国历史的发展进程。 - -# 3. 
**文化繁荣**:尽管三国时期政治动荡,但这一时期的文化发展却异常繁荣。文学、艺术、哲学等领域都出现了许多杰出的代表人物,他们的作品至今仍被广泛传颂和研究。 - -# 总之,三国时期是中国历史上一个充满变数和挑战的时期。它不仅见证了汉朝的衰落和三国鼎立局面的形成,而且在政治、军事、文化等多个领域都留下了深刻的印记。<|im_end|>""" - -similarity_test_cases = { - "qwen/Qwen2-7B-Instruct": - {"model_name": "qwen/Qwen2-7B-Instruct", "input": [input_text], - "reference": [output_text], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.1 - }, -} - - -class ModelSimilarityTest(unittest.TestCase): - def setUp(self): - self.similarity_test_cases = similarity_test_cases - self.engine = allspark.Engine() - # 创建模型实例 - - def tearDown(self): - self.engine = None - gc.collect() - - def test_inference_qwen2_models_fp16(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen2-7B-Instruct"], init_quant=False, test=self, set_engine_max_length=16000, device_list=[0, 1]) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/python/gpu/test_09_qwen2_repeat.py b/tests/python/gpu/test_09_qwen2_repeat.py deleted file mode 100644 index b823a1f6..00000000 --- a/tests/python/gpu/test_09_qwen2_repeat.py +++ /dev/null @@ -1,87 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. 
- @file test_09_qwen2_repeat.py -''' -# -*- coding: utf-8 -*- - -import os -import gc -import time -import unittest - -import modelscope - -from dashinfer import allspark -from dashinfer.allspark.engine import TargetDevice, RequestStatInfo -from dashinfer.allspark.engine import RoPEScaleMethod -from dashinfer.allspark.prompt_utils import PromptTemplate -from test_utils import LevenshteinCompare, JaccardCompare, GenerateResultCompare -from dashinfer.allspark._allspark import AsStatus, GenerateRequestStatus -from test_util_infer import func_test_model_with_reference - - -long_input_10k="""您现在是金融领域经验丰富的风险审核人员。以下内容是贷款人的一些信息,列出具体的风险点:\n{'是否命中融360违法前科': '0', '是否命中融360吸毒': '0', '天机自然人是否有形式案件记录': '0', '是否命中融360涉黄吸毒': '0', '是否命中融360公安一�﨧䥈駺 '0', '是否命中融360逃犯及历史逃犯': '0', '按身份证号查询,近3个月在银行机构有申请记录月份数': '2.0', '按身份证号查询,近3个月在银行机构最大月申请次数': '1.0', '按身份证号查询,近3个月申请信用卡(类信用卡)的机构数': '2.0', '按身份证号查询,近3个月申请信用卡(类信用卡)的次数': '2.0', '按身份证号查询,近3个月在银行机构-传统银行申请次数': '2.0', '按身份证号查询,近3个月在银行机构-传统银行申请机构数': '2.0', '按身份证号查询,近3个月在银行机构平均每月申请次数(有申请月份平均)': '1.0', '按身份证号查询,近3个月在银行机构申请机构数': '2.0', '按身份证号查询,近3个月在银行机构申请次数': '2.0', '按身份证号查询,近3个月在本机构(本机构为银行)的申请次数': '0.0', '按身份证号查询,近3个月在银行机构申请最小间隔天数': '22.0', '按身份诀᥏禟娯⯼쨿᳤誦숥쨩㶨ጦ캦䳨﷦쀥䧩紩ꔥ䩦尧: '22.0', '按身份证号查询,近3个月在银行机构周末申请机构数': '1.0', '按身份证号查询,近3个月在银行机构周末申请次数': '1.0', '按身份证号查询,近3个月在银行机构夜间申请次数': '0.0', '按身份证号查询,近3个月在银行机构最小月申请次数': '0.0', '按身份证号查询,近3个月在银行机构夜间申请机构数': '0.0', '活跃贷记卡有近6个月的平均使用额度超过总授信额度的百分之30账户数': '5.0', '信用卡审批近6个月查询,月查询次数大于等于二的次数': '0', '贷款审批近6个月查询,月查询次数大于等于二的次数': '0', '近24个月信用卡获贷比': '100', '活跃贷记卡有近6个月的平均使用额度超过总授信额度的百分之100账户数': '0.0', '近24个月发放的人民币信用卡最高授信额度': '76000.0', '贷记卡近60个月出现逾期1期或以上的逾期次数': '2.0', '近60个月发放的活跃贷记卡发卡机构数': '3.0', '未销户信用卡有近6个月的平均使用额度超过总授信额度的百分之80账户数': '1.0', '近6个月发放的贷款逾期三期及以上的最大次数': '0.0', '近24个月最大月查询机构数': '3', '担保资格审查近6个月月均查询机构数_V2': '0.0', '融资租赁近6个月最小月查询次数': '0', '近24个月查询总次数': '13', '近6个月发放的信用卡出现逾期1期或以上的逾期次数': '0.0', '近6个月查询机构数': '2', '贷记卡近24个月出现逾期1期或以上的逾期次数': '1.0', '信用卡有近6个月的平均使用额度超过总授信额度的百分之90账户数': '0.0', '近24个月发放的信用卡出现逾期2期或以上的最大逾期次数': 
'0.0', '近60个月发放的信用卡出现逾期1期或以上的最大逾期次数': '0.0', '近24个月发放的人民币信用卡汇总授信额度': '149000.0', '信用卡审批近6个月最小月查询机构数': '1', '贷怀쾥鹨个月查询机构数': '1', '活跃信用卡近6个月出现逾期1期或以上的逾期次数': '0.0', '信用卡近60个月单笔最小逾期率': '0.03', '近6个月获贷比': '50', '融资租赁近6个月月均查询次数': '0.0', '近6个月发放的人民币信用卡总额度使用比率': '0.01', '近24个月发放的人民币活跃贷记卡最高已使用额度': '19236.0', '活跃贷记卡有近6个月的平均使用额度超过总授信额度的百分之50账户数': '5.0', '融资租赁近24个月查询,月查询次数大于等于三的次数': '0', '近6个月发放的人民币活跃贷记卡笔数': '1.0', '近60个月发放的贷款逾期两期及以上的次数': '0.0', '近24个月发放的贷款逾期两期及以上的最大次数': '0.0', '近6个月发放的人民币活跃贷记卡总额度使用比率': '0.01', '近60个月发放的贷款最大逾期次数': '1.0', '贷款近6个月逾期两期及以上的次数': '0.0', '近60个月发放的信用卡的逾期期数最大倀༧: '0.0', '贷款审批近24个月月均查询机构数_V2': '1.333', '融资租赁近24个月查询,月查询次数大于等于一的次数': '0', '近24个月最小月查询机构数': '1', '近6个月最小月查询机构数': '1', '活跃贷记卡有近6个月的平均使用额度超过总授信额度的百分之70账户数': '1.0', '融资租赁近6个月查询,月查询次数大于等于一的次数': '0', '信用卡有近6个月的平均使用额度超过总授信额度的百分之50账户数': '5.0', '近6个月发放的信用卡出现逾期1期或以上的最大逾期次数': '0.0', '近60个月发放的人民币信用卡汇总已使用额度': '42122.0', '近24个月查询,月查询次数大于等于一的次数': '7', '近6个月发放的人民币信用卡汇总授信额度': '76000.0', '近6个月查询,月查询次数大于等于二的次数': '0', '近24个月发放的人民币活跃贷记卡最高授信额度': '76000.0', '近6个月最小月查询次数': '1', '贷款近24个月逾最怜ᦕ৺ '0.0', '未结清非银行贷款近24个月逾期次数': '0.0', '信用卡有近6个月的平均使用额度超过总授信额度的百分之30账户数': '5.0', '信用卡近24个月逾期,后一个月比前一个月增加的次数': '1.0', '信用卡近24个月出现逾期1期或以上的最大逾期次数': '1.0', '近60个月发放的人民币活跃贷记卡最高已使用额度': '19236.0', '融资租赁近24个月月均查询机构数': '0.0', '活跃贷记卡有近6个月的平均使用额度超过总授信额度的百分之90账户数': '0.0', '信用卡近24个月逾期金额最小值': '1936.0', '汇总信息贷款最近6个月平均应还款': '0.0', '近60个月发放的信用卡发卡机构数': '3.0', '近24个月发放的人民币活跃贷记卡汇总已使用额度': '36064.0', '近24个月查询,月查询次数大于等于二的次数': '4', '信用卡近60个月逾期7期的次数': '0.0', '近60个月发放的信用卡出现逾期3期或以上的逾期次数': '0.0', '近60一ꦜ襏ᦔ䨴禬妜ਿᤸবᩀ嘆�꧚䦜褻�৺ '38.0', '信用卡审批近6个月查询机构数': '1', '近60月车贷笔数': '0.0', '信用卡近6个月出现逾期3期或以上的最大逾期次数': '0.0', '汇总信息未销户贷记卡最近6个月平均使用额': '87198.0', '近6个月发放的信用卡出现逾期3期或以上的逾期次数': '0.0', '贷款审批近24个月最大月查询机构数': '2', '近6个月信用卡获贷比': '100', '近24个月月均查询机构数': '1.571', '融资租赁近6个月月均查询机构数': '0.0', '近60个月发放的贷款逾期三期及以上的最大次数': '0.0', '融资租赁近24个月月均查询次数': '0.0', '近6个月发放的活跃贷款从未逾期的笔数': '0.0', '融资租赁近6个月最大月查询次数': '0', '信用卡近6个月出现逾期3期或以上的逾期次数': '0.0', '贷款近60个月逾期,后一个月比前一个月减少的次数': '2.0', 
'融资租赁近24个月最大月查询次数': '0', '近6个月发放的活跃贷记卡发卡机构数': '1.0', '近60个月发放的人民币活跃贷记卡笔数': '5.0', '近24月融资租赁贷款笔数': '0.0', '近60个月发放的人民币活跃1信用卡最大已使用额度比例': '0.8', '近60个月发放的贷款逾期月份数': '1.0', '信用卡近60个月单笔最大逾期率': '0.03', '近24个月发放的人民币活跃贷记卡最大已使用额度比例': '0.8', '近60个月发放的信用卡出现逾期3期或以上的最大逾期次数': '0.0', '近24个月发放的信用卡从未逾期的账户数': '3.0', '贷款审批近6个月最小月查询次数': '1', '近6个月发放的信用卡逾期月份数': '0.0', '近60个月发放的人民币信用卡最高授信额度': '76000.0', '额度审批近6个月查询机构数': '0', '近6个月查询总次数': '2', '近6个月发放的人民币活跃贷记卡汇总已使用额度': '900.0', '近24个月发放的人民币信用卡汇总已使用额度': '36064.0', '信用卡近24个月单笔最大逾期率': '0.04', '近6个月发放的人民币信用卡最大已使用额度比例': '0.01', '信用卡近24个月逾期7期的次数': '0.0', '贷款近60个月逾期金额最小值': '1189.0', '贷记卡有近6个月的平均使用额度超过总授信额度的百分之40账户数': '5.0', '贷款审批近24个月最小月查询次数': '1', '融资租赁近6个月查询,月查询次数大于等于三的次数': '0', '近60个月发放的人民币活跃贷记卡最小已使用额度比例': '0.01', '近60个月发放的信用卡的逾期账户数': '0.0', '信用卡审批近24个月月均查询机构数_V2': '1.5', '信用卡近24个月出现逾期2期或以上的逾期次数': '0.0', '贷款近6个月逾期次数': '0.0', '近60月融资租赁贷款笔数': '0.0', '近24个月发放的人民币信用卡最高已使用额度': '19236.0', '信用卡近24个月逾期金额最大值': '1936.0', '贷款审批近6个月最小月查询机构数': '1', '贷款近24个月最大逾期期数': '0.0', '活跃信用卡有迀ᶤ誦숧ꄥ鳥퇤�䨩❥ꦨ慨໦❥ꦧꄧ龥膤鋸0账户数': '1.0', '近6个月发放的人民币活跃贷记卡最高已使用额度': '900.0', '贷款近24个月最大逾期次数': '0.0', '贷记卡近24个月出现逾期1期或以上的最大逾期次数': '1.0', '活跃1信用卡近60个月单笔最小逾期率': '0.03', '信用卡审批近24个月最小月查询次数': '1', '近24月车贷笔数': '0.0', '近60个月发放的贷款逾期,后一个月比前一个月增加的次数': '1.0', '贷款近6个月从未逾期的笔数': '11.0', '近6个月最大月查询机构数': '1', '信用卡近24个月出现逾期的账户数': '1.0', '信用卡审批近24个月最小月查询机构数': '1', '信用卡近6个月从未逾期的账户数': '12.0', '贷款审批近24个月查询机构数': '3', '近6个月发放的贷款逾期月份数': '0.0', '贷款近24个月逾期,后一个月比前一个月减少的次数': '0.0', '近60个月发放的信用卡出现逾期1期或以上的逾期次怀尧: '0.0', '贷款审批近24个月查询总次数': '10', '近24个月最大月查询次数': '3', '近6个月发放的人民币活跃贷记卡最高授信额度': '76000.0', '近6个月查询,月查询次数大于等于一的次数': '2', '近6个月月均查询机构数': '1.0', '近6个月发放的信用卡从未逾期的账户数': '1.0', '担保资格审查近24个月月均查询机构数_V2': '0.0', '信用卡审批近6个月最小月查询次数': '1', '信用卡审批近24个月查询总次数': '3', '贷款近6个月逾期三期及以上的最大次数': '0.0', '额度审批近6个月月均查询机构数_V2': '0.0', '近60个月发放的贷款逾期次数': '1.0', '近24个月发放的人民币活跃贷记卡笔数': '3.0', '信用卡近6个月逾期7期的次数': '0.0', '近60个月发放的贷款逾期三期及以上的次数': '0.0', '贷款近24个月逾期月份数': '0.0', '信用卡有近6个月的平均使用额度超过总授信额度的百分之80账户数': '1.0', '贷记卡近6个最襇꧎ੀ栗䨴榈禕৺ '0.0', 
'信用卡近6个月逾期,后一个月比前一个月减少的次数': '0.0', '近24个月发放的人民币信用卡最大已使用额度比例': '0.8', '信用卡近24个月出现逾期1期或以上的逾期次数': '1.0', '近60个月发放的人民币活跃贷记卡汇总授信额度': '219000.0', '近24个月发放的贷款逾期三期及以上的次数': '0.0', '近24个月月均查询次数': '1.857', '贷款审批近24个月月均查询次数': '1.667', '信用卡近24个月逾期金额汇总值': '1936.0', '融资租赁近24个月月均查询机构数_V2': '0.0', '融资租赁近24个月查询,月查询次数大于等于二的次数': '0', '近6个月发放的人民币信用卡笔数': '1.0', '近60个月发放的贷款最大逾期率': '0.03', '近24个月发放的人民币信用卡总额度使用比率': '0.24', '活跃贷记卡有近6个月的平均使用额度超过总授信额度的百分之20账户数': '5.0', '信用卡近24个月出现逾期3期或以一꧚䩀怜ᦕ৺ '0.0', '融资租赁近6个月月均查询机构数_V2': '0.0', '贷款近60个月逾期总金额': '60976.0', '近24个月发放的人民币信用卡笔数': '3.0', '贷款近24个月有逾期的笔数': '0.0', '担保资格审查近6个月月均查询机构数': '0.0', '贷款近6个月最大逾期次数': '0.0', '近60月个人经营性贷款笔数': '0.0', '近6个月发放的贷款最大逾期期数': '0.0', '近6个月最大月查询次数': '1', '信用卡近24个月逾期月份数': '1.0', '信用卡审批近6个月查询,月查询次数大于等于三的次数': '0', '近60个月发放的信用卡出现逾期2期或以上的最大逾期次数': '0.0', '信用卡审批近6个月最大月查询次数': '1', '近60个月发放的人民币信用卡最大已使用额度比例': '0.8', '近6个月查询,月查询次数大于等于三的次数': '0', '贷款近6个月逾期,后一个月比前一个月增加的次数': '0.0', '信用卡审批近24个月月均查询机构数': '5.5', '近60个月发放的贷款逾期两期及以上的最大次数': '0.0', '近60个月发放的人民币活跃信用卡总额度使用比率': '0.19', '近24个月发放的贷款逾期三期及以上的最大次数': '0.0', '近24个月发放的信用卡发卡机构数': '2.0', '近60个月发放的贷款月均逾期金额': '1189.0', '近6个月发放的贷款逾期,后一个月比前一个月增加的次数': '0.0', '近24月个人经营性贷款笔数': '0.0', '信用卡审批近6个月月均查询机构数': '2.0', '近24个月发放的贷款有逾期的笔数': '0.0', '融资租赁近6个月最小月查询机构数': '0', '活跃贷记卡有近6个月的平均使用额度超过总授信额度的百分之40账户数': '5.0', '近24个月发放的信用卡出现逾期3期或以上的最大逾期次数': '0.0', '信用卡审批近24个月查询,月查询次数大于等于三的次数': '0', '贷款近6个月有逾期的笔数': '0.0', '近6个月发放的信用卡的逾期账户数': '0.0', '信用卡审批近6个月查询总次数': '1', '贷款近6个月逾期七期的次数': '0.0', '信用卡近24个月出现逾期3期或以上的最大逾期次数': '0.0', '近6个月发放的贷款逾期三期及以上的次数': '0.0', '近60个月发放的贷款逾期总金额': '1189.0', '信用卡审批近24个月最大月查询次数': '2', '信用卡近24个月出现逾期2期或以上的最大逾期次数': '0.0', '近60个月发放的贷款逾期金额最大值': '1189.0', '近6个月发放的信用卡发卡机构数': '1.0', '近60个月发放的贷款逾期金额最小值': '1189.0', '贷款近6个月逾期,后一个月比前一个月减少的次数': '0.0', '贷款近60个月逾期金额最大值': '59787.0', '融资租赁近6个月查询总次数': '0', '近24个月发放的人民币活跃贷记卡总额度使用比率': '0.24', '近24个月查询,月查询次数大于等于三的次数': '2', '近24个月发放的贷款最大逾期期数': '0.0', '近6个月发放的贷款逾期次数': '0.0', '近60一ꦜ襏ᦔ䤺ꦰ᥸ᦴ먷㨴種्ᦀ멢�椽倫覯䧎秺 '0.19', '信用卡近24个月从未逾期的账户数': '11.0', '活跃贷记卡近6个月出现逾期1期或以上的逾期次数': 
'0.0', '近24个月发放的信用卡出现逾期2期或以上的逾期次数': '0.0', '近6个月发放的信用卡出现逾期2期或以上的最大逾期次数': '0.0', '近24个月发放的人民币信用卡最小已使用额度比例': '0.01', '近60个月发放的人民币信用卡最高已使用额度': '19236.0', '贷款审批近6个月最大月查询机构数': '1', '近60个月发放的人民币信用卡最小已使用额度比例': '0.01', '贷款近60个月最大逾期率': '0.33', '贷款近24个月逾期三期及以上的次数': '0.0', '担保资格审查近6个月查询机构数': '0', '贷款审批近6个月查询,月查询次数大于等于一的次数': '1', '近60个月发放的贷款最大逾期期数': '1.0', '额度审批近24个月月均查询机构数_V2': '0.0', '活跃贷记卡有近6个月的平均使用额度超过总授信额度的百分之60账户数': '3.0', '近6个月发放的人民币活跃贷记卡最大已使用额度比例': '0.01', '近60个月发放的信用卡从未逾期的账户数': '5.0', '融资租赁近24个月最小月查询次数': '0', '近24个月发放的信用卡逾期月份数': '0.0', '融资租赁近24个月最大月查询机构数': '0', '信用卡审批近24个月月均查询次数': '1.5', '贷款审批近24个月最大月查询次数': '3', '融资租赁近6个月最大月查询机构数': '0', '贷款审批近6个月最大月查询次数': '1', '近60个月发放的贷款有逾期的笔数': '1.0', '近6个月发放的贷款逾期,后一个月比前一个月减少的次数': '0.0', '贷款审批近6个月月均查询机构数': '2.0', '信用卡审批近6个月月均查询机构数_V2': '1.0', '融资租赁近24个月最小月查询机构数': '0', '信用卡审批近6个月查询,月查询次数大于等于一的次数': '1', '信用卡近6个月出现逾怀쟱期或以上的最大逾期次数': '0.0', '近6个月发放的人民币活跃贷记卡最小已使用额度比例': '0.01', '近6个月发放的贷款有逾期的笔数': '0.0', '近24个月发放的贷款逾期,后一个月比前一个月增加的次数': '0.0', '近24个月查询机构数': '5', '活跃贷记卡有近6个月的平均使用额度超过总授信额度的百分之80账户数': '1.0', '未结清贷款近6个月逾期月份数': '0.0', '近6个月发放的人民币信用卡最小已使用额度比例': '0.01', '贷款近24个月逾期两期及以上的次数': '0.0', '信用卡近24个月最大逾期期数': '1.0', '贷记卡近24个月最大逾期期数': '1.0', '近6个月发放的信用卡出现逾期3期或以上的最大逾期次数': '0.0', '贷款审批近24个月查询,月查询次数大于等于二的次数': '3', '信用卡近6个月出现逾期的账户数': '0.0', '贷款近6个月最大逾期期数': '0.0', '贷款近60个月逾期,后一个月比前一个月增加的次敀৺ '2.0', '贷记卡有近6个月的平均使用额度超过总授信额度的百分之70账户数': '1.0', '信用卡近24个月逾期,后一个月比前一个月减少的次数': '1.0', '信用卡近60个月出现逾期1期或以上的最大逾期次数': '2.0', '近6个月发放的人民币活跃贷记卡汇总授信额度': '76000.0', '近60个月发放的人民币信用卡笔数': '5.0', '贷款近6个月逾期两期及以上的最大次数': '0.0', '近24个月发放的信用卡出现逾期3期或以上的逾期次数': '0.0', '近24个月发放的信用卡出现逾期1期或以上的最大逾期次数': '0.0', '贷款近6个月逾期三期及以上的次数': '0.0', '近24个月发放的贷款逾期,后一个月比前一个月减少的次数': '0.0', '近60个月发放的贷款从未逾期的笔数': '8.0', '贷款近24个月逾期两期及以上的最大次数': '0.0', '信用卡有近6个月的平均使用额度超过总授信额度的百分之100账户数': '0.0', '信用卡近24个月内最近一次逾期距今月份数': '7.0', '近6个月月均查询次数': '1.0', '近60个月发放的人民币活跃贷记卡最大已使用额度比例': '0.8', '近6个月发放的人民币信用卡最高授信额度': '76000.0', '近24个月发放的贷款逾期月份数': '0.0', '近24个月发放的活跃贷记卡发卡机构数': '2.0', '信用卡审批近24个月查询,月查询次数大于等于二的次数': '1', 
'近6个月发放的人民币信用卡汇总已使用额度': '900.0', '信用卡审批近24个月查询机构数': '2', '近24个月贷款获贷比': '20', '贷款审批近6个月月均查询机构数_V2': '1.0', '近6个月发放的信用卡出现逾期2期或以上的逾期次数': '0.0', '近60个月发放的活跃贷款逾期月份数': '0.0', '近60个月发放的人民币信用卡总额度使用比率': '0.19', '近60个月发放的人民币活跃贷记卡汇总已使用额度': '42122.0', '近24个月最小月查询次数': '1', '近6个月发放的贷款最大逾期次数': '0.0', '贀禬ᶰ个月最小逾期率': '0.03', '近24个月发放的贷款逾期次数': '0.0', '信用卡近6个月最大逾期期数': '0.0', '融资租赁近24个月查询总次数': '0', '信用卡审批近24个月最大月查询机构数': '2', '近24个月发放的人民币活跃贷记卡汇总授信额度': '149000.0', '信用卡审批近24个月查询,月查询次数大于等于一的次数': '2', '近6个月发放的贷款从未逾期的笔数': '0.0', '信用卡近6个月出现逾期1期或以上的逾期次数': '0.0', '近6个月发放的信用卡的逾期期数最大值': '0.0', '贷款近24个月逾期三期及以上的最大次数': '0.0', '信用卡近6个月逾期月份数': '0.0', '近24个月发放的信用卡出现逾期1期或以上的逾期次数': '0.0', '贷款审批近6个月查询总次数': '1', '贷款审批近24个月查询,月查询次数大于等于三的次数': '1', '近6个月发放的贷款逾期两期及以上的最大次数': '0.0', '信用卡近6个月出现逾期2期或以上的最大逾期次数': '0.0', '贷款近60个月月均逾期金额': '30488.0', '信用卡近24个月月均逾期金额': '1936.0', '贷款审批近24个月最小月查询机构数': '1', '贷款近60个月逾期七期的次数': '0.0', '近60个月发放的信用卡逾期月份数': '0.0', '信用卡审批近6个月月均查询次数': '1.0', '近24个月发放的贷款从未逾期的笔数': '2.0', '融资租赁近6个月查询,月查询次数大于等于二的次数': '0', '贷款审批近24个月查询,月查询次数大于等于一的次数': '6', '贷款审批近6个月月均查询次数': '1.0', '贷记卡近6个月出现逾期1期或以上的最大逾期次数': '0.0', '信用卡近24个月单笔最小逾期率': '0.04', '近24个月发放的贷款最大逾期次数': '0.0', '近60个月发放的信用卡出现逾期2期或以上的逾期次数': '0.0', '活跃贷记卡有近6个月的平均使用额度超过总授信额度的百分之10账户数': '6.0', '贷款近24个月从未逾期的笔敀৺ '11.0', '近24个月发放的贷款逾期两期及以上的次数': '0.0', '信用卡近6个月逾期,后一个月比前一个月增加的次数': '0.0', '近60个月发放的人民币活跃贷记卡最高授信额度': '76000.0', '近60个月发放的贷款最小逾期率': '0.03', '近60个月发放的非银行贷款逾期总金额': '1189.0', '贷款审批近6个月查询,月查询次数大于等于三的次数': '0', '近60个月发放的贷款逾期,后一个月比前一个月减少的次数': '1.0', '信用卡近6个月出现逾期2期或以上的逾期次数': '0.0', '贷款近6个月逾期月份数': '0.0', '近6个月发放的人民币信用卡最高已使用额度': '900.0', '近24个月发放的信用卡的逾期账户数': '0.0', '贷款审批近24个月月均查询机构数': '1.833', '近24个月获贷比': '38', '近24个月发放的人民币活跃贷记卡最小已使用额度比例': '0.01', '近60个月发放的人民币信用卡汇总授信额度': '219000.0', '近6个月发放的贷款逾期两期及以上的次数': '0.0', '贷款近24个月逾期,后一个月比前一个月增加的次数': '0.0', '近24个月发放的信用卡的逾期期数最大值': '0.0', '信用卡审批近6个月最大月查询机构数': '1', '融资租赁近24个月查询机构数': '0', '融资租赁近6个月查询机构数': '0', '贷款近24个月逾期七期的次数': '0.0', '近1月_客户经理_去重数与近1年_客户经理_去重数_比': '0.3333333333333333', '近3月_通过_身份证_去重数': '5.0', 
'近1月_名字_去重数': '3.0', '近1月_订单_去重数': '3.0', '近3月_拒绝_客户经理_去重数': '0.0', '近3天_客户经理_去重数与近1月_客户经理_去重数_比': '1.0', '近1月_名字_去重数与近1年_名字_去重数_比': '0.6', '近1月_订单_去重数与近1年_订单_去重数_比': '0.16666666666666666', '近7天_订单_去重数': '3.0', '近3月_通过_客户经理_去重数': '2.0', '近1月_手机号_去重数与近1年_手机号_去重数_比': '0.5', '年龄(根据身份证号得到)': '40.0', '近3月_拒绝_名字_去重数': '0.0', '近1月_客户经理_去重数': '1.0', '近3月_通过_订单_去重数': '7.0', '近3月_订单_去重数': '7.0', '近7天_手机号_去重数': '3.0', '近1年_拒绝_客户经理_去重数': '0.0', '近1年_经销商_去重数': '1.0', '近1年_客户经理_去重数': '3.0', '近3月_名字_去重数': '5.0', '近3月_拒绝_订单_去重数': '0.0', '近1月_拒绝_订单_去重数': '0.0', '度小满银行分v11': '605', '个人近3年发生申请破产清算笔数': '0.0', '近5年诉讼地位为被告的委托合同纠纷个数': '0.0', '个人近3年发生清算责任纠纷笔数': '0.0', '个人近5年发生债权转让合同纠纷笔数': '0.0', '个人近3年发生追收非正常收入纠纷笔数': '0.0', '个人近所有发生机动车交通事故责任纠纷诉讼金额': '0.0', '个人近3年发生失踪人债务支付纠纷笔数': '0.0', '个人近3年发生管理人责任纠纷笔数': '0.0', '个人耀年发生债权人代位权纠纷笔数': '0.0', '个人近3年发生请求撤销个别清偿行为纠纷笔数': '0.0', '个人近3年发生质押合同纠纷笔数': '0.0', '个人近3年发生转质权纠纷笔数': '0.0', '个人近5年发生小额借款合同纠纷笔数': '0.0', '个人近3年发生证券虚假陈述责任纠纷笔数': '0.0', '个人近5年发生借款合同纠纷笔数': '0.0', '个人近3年发生期货保证合约纠纷笔数': '0.0', '个人近5年发生信用卡纠纷笔数': '0.0', '个人近3年发生仓单质权纠纷笔数': '0.0', '个人近3年发生债权债务概括转移合同纠纷笔数': '0.0', '近5年诉讼地位为被告的居间合同纠纷个数': '0.0', '个人近3年发生追收抽逃出资纠纷笔数': '0.0', '个人近5年发生金融借款合同纠纷笔数': '0.0', '个人近3年发生公益信托纠纷笔数': '0.0', '近5年诉讼地位为被告的运输合同纠纷个数': '0.0', '个人近3年发生同业拆借纠纷笔数': '0.0', '个人近3年发瀀䟦쀩똩❨䨦탧ꠧ귧씦尧: '0.0', '个人近3年发生期货透支交易纠纷笔数': '0.0', '个人近3年发生期货强行平仓纠纷笔数': '0.0', '个人近3年发生公司解散纠纷笔数': '0.0', '个人近3年发生建设工程价款优先受偿权纠纷笔数': '0.0', '个人近3年发生破产撤销权纠纷笔数': '0.0', '个人近3年发生操纵期货交易市场责任纠纷笔数': '0.0', '个人近3年发生别除权纠纷笔数': '0.0', '个人近3年发生追偿权纠纷笔数': '0.0'} -""" - -long_output_10k = """ -根据提供的贷款人信息,我们可以识别出以下几个主要风险点: - -1. **频繁申请记录**:近3个月在银行机构有申请记录月份数为2.0,表明贷款人在短时间内有多次申请记录,这可能表示其资金需求紧迫或者信用状况不稳定。 - -2. **高申请频率**:近3个月在银行机构最大月申请次数为1.0,虽然不是非常高,但频繁的申请可能反映贷款人的信用行为不规律。 - -3. **多机构申请**:近3个月在不同银行机构和传统金融机构的申请次数和机构数较多,这可能意味着贷款人正在尝试从多个渠道获得资金,增加了违约风险。 - -4. **高使用额度**:活跃贷记卡有近6个月的平均使用额度超过总授信额度的百分比较高(如超过30%、50%、70%、90%),这可能表明贷款人的财务压力较大,还款能力存在不确定性。 - -5. **逾期记录**:近60个月发放的贷款逾期次数为1.0,信用卡逾期记录也存在,这直接反映了贷款人的还款能力存在问题。 - -6. 
**高逾期金额**:近60个月发放的贷款逾期金额最大值为1189.0,信用卡逾期金额最大值为1936.0,高额的逾期金额显示贷款人过去存在严重的还款问题。 - -7. **高查询次数**:近6个月查询总次数为2,近24个月查询总次数为13,频繁的信用查询可能表明贷款人正在积极寻求资金,但也可能反映出其信用状况不佳,被多家机构视为高风险。 - -8. **逾期趋势**:近6个月发放的贷款逾期次数、逾期月份数、逾期金额等均为0.0,这可能表明贷款人近期的还款情况有所改善,但仍需关注长期趋势。 - -9. **信用历史**:个人信用历史中包括各种法律纠纷、破产清算、诉讼等记录,这些都可能影响贷款人的信用评级和还款意愿。 - -综合以上风险点,贷款人可能存在较高的信用风险,需要进一步评估其还款能力和意愿,以及采取适当的风控措施,如提高贷款利率、要求额外的担保或抵押物等。<|im_end|> -""" - -similarity_test_cases = { - "qwen/Qwen2-7B-Instruct-GPTQ-Int8": - {"model_name": "qwen/Qwen2-7B-Instruct-GPTQ-Int8", "input": [long_input_10k], - "reference": ["你好!我是阿里云开发的一款大模型,我叫通义千问。<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.1 - }, - "qwen/Qwen2-7B-Instruct": - {"model_name": "qwen/Qwen2-7B-Instruct", "input": [long_input_10k], - "reference": [long_output_10k], - "generation_params": {"top_k": 1, "top_p": 0.8, "repetition_penalty": 1.05, "temperature": 0.7, "seed": 1234}, - "lang": "zh", - "compare": JaccardCompare(), "threshold": 0.5 - } -} - - -class ModelSimilarityTest(unittest.TestCase): - def setUp(self): - self.similarity_test_cases = similarity_test_cases - self.engine = allspark.Engine() - # 创建模型实例 - - def tearDown(self): - self.engine = None - gc.collect() - - def test_inference_qwen2_models_fp16(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen2-7B-Instruct"], init_quant=False, test=self, set_engine_max_length=16000) - - def dis_test_inference_qwen2_models_int8_weight_only(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen2-7B-Instruct-GPTQ-Int8"], init_quant=True, set_engine_max_length=16000, - weight_only_quant=True, test=self) - - -if __name__ == '__main__': - unittest.main() - diff --git a/tests/python/gpu/test_10_qwen1_5_moe_a2_7b.py b/tests/python/gpu/test_10_qwen1_5_moe_a2_7b.py deleted file mode 100644 index 0e331912..00000000 --- a/tests/python/gpu/test_10_qwen1_5_moe_a2_7b.py +++ /dev/null @@ -1,240 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. 
- @file test_10_qwen1_5_moe_a2_7b.py -''' -import os -import gc -import torch -import unittest - -import modelscope - -from dashinfer import allspark -from dashinfer.allspark.engine import TargetDevice -from dashinfer.allspark import AsStatus -from transformers import AutoTokenizer - - -class Qwen1_5_MoETestCase(unittest.TestCase): - def setUp(self): - self.batch_size = 8 - self.model_name = "qwen/Qwen1.5-MoE-A2.7B-Chat" - print("start test for ", self.model_name) - # download from modelscope - print("Downloading model from modelscope...") - self.model_path = modelscope.snapshot_download(self.model_name) - # whether to manully input prompts or not - self.manual_input_prompt = False - - def tearDown(self): - gc.collect() - - def disabled_test_moe_model_serialize(self): - # prepare model loader - safe_model_name = str(self.model_name).replace("/", "_") - model_loader = allspark.HuggingFaceModel( - self.model_path, - safe_model_name, - in_memory_serialize=False, - user_set_data_type="float16", - trust_remote_code=True, - ) - - engine = allspark.Engine() - - # load and serialize model - model_loader.load_model(direct_load=False, load_format="auto") - model_loader.serialize_to_path( - engine, - model_output_dir=self.model_path, - enable_quant=False, - weight_only_quant=False, - skip_if_exists=True, - ) - - def test_moe_model_inference(self): - data_type = "float16" - tokenizer = AutoTokenizer.from_pretrained( - self.model_path, trust_remote_code=True, padding_side="left" - ) - - sample_prompts = [ - "老鼠生病了可以吃老鼠药吗?", - "我的蓝牙耳机坏了,如何预约牙医?", - "安徽的省会是南京还是蚌埠?", - ] - - # prepare requests - requests = [] - for prompt in sample_prompts: - messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": prompt}, - ] - text = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - input_ids = tokenizer([text], return_tensors="pt") - request = {"input_text": prompt, "torch_input": input_ids} - 
requests.append(request) - - # prepare model loader - safe_model_name = str(self.model_name).replace("/", "_") - model_loader = allspark.HuggingFaceModel( - self.model_path, - safe_model_name, - in_memory_serialize=False, - user_set_data_type=data_type, - trust_remote_code=True, - ) - - engine = allspark.Engine() - - # load and serialize model - model_loader.load_model(direct_load=False, load_format="auto") - model_loader.serialize_to_path( - engine, - model_output_dir=self.model_path, - enable_quant=False, - weight_only_quant=False, - skip_if_exists=True, - ) - - # prepare config - runtime_cfg_builder = model_loader.create_reference_runtime_config_builder( - safe_model_name, TargetDevice.CUDA, device_ids=[0, 1], max_batch=self.batch_size - ) - runtime_cfg_builder.max_length(2048) - runtime_cfg_builder.prefill_cache(False) - runtime_cfg = runtime_cfg_builder.build() - - gen_cfg_updates = { - "temperature": 0.7, - "top_k": 20, - "top_p": 0.9, - "seed": 1234, - "max_length": 1024, - "repetition_penalty": 1.05, - "length_penalty": 1.0, - } - gen_cfg_builder = model_loader.create_reference_generation_config_builder( - runtime_cfg - ) - gen_cfg_builder.update(gen_cfg_updates) - gen_cfg = gen_cfg_builder.build() - - # build model - engine.build_model_from_config_struct(runtime_cfg) - self.assertEqual( - engine.start_model(runtime_cfg.model_name), AsStatus.ALLSPARK_SUCCESS - ) - - # start requests - if self.manual_input_prompt == True: - while True: - print("***" * 20) - prompt = input("请输入一句prompt(输入end结束):") - if prompt == "end": - break - else: - messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": prompt}, - ] - text = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - input_ids = tokenizer([text], return_tensors="pt") - request = {"input_text": prompt, "torch_input": input_ids} - status, request_handle, result_queue = engine.start_request( - runtime_cfg.model_name, - { - 
"input_ids": torch.utils.dlpack.to_dlpack( - request["torch_input"]["input_ids"] - ), - "attention_mask": torch.utils.dlpack.to_dlpack( - request["torch_input"]["attention_mask"] - ), - }, - generate_config=gen_cfg, - ) - print(status, request_handle, result_queue) - self.assertEqual( - engine.sync_request(runtime_cfg.model_name, request_handle), - AsStatus.ALLSPARK_SUCCESS, - ) - - while True: - status = result_queue.GenerateStatus() - if status == allspark.GenerateRequestStatus.GenerateFinished: - generated_elem = result_queue.Get() - output_ids = generated_elem.ids_from_generate - out_text = tokenizer.decode( - output_ids, skip_special_tokens=True - ) - print("in_text: " + request["input_text"]) - print("\n") - print("out_text: " + out_text) - print("***" * 20) - print("\n") - break - elif ( - status == allspark.GenerateRequestStatus.GenerateInterrupted - ): - break - - self.assertEqual( - engine.release_request(runtime_cfg.model_name, request_handle), - AsStatus.ALLSPARK_SUCCESS, - ) - - else: - for request in requests: - status, request_handle, result_queue = engine.start_request( - runtime_cfg.model_name, - { - "input_ids": torch.utils.dlpack.to_dlpack( - request["torch_input"]["input_ids"] - ), - "attention_mask": torch.utils.dlpack.to_dlpack( - request["torch_input"]["attention_mask"] - ), - }, - generate_config=gen_cfg, - ) - print(status, request_handle, result_queue) - self.assertEqual( - engine.sync_request(runtime_cfg.model_name, request_handle), - AsStatus.ALLSPARK_SUCCESS, - ) - - while True: - status = result_queue.GenerateStatus() - if status == allspark.GenerateRequestStatus.GenerateFinished: - generated_elem = result_queue.Get() - output_ids = generated_elem.ids_from_generate - out_text = tokenizer.decode( - output_ids, skip_special_tokens=True - ) - print("***" * 20) - print("in_text: " + request["input_text"]) - print("\n") - print("out_text: " + out_text) - print("***" * 20) - print("\n") - break - elif status == 
allspark.GenerateRequestStatus.GenerateInterrupted: - break - - self.assertEqual( - engine.release_request(runtime_cfg.model_name, request_handle), - AsStatus.ALLSPARK_SUCCESS, - ) - - self.assertEqual( - engine.stop_model(runtime_cfg.model_name), AsStatus.ALLSPARK_SUCCESS - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/python/gpu/test_11_qwen2_json.py b/tests/python/gpu/test_11_qwen2_json.py deleted file mode 100644 index 3c5768b6..00000000 --- a/tests/python/gpu/test_11_qwen2_json.py +++ /dev/null @@ -1,459 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. - @file test_11_qwen2_json.py -''' -import os -import gc -import torch -import unittest -from concurrent.futures import ThreadPoolExecutor -import modelscope -import time -import sys -import json -from typing import Any, Optional, List -from dataclasses import field - -from dashinfer import allspark -from dashinfer.allspark.engine import TargetDevice -from dashinfer.allspark import AsStatus -from transformers import AutoTokenizer - -schemas = [] -# schemas.append(r''' -# { -# "properties": { -# "公司名称": { -# "type": "string" -# }, -# "founding year": { -# "type": "integer" -# }, -# "founding person": { -# "type": "string" -# }, -# "founding city": { -# "type": "string" -# }, -# "employees": { -# "type": "integer" -# } -# }, -# "required": [ -# "公司名称", -# "founding year", -# "founding person", -# "founding city", -# "employees" -# ], -# "type": "object" -# } -# ''') -schemas.append(r''' - { - "title": "Complex JSON Schema", - "description": "A very complex JSON schema with nested structures and multiple constraints.", - "type": "object", - "properties": { - "user": { - "type": "object", - "required": ["name", "age"], - "properties": { - "name": { - "type": "string", - "minLength": 2, - "maxLength": 50 - }, - "age": { - "type": "integer", - "minimum": 0, - "maximum": 120 - }, - "email": { - "type": "string", - "format": "email" - }, - "address": { - "type": "object", - 
"properties": { - "street": { - "type": "string" - }, - "city": { - "type": "string" - }, - "state": { - "type": "string" - }, - "postalCode": { - "type": "string", - "pattern": "^\\d{5}(?:-\\d{4})?$" - } - }, - "required": ["street", "city", "state", "postalCode"] - } - } - }, - "orders": { - "type": "array", - "items": { - "type": "object", - "required": ["id", "items"], - "properties": { - "id": { - "type": "string", - "pattern": "^[A-Z]{3}\\d{4}$" - }, - "items": { - "type": "array", - "items": { - "type": "object", - "required": ["product", "quantity"], - "properties": { - "product": { - "type": "string" - }, - "quantity": { - "type": "integer", - "minimum": 1 - }, - "price": { - "type": "number", - "minimum": 0 - } - } - } - } - } - } - }, - "preferences": { - "type": "object", - "properties": { - "language": { - "type": "string", - "enum": ["en", "es", "fr", "de", "it"] - }, - "notifications": { - "type": "boolean" - }, - "marketing": { - "type": "boolean" - } - } - }, - "metadata": { - "type": "object", - "properties": { - "created": { - "type": "string", - "format": "date-time" - }, - "lastUpdated": { - "type": "string", - "format": "date-time" - } - } - } - }, - "required": ["user", "orders"], - "additionalProperties": false - } -''') -schemas.append(r''' - { - "type": "object", - "properties": { - "value": { - "anyOf": [ - {"type": "null"}, - {"type": "number"}, - {"type": "string"}, - {"type": "boolean"}, - {"type": "object"}, - {"type": "array"} - ] - } - }, - "required": ["value"] - } -''') -schemas.append(r''' -{ - "type": "object", - "properties": { - "productName": { - "type": "string" - }, - "productType": { - "type": "string", - "enum": ["Electronics", "Books", "Clothing", "Home & Kitchen"] - }, - "price": { - "type": "number", - "minimum": 0 - }, - "placeOfProduction": { - "type": "string", - "enum": ["Thailand", "Japan", "Vietnam", "China", "Indonesia"] - }, - "monthOfProduction": { - "type": "string", - "enum": ["05", "06", "07", "08", "09", 
"10"] - }, - "yearOfProduction": { - "type": "string", - "enum": ["2020", "2021", "2022", "2023", "2024"] - } - }, - "required": ["productName", "productType", "price", "placeOfProduction", "monthOfProduction", "yearOfProduction"] -} -''') - -class Request: - id: int = -1 - input_text: Optional[str] = None - - # torch tensors: - input_ids = None - in_tokens = None - output_ids = None - output_tokens = None - - json_schema: Optional[str] = None - output_text: Optional[str] = None - - status: Optional[int] = None - gen_cfg: Optional[dict] = None - start_time = None - end_time = None - handle: Any = field(default=None) - queue: Any = field(default=None) - -class Qwen2_TestCase(unittest.TestCase): - def setUp(self): - self.batch_size = 3 - self.model_name = "qwen/Qwen2-7B-Instruct" - print("start test for ", self.model_name) - # download from modelscope - print("Downloading model from modelscope...") - self.model_path = modelscope.snapshot_download(self.model_name) - - def tearDown(self): - gc.collect() - - def print_output(self, request): - msg = "***********************************\n" - msg += f"* Answer for Request {request.id}\n" - msg += "***********************************\n" - msg += f"** encoded input, len: {request.in_tokens} **\n{request.input_ids}\n\n" - msg += f"** encoded output, len: {request.output_tokens} **\n{request.output_ids}\n\n" - msg += f"** text input **\n{request.input_text}\n\n" - msg += f"** text output **\n{request.output_text}\n\n" - elapsed_time = request.end_time - request.start_time - msg += f"** elapsed time **\n{elapsed_time}\n\n" - print(msg) - - - def disabled_test_model_serialize(self): - # prepare model loader - safe_model_name = str(self.model_name).replace("/", "_") - model_loader = allspark.HuggingFaceModel( - self.model_path, - safe_model_name, - in_memory_serialize=False, - user_set_data_type="float16", - trust_remote_code=True, - ) - - engine = allspark.Engine() - - # load and serialize model - 
model_loader.load_model(direct_load=False, load_format="auto") - model_loader.serialize_to_path( - engine, - model_output_dir=self.model_path, - enable_quant=False, - weight_only_quant=False, - skip_if_exists=True, - ) - - def process_one_request(self, request): - output_ids = [] - gen_cfg = self.gen_cfg_builder.build() - gen_cfg["response_format"] = {"type": "json_object", "json_schema": request.json_schema} - #gen_cfg["response_format"] = {"type": "json_object"} - request.start_time = time.time() - status, request_handle, result_queue = self.engine.start_request( - self.runtime_cfg.model_name, - { - "input_ids": torch.utils.dlpack.to_dlpack(request.input_ids), - }, - generate_config=gen_cfg, - ) - - if status != allspark.AsStatus.ALLSPARK_SUCCESS: - request.valid = False - print("[Error] Request failed to start!") - sys.exit(1) - - request.valid = True - request.handle = request_handle - request.queue = result_queue - request.status = int(status) - - while True: - # status = self.engine.get_request_status(self.model_name, request.queue) - status = request.queue.GenerateStatus() - request.status = int(status) - - if status == allspark.GenerateRequestStatus.Init: - pass - elif status == allspark.GenerateRequestStatus.Generating or status == allspark.GenerateRequestStatus.GenerateFinished: - if status == allspark.GenerateRequestStatus.GenerateFinished: - request.end_time = time.time() - # new_ids = self.engine.get_wait(self.model_name, request.queue) - generated_elem = request.queue.Get() - if generated_elem is not None: - new_ids = generated_elem.ids_from_generate - if (len(new_ids) > 0): - output_ids.extend(new_ids) - - request.output_ids = output_ids - request.output_tokens = len(output_ids) - request.output_text = self.tokenizer.decode(request.output_ids, skip_special_tokens=True) - - if status == allspark.GenerateRequestStatus.GenerateFinished: - # jsos.loads will throw exception if request.output_text is not a valid JSON string - json_str = 
json.loads(request.output_text) - break - elif status == allspark.GenerateRequestStatus.GenerateInterrupted: - request.valid = False - print("[Error] Request interrupted!") - break - else: - request.valid = False - print(f"[Error] Unexpected status: {status}") - break - - self.engine.release_request(self.runtime_cfg.model_name, request_handle=request.handle) - - def run_allspark_continuous_batch(self, request_list): - def done_callback(future): - request = future.argument - future.result() - self.print_output(request) - - # create a threadpool - executor = ThreadPoolExecutor(max_workers=self.batch_size) - - try: - # submit all tasks to the threadpool - futures = [] - for request in request_list: - future = executor.submit(self.process_one_request, request) - future.argument = request - future.add_done_callback(done_callback) - futures.append(future) - finally: - executor.shutdown(wait=True) - - def run_allspark_no_batching(self, request_list): - for request in request_list: - self.process_one_request(request) - self.print_output(request) - - def test_model_generate(self): - data_type = "float16" - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_path, trust_remote_code=True, padding_side="left" - ) - - sample_prompts = [ - "小明在京东买了手机,他的相关信息", - "介绍一下杭州", - "使用JSON格式填写一部2024年7月在中国生产的价格为5999的iPhone 16的信息", - ] - - # prepare requests - requests = [] - req_id = 0 - for idx, prompt in enumerate(sample_prompts): - messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": prompt}, - ] - text = self.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - torch_input = self.tokenizer([text], return_tensors="pt") - input_ids = torch_input["input_ids"] - request = Request() - request.input_text = prompt - request.input_ids = input_ids - request.id = req_id - request.in_tokens = input_ids.shape[-1] - request.json_schema = schemas[idx] - requests.append(request) - req_id += 1 - - # 
prepare model loader - safe_model_name = str(self.model_name).replace("/", "_") - model_loader = allspark.HuggingFaceModel( - self.model_path, - safe_model_name, - in_memory_serialize=False, - user_set_data_type=data_type, - trust_remote_code=True, - ) - - self.engine = allspark.Engine() - - # load and serialize model - model_loader.load_model(direct_load=False, load_format="auto") - model_loader.serialize_to_path( - self.engine, - model_output_dir=self.model_path, - enable_quant=False, - weight_only_quant=False, - skip_if_exists=True, - ) - - # prepare config - runtime_cfg_builder = model_loader.create_reference_runtime_config_builder( - safe_model_name, TargetDevice.CUDA, device_ids=[0], max_batch=self.batch_size - ) - runtime_cfg_builder.max_length(10240) - runtime_cfg_builder.prefill_cache(False) - self.runtime_cfg = runtime_cfg_builder.build() - - gen_cfg_updates = { - "temperature": 0.1, - "top_k": 1, - "top_p": 0.1, - #"seed": 1234, - "max_length": 10240, - #"repetition_penalty": 1.05, - #"length_penalty": 1.0, - } - self.gen_cfg_builder = model_loader.create_reference_generation_config_builder( - self.runtime_cfg - ) - self.gen_cfg_builder.update(gen_cfg_updates) - - # build model - self.engine.build_model_from_config_struct(self.runtime_cfg) - self.assertEqual( - self.engine.start_model(self.runtime_cfg.model_name), AsStatus.ALLSPARK_SUCCESS - ) - - self.run_allspark_continuous_batch(requests) - # self.run_allspark_no_batching(requests) - - self.assertEqual( - self.engine.stop_model(self.runtime_cfg.model_name), AsStatus.ALLSPARK_SUCCESS - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/python/gpu/test_12_dynamic_quant.py b/tests/python/gpu/test_12_dynamic_quant.py deleted file mode 100644 index ddb4b1e3..00000000 --- a/tests/python/gpu/test_12_dynamic_quant.py +++ /dev/null @@ -1,178 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. 
- @file test_12_dynamic_quant.py -''' -import os -import gc -import time -import unittest - -import modelscope - -from dashinfer import allspark -from test_utils import LevenshteinCompare, CosineCompare, JaccardCompare, GenerateResultCompare -from test_util_infer import func_test_model_with_reference - -# check log by: -# cat log | grep "output text" -A3 -B6 - - -################################ -# Supported Quant Config. -################################ -# simplied quant config with per-channel int8 -simpled_a16w8_per_channel_customized_quant_config = { - "quant_method": "instant_quant", - "weight_format": "int8"} - -simpled_fp8a8w8_per_tensor_customized_quant_config = { - "quant_method": "instant_quant", - "weight_format": "fp8_e4m3", - "compute_method": "activate_quant"} - -simple_a16w8_group128_customized_quant_config = { - "quant_method": "instant_quant", - "weight_format": "int8", - "group_size": 128} - - -simpled_a8w8_customized_quant_config = { - "quant_method": "instant_quant", - "weight_format": "int8", - "compute_method" : "activate_quant"} - -simpled_a16w4_customized_quant_config = { - "quant_method": "instant_quant", - "weight_format": "uint4"} - -##################### -# Unsupported Config. 
-##################### - -simpled_a8w8_group_128_customized_quant_config = { - "quant_method": "instant_quant", - "weight_format": "int8", - "group_size": 128, - "compute_method": "activate_quant"} - -#################################### - - -similarity_test_cases = { - "qwen/Qwen-7B-Chat": - {"model_name": "qwen/Qwen-7B-Chat", "input": [ - "帮我用华丽的词藻润色夸奖文案,一定要非常浮夸,天花乱坠一点,可以简短一些。下面是我的原始文本:没有你的参与项目就失败了"], - "reference": [ - "你犹如璀璨星辰,照亮了我们的项目之路;你的存在,如同瑰宝般珍贵,让我们的项目熠熠生辉。没有你的参与,我们的项目就如同失去灵魂的躯壳,注定走向失败。感谢你的付出和努力,让我们能够成功完成这个项目。你是我们团队的灵魂,是我们项目的无价之宝"], - "lang": "zh", - "compare": CosineCompare(), "threshold": 0.1 - }, - "qwen/Qwen2-7B-Instruct": - {"model_name": "qwen/Qwen2-7B-Instruct", "input": ["静夜思这首诗是谁写的?只回答作者名字。"], - "reference": ["李白<|im_end|>"], - "lang": "zh", - "compare": LevenshteinCompare(), "threshold": 0.8 - }, - "Qwen/Qwen2.5-7B-Instruct-weight-only": - {"model_name": "Qwen/Qwen2.5-7B-Instruct", "input": [ - "帮我用华丽的词藻润色夸奖文案,一定要非常浮夸,天花乱坠一点,可以简短一些。下面是我的原始文本:没有你的参与项目就失败了"], - "reference": [ - "没有您的鼎力相助,此项目恐将陷入万劫不复之境,最终化为乌有!<|im_end|>"], - "generation_params": {"top_k": 20, "top_p": 0.8, "repetition_penalty": 1.1, "temperature": 0.7, "seed": 1234}, - "lang": "zh", - "compare": CosineCompare(), "threshold": 0.1 - }, - "Qwen/Qwen2.5-7B-Instruct-active-quant": - {"model_name": "Qwen/Qwen2.5-7B-Instruct", "input": [ - "帮我用华丽的词藻润色夸奖文案,一定要非常浮夸,天花乱坠一点,可以简短一些。下面是我的原始文本:没有你的参与项目就失败了"], - "reference": [ - "没有您的鼎力相助与倾情投入,此项目早已在无形中折翼沉沦,化为乌有矣!<|im_end|>"], - "generation_params": {"top_k": 20, "top_p": 0.8, "repetition_penalty": 1.1, "temperature": 0.7, "seed": 1234}, - "lang": "zh", - "compare": CosineCompare(), "threshold": 0.1 - }, - "Qwen/Qwen2.5-7B-Instruct-active-quant-fp8": - {"model_name": "Qwen/Qwen2.5-7B-Instruct", "input": [ - "帮我用华丽的词藻润色夸奖文案,一定要非常浮夸,天花乱坠一点,可以简短一些。下面是我的原始文本:没有你的参与项目就失败了"], - "reference": [ - "没有您的鼎力相助与倾情投入,此项目早已在无形中折翼沉沦,化为乌有矣!<|im_end|>"], - "generation_params": {"top_k": 20, "top_p": 0.8, "repetition_penalty": 1.1, "temperature": 0.7, "seed": 1234}, 
- "lang": "zh", - "compare": CosineCompare(), "threshold": 0.0 - }, - "Qwen/Qwen2.5-7B-Instruct-a16w4": - {"model_name": "Qwen/Qwen2.5-7B-Instruct", "input": [ - "帮我用华丽的词藻润色夸奖文案,一定要非常浮夸,天花乱坠一点,可以简短一些。下面是我的原始文本:没有你的参与项目就失败了"], - "reference": [ - "没有您的鼎力支持,此项目犹如一叶扁舟在惊涛骇浪中沉没,幸得巨浪化为甘霖,方得凤凰涅槃,重获新生。您之贡献,犹如璀璨星辰点缀夜空,令整个宇宙熠熠生辉。<|im_end|>"], - "lang": "zh", - "compare": CosineCompare(), "threshold": 0.0 - }, -} - - -class ModelSimilarityTest(unittest.TestCase): - def setUp(self): - self.similarity_test_cases = similarity_test_cases - self.engine = allspark.Engine() - # 创建模型实例 - - def tearDown(self): - self.engine = None - gc.collect() - - def func_test_model_with_reference(self, test_dict, init_quant=False, test=None, weight_only_quant=True) -> float: - # self.engine = None - # let engine destroy, free all resources. - # gc.collect() - # full gc, make engine destroy called. - return func_test_model_with_reference(test_dict, self.engine, init_quant, test, - weight_only_quant=weight_only_quant) - - - - def test_a16w8_per_chn_qwen1(self): - func_test_model_with_reference(similarity_test_cases["qwen/Qwen-7B-Chat"], init_quant=True, - weight_only_quant=True, device_list=[0], quant_config=simpled_a16w8_per_channel_customized_quant_config, test=self) - - def test_a16w8_per_chn_qwen25(self): - func_test_model_with_reference(similarity_test_cases["Qwen/Qwen2.5-7B-Instruct-weight-only"], init_quant=True, - weight_only_quant=True, device_list=[0], quant_config=simpled_a16w8_per_channel_customized_quant_config, - test=self) - def test_a16w8_per_chn_kv8_qwen25(self): - func_test_model_with_reference(similarity_test_cases["Qwen/Qwen2.5-7B-Instruct-weight-only"], init_quant=True, - weight_only_quant=True, device_list=[0], quant_config=simpled_a16w8_per_channel_customized_quant_config, - test=self) - - def test_a16w8_per_chn_kv4_qwen25(self): - func_test_model_with_reference(similarity_test_cases["Qwen/Qwen2.5-7B-Instruct-weight-only"], init_quant=True, - weight_only_quant=True, 
device_list=[0], quant_config=simpled_a16w8_per_channel_customized_quant_config, cache_quant_mode="8", - test=self) - - def test_a16w8_sub_chn_qwen25(self): - func_test_model_with_reference(similarity_test_cases["Qwen/Qwen2.5-7B-Instruct-weight-only"], init_quant=True, - weight_only_quant=True, device_list=[0], quant_config=simple_a16w8_group128_customized_quant_config, test=self) - - def test_a8w8_per_chn_qwen25(self): - func_test_model_with_reference(similarity_test_cases["Qwen/Qwen2.5-7B-Instruct-active-quant"], init_quant=True, - weight_only_quant=True, device_list=[0], quant_config=simpled_a8w8_customized_quant_config, test=self) - - def test_fp8_a8w8_per_tensor_qwen25(self): - import torch - if torch.cuda.get_device_capability()[0] == 9 or (torch.cuda.get_device_capability()[0] == 8 and torch.cuda.get_device_capability()[1] == 9): - func_test_model_with_reference(similarity_test_cases["Qwen/Qwen2.5-7B-Instruct-active-quant-fp8"], init_quant=True, - weight_only_quant=True, device_list=[0], quant_config=simpled_fp8a8w8_per_tensor_customized_quant_config, test=self) - else: - pass - - def test_a16w4_per_chn_qwen25_bf16(self): - func_test_model_with_reference(similarity_test_cases["Qwen/Qwen2.5-7B-Instruct-a16w4"], init_quant=True, - weight_only_quant=True, device_list=[0], quant_config=simpled_a16w4_customized_quant_config, test=self) - - def test_a16w4_sub_chn_qwen25(self): - # TODO: add expect exception test case. - pass - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/python/gpu/test_di_model_config.py b/tests/python/gpu/test_di_model_config.py deleted file mode 100644 index ddaca218..00000000 --- a/tests/python/gpu/test_di_model_config.py +++ /dev/null @@ -1,66 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. 
- @file test_di_model_config.py -''' -import gc -import time -import unittest - -import modelscope - -class DIModelConfigTest(unittest.TestCase): - def setUp(self): - pass - - def tearDown(self): - pass - - - def test_yaml_dict(self): - import sys - from ruamel.yaml import YAML - - yaml_str = """\ - first_name: Art - occupation: Architect # This is an occupation comment - about: Art Vandelay is a fictional character that George invents... - """ - - yaml = YAML() - data = yaml.load(yaml_str) - data.insert(0, 'first_name', 'some name', comment="name comments") - data.insert(1, 'last name', 'Vandelay', comment="new key") - - - yaml.dump(data, sys.stdout) - - - import sys - import ruamel.yaml - - yaml = ruamel.yaml.YAML() # defaults to round-trip - - inp = """\ - abc: - - a # comment 1 - xyz: - a: 1 # comment 2 - b: 2 - c: 3 - d: 4 - e: 5 - f: 6 # comment 3 - """ - - data = yaml.load(inp) - data['abc'].append('b') - data['abc'].yaml_add_eol_comment('comment 4', 1) # takes column of comment 1 - data['xyz'].yaml_add_eol_comment('comment 5', 'c') # takes column of comment 2 - data['xyz'].yaml_add_eol_comment('comment 6', 'e') # takes column of comment 3 - data['xyz'].yaml_add_eol_comment('comment 7\n\n# that\'s all folks', 'd', column=20) - - yaml.dump(data, sys.stdout) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/python/gpu/test_di_model_config_01.yaml b/tests/python/gpu/test_di_model_config_01.yaml deleted file mode 100644 index c94205a6..00000000 --- a/tests/python/gpu/test_di_model_config_01.yaml +++ /dev/null @@ -1,191 +0,0 @@ -# this config is describing model's basic information without depends on transfomers python frameworks etc. -# for easier setup and correct inference result. - -# model meta info part, usually imported from huggingface config.json -# example for qwen1.5 1.8B -model: - huggingface_id: Qwen/Qwen1.5-1.8B-Chat # optional, huggingface id can use to tokenizer etc. 
- modelscope_id: Qwen/Qwen1.5-1.8B-Chat # optional, modelscope id tokenizer id - architectures: - - Qwen2ForCausalLM - hidden_size: 2048 - max_position_embeddings: 32768 - num_attention_heads: 16 - num_hidden_layers: 24 - num_key_value_heads: 16 - rms_norm_eps: 1e-06 - rope_theta: 1000000.0 - sliding_window: 32768 - use_sliding_window: FALSE - vocab_size: 151936 - - quantization_config: - # enable quantization or not - enable: FALSE - - # quant activate format: - # this string describe which compute format should choose, which only work if compute mode == activate_quant - # currently following settings was supported: - # - int8 : signed int with 8 bit width store the weight and compute[if disable weight only] - # - fp8_e4m3 : nvidia fp8 e4m3 for weight store and compute # currently not supported yet. - activate_format: int8 - - # quant weight format: - # this string describe which weight width - # currently following settings was supported: - # - int8 : signed int with 8 bit width store the weight and compute[if disable weight only] - # - uint4 : unsigned int with 4 bit width store the weight and compute[if disable weight only] - # - fp8_e4m3 : nvidia fp8 e4m3 for weight store and compute # currently not supported yet. - weight_format: int8 - - # compute method: - # choose which compute method - # - weight_only: means the activation use fp16 and weight stored by `quant_format` - # - activate_quant: means use compute quantization, the activation and compute use the `quant_format`, **if choose compute, only `-1` is supported in group_size setting** - compute_method: weight_only # choose between ["weight_only", "activate_quant"] - - # group size: - # this setting means the group size per-token quantization. - # -1: means one token use one quantization parameters(one scale, and zero point), this means per-token/per-channel quantization, this setting was recommended for most case. 
- # 128: means each 128 elements in GEMM share one group of quantization parameters(one scale, one zero point), - # other_value: value in [32, 64, 128, 256, 512] is supported, other value is not supported in kernel. - # how to choose gorup size: less group size means more quantization parameter, usually means less precision drop, but less performance. - group_size: -1 - - # detail group setting: - # the customized group setting for each layer, different layer use different group setting. - # such as : {r"\.c_proj.weight": 32, r".*": 128}, means some layer use 32 group , some layer use 128 group. - group_settings: "" - - # quant method setting: - # choose which quantization method use. - # - instant_quant : this value is dynamic quantization provided by allspark engine, it doesn't require Post-Training [suggested] - # - gptq: model was fine-tuned by GPTQ method. - # - awq: model was fine-tuned by awq method, currently not supported yet. - quant_method: "instant_quant", # choose between ["gptq", "instant_quant"] - - - -# TODO:add lora. - -# runtime config part -runtime_config: - model_name: default_model_id # the model's ID in engine, - - # config part for compute device CPU and GPU - compute_unit: - # device type can be [CUDA, CPU, CPU_NUMA] - device_type: CUDA - - # for CUDA, the len(device_ids) is card number, and compute_thread_in_device can be ignored. - # for CPU, the len(device_ids) is ignored, - # - # for CPU_NUMA, the device_ids is NUMA id's, and len(device_ids) is NUMA Count - device_ids: - - 0 - - 1 - - # compute_thread_in_device only works for CPU - # if device_type is CPU: compute_thread_in_device is how many compute thread when inference, - # suggest value is physical core number(without hyper-thread), or you can pass 0 to let autodetect. - # if device_type is CPU_NUMA: compute_thread_in_device means compute thread inside NUMA, - # suggest value is physical core number(without hyper-thread), or you can pass 0 to let autodetect. 
- compute_thread_in_device: 0 - - # max length of engine support: for input+output, will allocate resource and warm up by this length. - engine_max_length: 2048 - - # max batch or concurrency supported by this engine, will reject the request if this size meets. - engine_max_batch: 8 - - # kv-cache mode, choose between : [AsCacheDefault, AsCacheQuantI8, AsCacheQuantU4] - # which means : - # - AsCacheDefault - FP16 or BF16 KV-Cache - # - AsCacheQuantI8 - int8 KV-Cache - # - AsCacheQuantU4 - uint4 KV-Cache - kv_cache_mode: AsCacheDefault - - # how to choose eviction request when kv-cache is full for GPU - eviction_strategy : MaxLength - - # prefill prefix caching function related settings, if you have lots of common prefix in prompts, this function is strongly suggested - enable_prefix_cache: TRUE - - # try sparse compute, this config will enable sparse compute, it will try to check the weight enable sparse compute - # currently, the nvidia GPU's 2:4 sparse is supported, it will check weight, whether this weight satisfy nvidia's 2:4 sparse rule - # value can be: - # - none - use dense tensor core - # - nv_tensor_sparse_2_4 - use 2:4 sparse tensor core. - sparse_compute: none - - cuda_mem: - # how many gpu memory allocated by this engine is caculated by this formula: - # (TOTAL_DEVICE_GPU_MEM_MB - RESERVE_SIZE_MB) * MEMORY_RATIO - # TOTAL_DEVICE_GPU_MEM_MB - how many gpu memory you device have, if multiple devices have different memory, - # will choose the least one - # RESERVE_SIZE_MB - defined by reserve_size_mb's value. - # MEMORY_RATIO - defined by memory_ratio's value - - # for cuda device, this ratio is how many should engine allocate memory, - # this config will override the BFC_MEM_RATIO envvar. - # set this to -1 means use "BFC_MEM_RATIO" env var's settings. - # comment out following config will use system env, otherwise it will override system device. 
- # memory_ratio: 0.96 - - # for cuda device, this config is for setup how many memory should reserve in Mega-Bytes(MB) - # reserve_size_mb: 600 - - # settings for rope scaling method. - rope_scaling: - type: yarn - # key-value config for each repo scaling method. - # this key-value config will forward to engine's runtime config - scaling_config: # eg yarn - factor: 4.0, - original_max_position_embeddings: 32768 - - # TODO: add d-ntk exmaple - -# generation config part -generation_config: - bos_token_id: 151643 - pad_token_id: 151643 - do_sample: TRUE - early_stopping: TRUE - - eos_token_id: - - 151645 - - 151643 - - seed: 13653 # default seed. - - top_k: 20 - repetition_penalty: 1.1 - length_penalty: 0.0 - presence_penalty: 1.0 - top_p: 0.8 - logprobs: FALSE - top_logprobs: 4 - - min_length: 2 - - # max length for this request, input + output, if max length is reached, engine will finish this request. - # this value should update together with runtime_config.engine_max_length and input length. - # the config value is just for a default value. - max_length: 2048 - - - -# quantization part -## note: quantization config only works on serialization step, so for a *.dimodel, -## serialization config will only for display information, it cannot change the behavior - -# tokenizer part -# it support use model hub's AutoTokenizer to set up tokenizer, also can use path to download -tokenizer: - source: modelscope # support [modelscope, huggingface, local] - # will report error if huggingface_id, or modelscope_id is not set correctly. - local_path: USER_DEFINE_TOKENIZER_PATH # require this path can be called by transformers.AutoTokenizer() - - - diff --git a/tests/python/gpu/test_runtime_config.py b/tests/python/gpu/test_runtime_config.py deleted file mode 100644 index a6d1b712..00000000 --- a/tests/python/gpu/test_runtime_config.py +++ /dev/null @@ -1,48 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. 
- @file test_runtime_config.py -''' -# 引入必要的类和方法 -from dashinfer.allspark._allspark import AsModelConfig - -from dashinfer.allspark.runtime_config import AsModelRuntimeConfigBuilder -import unittest - -# 定义测试类 -class TestAsModelRuntimeConfigBuilder(unittest.TestCase): - - # 定义测试'from_dict'方法的正常行为 - def test_from_dict_normal_behavior(self): - # 构造预期结果 - expected_model_config = AsModelConfig() - expected_model_config.model_name = "test_model" - expected_model_config.compute_unit = "CUDA:0,1" - expected_model_config.engine_max_length = 100 - expected_model_config.engine_max_batch = 32 - expected_model_config.num_threads = 2 - - # 构造输入字典 - input_dict = { - 'model_name': 'test_model', - 'compute_unit': { - 'device_type': 'cuda', - 'device_ids': [0, 1], - 'compute_thread_in_device': 2 - }, - 'engine_max_length': 100, - 'engine_max_batch': 32 - } - - # 创建Builder实例并调用from_dict方法 - builder = AsModelRuntimeConfigBuilder() - builder.from_dict(input_dict) - - # 获取实际结果并进行断言 - actual_model_config = builder.build() # Assuming there is a to_config method to get the final config - print("expect model config: ", expected_model_config) - print("model config : ", actual_model_config) - self.assertEqual(str(expected_model_config), str(actual_model_config)) - -# 运行测试 -if __name__ == '__main__': - unittest.main() diff --git a/tests/python/gpu/test_util_infer.py b/tests/python/gpu/test_util_infer.py deleted file mode 100644 index bcb93ec5..00000000 --- a/tests/python/gpu/test_util_infer.py +++ /dev/null @@ -1,269 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. 
- @file test_util_infer.py -''' - - -import os -import unittest -import oss2 -from oss2.credentials import EnvironmentVariableCredentialsProvider -from concurrent.futures import ThreadPoolExecutor -import configparser - -import modelscope - -from dashinfer import allspark -from dashinfer.allspark.engine import TargetDevice, RequestStatInfo -from dashinfer.allspark.engine import RoPEScaleMethod -from dashinfer.allspark.prompt_utils import PromptTemplate -from test_utils import LevenshteinCompare, CosineCompare, JaccardCompare, GenerateResultCompare -from dashinfer.allspark._allspark import AsStatus, GenerateRequestStatus, AsCacheMode - - -def get_auth(): - auth = None - endpoint = None - if os.environ.get("OSS_ACCESS_KEY_ID"): - auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider()) - endpoint = os.environ.get("OSS_ENDPOINT") - else: - config_path = os.path.expanduser('~/.ossutilconfig') - if not os.path.exists(config_path): - return None - cfg = configparser.ConfigParser() - cfg.read(config_path) - if 'Credentials' not in cfg: - return None - conf = cfg['Credentials'] - access_id = conf.get('accessKeyID') - access_secret = conf.get('accessKeySecret') - if access_id is None or access_secret is None: - return None - endpoint = conf.get('endpoint') - auth = oss2.Auth(access_id, access_secret) - return auth, endpoint - -def download_folder_from_oss(oss_folder_key, output_dir, max_workers=10, bucket="hci-team-private"): - auth, endpoint = get_auth() - - print(f"Download model to {output_dir}", flush=True) - bucket = oss2.Bucket(auth, endpoint, bucket) - - def download_object(object_info): - try: - object_name = object_info.key - local_name = object_name[len(oss_folder_key):] - local_file_path = os.path.join(output_dir, local_name) - bucket.get_object_to_file(object_name, local_file_path) - except Exception as e: - print(e) - - with ThreadPoolExecutor(max_workers=max_workers) as executor: - for object_info in oss2.ObjectIterator(bucket, 
prefix=oss_folder_key): - executor.submit(download_object, object_info) - -def func_test_model_with_reference(test_dict, engine=None, init_quant=False, weight_only_quant=False, test:unittest.TestCase=None, - ms_download=True, in_memory=True, user_set_data_type="bfloat16", - quant_config = None, - cache_quant_mode = "16", - set_engine_max_length = 2048, - set_prefill_cache=False, - user_runtime_config_dict = {}, - model_local_path="", direct_load = False, load_format = None, device_list=[0, 1], enable_sparsity_matmul=False) -> float: - if load_format: - assert direct_load == True - tmp_dir = "model_output" - print(test_dict) - print(quant_config) - print(f"kv cache bit width: {cache_quant_mode}") - modelscope_name = test_dict["model_name"] - compare: GenerateResultCompare = test_dict["compare"] - threshold = test_dict["threshold"] - gen_params_update = test_dict.get("generation_params", None) - - - input_list = test_dict["input"] - reference_list = test_dict["reference"] - # download model - model_model_path = "" - if ms_download: - model_model_path = modelscope.snapshot_download(modelscope_name) - else: - model_model_path = model_local_path - - safe_model_name = str(modelscope_name).replace("/", "_") - - model_loader = allspark.HuggingFaceModel(model_model_path, safe_model_name, in_memory_serialize=in_memory, - user_set_data_type=user_set_data_type, - trust_remote_code=True) - if engine is None: - engine = allspark.Engine() - # convert model - - (model_loader.load_model(direct_load=direct_load, load_format=load_format) - .read_model_config() - # .set_rope_scale_method(RoPEScaleMethod.YaRN, {"factor": 4.0, "original_max_position_embeddings": 32768, "type": "yarn"}) - .serialize(engine, model_output_dir=tmp_dir, enable_quant=init_quant, customized_quant_config=quant_config, weight_only_quant=weight_only_quant) - .free_model()) - - runtime_cfg_builder = model_loader.create_reference_runtime_config_builder(safe_model_name, TargetDevice.CUDA, - device_list, max_batch=1) 
- # like change to engine max length to a smaller value - runtime_cfg_builder.max_length(set_engine_max_length) - runtime_cfg_builder.prefill_cache(set_prefill_cache) - runtime_cfg_builder.enable_sparsity_matmul(enable_sparsity_matmul) - - runtime_cfg_builder.update_from_dict(user_runtime_config_dict) - - # like enable int8 kv-cache or int4 kv cache rather than fp16 kv-cache - if cache_quant_mode != "16": - if cache_quant_mode == "8": - runtime_cfg_builder.kv_cache_mode(AsCacheMode.AsCacheQuantI8) - elif cache_quant_mode == "4": - runtime_cfg_builder.kv_cache_mode(AsCacheMode.AsCacheQuantU4) - - runtime_cfg = runtime_cfg_builder.build() - # install model to engine - engine.install_model(runtime_cfg) - - # if in memory serialize, after install model, the serialize file can be free. - if in_memory: - model_loader.free_memory_serialize_file() - - # start the model inference - engine.start_model(safe_model_name) - - for i in range(len(input_list)): - input_str_origin = input_list[i] - reference_text = reference_list[i] - input_str = PromptTemplate.apply_chatml_template(input_str_origin) - # generate a reference generate config. - gen_cfg = model_loader.create_reference_generation_config_builder(runtime_cfg) - # change generate config base on this generation config, like change top_k = 1 - gen_cfg.update({"top_k": 1}) - if gen_params_update: - gen_cfg.update(gen_params_update) - - try: - - messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": input_str_origin} - ] - templated_str = input_str_origin - try: - templated_str = model_loader.init_tokenizer().get_tokenizer().apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - except: - print("Exception on token chat template.") - templated_str = PromptTemplate.apply_chatml_template(input_str_origin) - - print(f"our template: {templated_str} input text: {input_str}") - - # 1. use text interface do inference. 
- status, handle, queue = engine.start_request_text(safe_model_name, - model_loader, - templated_str, - gen_cfg) - if test: - test.assertEqual(status, AsStatus.ALLSPARK_SUCCESS) - - # sync will wait request finish, like a sync interface, but you can async polling the queue. - # without this call, the model result will async running, result can be fetched by queue - # until queue status become generate finished. - - engine.sync_request(safe_model_name, handle) - - # after sync, you can fetch all the generated id by this api, this api is a block api - # will return when there new token, or generate is finished. - generated_elem = queue.Get() - # after get, engine will free resource(s) and token(s), so you can only get new token by this api. - - stat_dict = queue.RequestStatInfo() - - test.assertGreater(len(stat_dict.keys()), 1) - - req_stat = RequestStatInfo(stat_dict) - print(f"request state {req_stat}, request key: {stat_dict}") - - # de-tokenize id to text - output_text = model_loader.init_tokenizer().get_tokenizer().decode(generated_elem.ids_from_generate) - print("---" * 20) - print( - f"test case: {modelscope_name} {quant_config} kv-cache: {cache_quant_mode} input:\n{input_str} \n output text:\n{output_text}\n reference:\n{reference_text}") - print(f"input token:\n {model_loader.init_tokenizer().get_tokenizer().encode(input_str)}") - print(f"output token:\n {generated_elem.ids_from_generate}") - print("---" * 20) - # compare them with reference, and compare method. - sim = compare.normal_similarity(output_text, reference_text, lang=test_dict["lang"]) - print(f"{modelscope_name} sim: {sim} threshold: {threshold}") - if (test): - test.assertGreaterEqual(sim, threshold, "similarity is not ok") - - engine.release_request(safe_model_name, handle) - - # inference method 2: send id request. 
- tokenizer = model_loader.init_tokenizer().get_tokenizer() - encode_ids = tokenizer.encode(templated_str) - - print(encode_ids) - if test: - test.assertEqual(type(encode_ids), type([])) - status2, handle2, queue2 = engine.start_request_ids(safe_model_name, - model_loader, - encode_ids, - gen_cfg) - if test: - test.assertEqual(AsStatus.ALLSPARK_SUCCESS, status2) - - generated_ids2 = [] - # async fetch output result. - # looping until status is not ok - print(f"2 request: status: {queue2.GenerateStatus()}") - status = queue2.GenerateStatus() - - ## in following 3 status, it means tokens are generating - while (status == GenerateRequestStatus.Init - or status == GenerateRequestStatus.Generating - or status == GenerateRequestStatus.ContextFinished): - print(f"request: status: {queue2.GenerateStatus()}") - elements = queue2.Get() - if elements is not None: - print(f"new token: {elements.ids_from_generate}") - generated_ids2 += elements.ids_from_generate - status = queue2.GenerateStatus() - if status == GenerateRequestStatus.GenerateFinished: - break - # This means generated is finished. - if status == GenerateRequestStatus.GenerateInterrupted: - break - # This means the GPU has no available resources; the request has been halted by the engine. - # The client should collect the tokens generated so far and initiate a new request later. 
- - if test: - test.assertEqual(queue2.GenerateStatus(), GenerateRequestStatus.GenerateFinished) - print(f"generated id: {queue2.GenerateStatus()} {generated_ids2}") - - output_text2 = model_loader.init_tokenizer().get_tokenizer().decode(generated_ids2) - sim = compare.normal_similarity(output_text2, reference_text, lang=test_dict["lang"]) - - print("---" * 20) - - print( - f"[ids] [async] test case: {modelscope_name} quant_config:{quant_config} input:\n{input_str} \n output:\n{output_text2}\n reference:\n{reference_text} simi:{sim}") - print(f"input token-2:\n {model_loader.init_tokenizer().get_tokenizer().encode(input_str)}") - print(f"output token-2:\n {generated_ids2}") - print("---" * 20) - - print(f"{modelscope_name} sim_for_2: {sim} threshold: {threshold}") - if test: - test.assertGreaterEqual(sim, threshold, "similarity is out of range") - - engine.release_request(safe_model_name, handle2) - # inference method 3: send a multimedia request - finally: - engine.stop_model(safe_model_name) - engine.release_model(safe_model_name) - # let - # FIXME: release module will hang. - return sim diff --git a/tests/python/gpu/test_utils.py b/tests/python/gpu/test_utils.py deleted file mode 100644 index 5a0527ea..00000000 --- a/tests/python/gpu/test_utils.py +++ /dev/null @@ -1,137 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. 
- @file test_utils.py -''' -""" From GPT4: -在评估大型语言模型(LLM)生成的文本时,选择哪种相似度度量取决于您希望评估的内容类型和相似度方面。以下是关于何时使用这些度量的一些指导: - -### Levenshtein距离(编辑距离) - -### 余弦相似度 -- 适合衡量两个句子或文档在语义层面的相似度,通常用于高维向量空间模型,如TF-IDF或词嵌入(word embeddings)表示的文本。 -- 余弦相似度评估了向量化表示的文本之间的角度,这更关心文档或句子在话题或含义上的接近程度。 -- 适用于评估生成文本与参考标准在整体意义上的接近性,尤其是在生成长篇文章或段落时。 - -在实际应用中,为了全面评估语言模型生成结果的质量,可能需要结合多种度量,或者选择在特定任务中表现良好的度量来评估: - -- 对于生成的结构需要高度精确,如代码或公式,可以考虑Levenshtein距离。 -- 对于关注文本关键内容的生成,如摘要或关键词提取,可以考虑Jaccard相似度。 -- 对于评估表达相同意思的不同方式的生成,如文章、故事或对话,余弦相似度可能是更好的选择。 - -最终,无论选择哪种度量,最重要的是确保它反映了您的语言模型优化和评估目标。在某些情况下,还可能需要开发专门的评估方法来捕捉任务所需的特定文本属性。 - -To Run test test: it require install : - - ``` - pip install python-Levenshtei - pip install scikit-learn - ``` - -""" - -from typing import List - - -class JiebaTokenizer: - @classmethod - def tokenize(cls, text: str)-> List[str]: - import jieba - return jieba.lcut(text) - - -class GenerateResultCompare: - @staticmethod - def tokenize(text: str, lang: str) -> List[str]: - if lang == "zh": - return JiebaTokenizer.tokenize(text) - else: - return text.split() - - def normal_similarity(self, gen_result: str, ref: str, lang: str = "en") -> float: - raise NotImplementedError("base class not implement this function, use sub class.") - - -class LevenshteinCompare(GenerateResultCompare): - """ - 编辑距离测量从一个字符串转换到另一个字符串所需的最少单字符编辑操作数。每个操作包括插入、删除或替换一个字符。Python有多个库可以计算编辑距离,如python-Levenshtein。 - 安装python-Levenshtein库: - "require pip install python-Levenshtein" - - - 更适合评估生成的文本在单词层面上的精确度,例如拼写检查或OCR的输出。 - - 它对于单词的添加、删除或替换非常敏感,如果你的任务需要在字符层面上的准确度比较,编辑距离是个不错的选择。 - - 不适用于评估语义内容,因为即使两个句子语义相近但单词使用不同,编辑距离也可能很高。 - """ - - def normal_similarity(self, gen_result: str, ref: str, lang: str = "en") -> float: - """ - Args: - gen_result: - ref: - - Returns: [0,1.0] 0 means totally different strings, 1 means totally identical strings - """ - import Levenshtein - # 计算编辑距离 - distance = Levenshtein.distance(gen_result, ref) - - # 取两个字符串中较长的长度 - max_length = max(len(gen_result), len(ref)) - - # 
防止除以0,如果两个字符串都为空,我们可以定义他们是完全相同的 - if max_length == 0: - raise ValueError("two empty strings") - - # 计算标准化编辑距离 - normalized_distance = distance / max_length - return 1.0 - normalized_distance - - -class JaccardCompare(GenerateResultCompare): - """ - 适合评估两个句子在词汇层面的重叠情况,意在衡量它们在词集层面的相似性。它忽略了单词顺序和频率的因素,用于评估内容相似性而非形式上的精确重现。 - 对于短文本或关键词提取等任务比较合适。 - """ - - def normal_similarity(self, gen_result: str, ref: str, lang: str = "en") -> float: - """ - Args: - gen_result: genereate result - ref: reference - - Returns: [0,1.0] 0 means totally different strings, 1 means totally identical strings - """ - - def jaccard_similarity(str1, str2): - words_str1 = set(self.tokenize(str1, lang=lang)) - words_str2 = set(self.tokenize(str2, lang=lang)) - return len(words_str1.intersection(words_str2)) / len(words_str1.union(words_str2)) - - similarity = jaccard_similarity(gen_result, ref) - return similarity - - -class CosineCompare(GenerateResultCompare): - """ - 余弦相似度评估了向量化表示的文本之间的角度,这更关心文档或句子在话题或含义上的接近程度。 - 适用于评估生成文本与参考标准在整体意义上的接近性,尤其是在生成长篇文章或段落时。 - """ - - def normal_similarity(self, gen_result: str, ref: str, lang: str = "en") -> float: - """ - Args: - gen_result: genereate result - ref: reference - - Returns: [0,1.0] 0 means totally different strings, 1 means totally identical strings - """ - from sklearn.feature_extraction.text import TfidfVectorizer - from sklearn.metrics.pairwise import cosine_similarity - - words_str1 = ' '.join(self.tokenize(gen_result, lang=lang)) - words_str2 = ' '.join(self.tokenize(ref, lang=lang)) - vectorizer = TfidfVectorizer() - tfidf = vectorizer.fit_transform([words_str1, words_str2]) - - similarity = cosine_similarity(tfidf)[0][1] - return similarity - diff --git a/tests/python/x86/dashinfer b/tests/python/x86/dashinfer deleted file mode 120000 index 39c1d414..00000000 --- a/tests/python/x86/dashinfer +++ /dev/null @@ -1 +0,0 @@ -/root/workspace/DashInfer/python/build/lib.linux-x86_64-cpython-310/dashinfer \ No newline at end of file diff --git 
a/tests/python/x86/test_11_qwen2_json.py b/tests/python/x86/test_11_qwen2_json.py deleted file mode 100644 index f9a94ff5..00000000 --- a/tests/python/x86/test_11_qwen2_json.py +++ /dev/null @@ -1,420 +0,0 @@ -''' - Copyright (c) Alibaba, Inc. and its affiliates. - @file test_11_qwen2_json.py -''' -import os -import gc -import torch -import unittest -from concurrent.futures import ThreadPoolExecutor -import modelscope -import time -import sys -from typing import Any, Optional, List -from dataclasses import field - -from dashinfer import allspark -from dashinfer.allspark.engine import TargetDevice -from dashinfer.allspark import AsStatus -from transformers import AutoTokenizer - -schemas = [] -schemas.append(r''' - { - "properties": { - "公司名称": { - "type": "string" - }, - "founding year": { - "type": "integer" - }, - "founding person": { - "type": "string" - }, - "founding city": { - "type": "string" - }, - "employees": { - "type": "integer" - } - }, - "required": [ - "公司名称", - "founding year", - "founding person", - "founding city", - "employees" - ], - "type": "object" - } -''') -schemas.append(r''' - { - "title": "Complex JSON Schema", - "description": "A very complex JSON schema with nested structures and multiple constraints.", - "type": "object", - "properties": { - "user": { - "type": "object", - "required": ["name", "age"], - "properties": { - "name": { - "type": "string", - "minLength": 2, - "maxLength": 50 - }, - "age": { - "type": "integer", - "minimum": 0, - "maximum": 120 - }, - "email": { - "type": "string", - "format": "email" - }, - "address": { - "type": "object", - "properties": { - "street": { - "type": "string" - }, - "city": { - "type": "string" - }, - "state": { - "type": "string" - }, - "postalCode": { - "type": "string", - "pattern": "^\\d{5}(?:-\\d{4})?$" - } - }, - "required": ["street", "city", "state", "postalCode"] - } - } - }, - "orders": { - "type": "array", - "items": { - "type": "object", - "required": ["id", "items"], - 
"properties": { - "id": { - "type": "string", - "pattern": "^[A-Z]{3}\\d{4}$" - }, - "items": { - "type": "array", - "items": { - "type": "object", - "required": ["product", "quantity"], - "properties": { - "product": { - "type": "string" - }, - "quantity": { - "type": "integer", - "minimum": 1 - }, - "price": { - "type": "number", - "minimum": 0 - } - } - } - } - } - } - }, - "preferences": { - "type": "object", - "properties": { - "language": { - "type": "string", - "enum": ["en", "es", "fr", "de", "it"] - }, - "notifications": { - "type": "boolean" - }, - "marketing": { - "type": "boolean" - } - } - }, - "metadata": { - "type": "object", - "properties": { - "created": { - "type": "string", - "format": "date-time" - }, - "lastUpdated": { - "type": "string", - "format": "date-time" - } - } - } - }, - "required": ["user", "orders"], - "additionalProperties": false - } -''') -schemas.append(r''' - { - "type": "object", - "properties": { - "value": { - "anyOf": [ - {"type": "null"}, - {"type": "number"}, - {"type": "string"}, - {"type": "boolean"}, - {"type": "object"}, - {"type": "array"} - ] - } - }, - "required": ["value"] - } -''') - -class Request: - id: int = -1 - input_text: Optional[str] = None - - # torch tensors: - input_ids = None - in_tokens = None - output_ids = None - output_tokens = None - - json_schema: Optional[str] = None - output_text: Optional[str] = None - - status: Optional[int] = None - gen_cfg: Optional[dict] = None - start_time = None - end_time = None - handle: Any = field(default=None) - queue: Any = field(default=None) - -class Qwen2_TestCase(unittest.TestCase): - def setUp(self): - self.batch_size = 3 - self.model_name = "qwen/Qwen2-0.5B-Instruct" - print("start test for ", self.model_name) - # download from modelscope - print("Downloading model from modelscope...") - self.model_path = modelscope.snapshot_download(self.model_name) - - def tearDown(self): - gc.collect() - - def print_output(self, request): - msg = 
"***********************************\n" - msg += f"* Answer for Request {request.id}\n" - msg += "***********************************\n" - msg += f"** encoded input, len: {request.in_tokens} **\n{request.input_ids}\n\n" - msg += f"** encoded output, len: {request.output_tokens} **\n{request.output_ids}\n\n" - msg += f"** text input **\n{request.input_text}\n\n" - msg += f"** text output **\n{request.output_text}\n\n" - elapsed_time = request.end_time - request.start_time - msg += f"** elapsed time **\n{elapsed_time}\n\n" - print(msg) - - - def disabled_test_model_serialize(self): - # prepare model loader - safe_model_name = str(self.model_name).replace("/", "_") - model_loader = allspark.HuggingFaceModel( - self.model_path, - safe_model_name, - in_memory_serialize=False, - user_set_data_type="float32", - trust_remote_code=True, - ) - - engine = allspark.Engine() - - # load and serialize model - model_loader.load_model(direct_load=False, load_format="auto") - model_loader.serialize_to_path( - engine, - model_output_dir=self.model_path, - enable_quant=False, - weight_only_quant=False, - skip_if_exists=True, - ) - - def process_one_request(self, request): - output_ids = [] - gen_cfg = self.gen_cfg_builder.build() - gen_cfg["response_format"] = {"type": "json_object", "json_schema": request.json_schema} - #gen_cfg["response_format"] = {"type": "json_object"} - request.start_time = time.time() - status, request_handle, result_queue = self.engine.start_request( - self.runtime_cfg.model_name, - { - "input_ids": torch.utils.dlpack.to_dlpack(request.input_ids), - }, - generate_config=gen_cfg, - ) - - if status != allspark.AsStatus.ALLSPARK_SUCCESS: - request.valid = False - print("[Error] Request failed to start!") - sys.exit(1) - - request.valid = True - request.handle = request_handle - request.queue = result_queue - request.status = int(status) - - while True: - # status = self.engine.get_request_status(self.model_name, request.queue) - status = 
request.queue.GenerateStatus() - request.status = int(status) - - if status == allspark.GenerateRequestStatus.Init: - pass - elif status == allspark.GenerateRequestStatus.Generating or status == allspark.GenerateRequestStatus.GenerateFinished: - if status == allspark.GenerateRequestStatus.GenerateFinished: - request.end_time = time.time() - # new_ids = self.engine.get_wait(self.model_name, request.queue) - generated_elem = request.queue.Get() - if generated_elem is not None: - new_ids = generated_elem.ids_from_generate - if (len(new_ids) > 0): - output_ids.extend(new_ids) - - request.output_ids = output_ids - request.output_tokens = len(output_ids) - request.output_text = self.tokenizer.decode(request.output_ids, skip_special_tokens=True) - - if status == allspark.GenerateRequestStatus.GenerateFinished: - break - elif status == allspark.GenerateRequestStatus.GenerateInterrupted: - request.valid = False - print("[Error] Request interrupted!") - break - else: - request.valid = False - print(f"[Error] Unexpected status: {status}") - break - - self.engine.release_request(self.runtime_cfg.model_name, request_handle=request.handle) - - def run_allspark_continuous_batch(self, request_list): - def done_callback(future): - request = future.argument - future.result() - self.print_output(request) - - # create a threadpool - executor = ThreadPoolExecutor(max_workers=self.batch_size) - - try: - # submit all tasks to the threadpool - futures = [] - for request in request_list: - future = executor.submit(self.process_one_request, request) - future.argument = request - future.add_done_callback(done_callback) - futures.append(future) - finally: - executor.shutdown(wait=True) - - def test_model_generate(self): - data_type = "float32" - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_path, trust_remote_code=True, padding_side="left" - ) - - sample_prompts = [ - "阿里巴巴的相关信息", - "小明在京东买了手机,他的相关信息", - "介绍一下杭州", - ] - - # prepare requests - requests = [] - req_id = 0 - for 
idx, prompt in enumerate(sample_prompts): - messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": prompt}, - ] - text = self.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - torch_input = self.tokenizer([text], return_tensors="pt") - input_ids = torch_input["input_ids"] - request = Request() - request.input_text = prompt - request.input_ids = input_ids - request.id = req_id - request.in_tokens = input_ids.shape[-1] - request.json_schema = schemas[idx] - requests.append(request) - req_id += 1 - - # prepare model loader - safe_model_name = str(self.model_name).replace("/", "_") - model_loader = allspark.HuggingFaceModel( - self.model_path, - safe_model_name, - in_memory_serialize=False, - user_set_data_type=data_type, - trust_remote_code=True, - ) - - self.engine = allspark.Engine() - - # load and serialize model - model_loader.load_model(direct_load=False, load_format="auto") - model_loader.serialize_to_path( - self.engine, - model_output_dir=self.model_path, - enable_quant=False, - weight_only_quant=False, - multinode_mode=False, - skip_if_exists=True, - ) - - # prepare config - runtime_cfg_builder = model_loader.create_reference_runtime_config_builder( - safe_model_name, TargetDevice.CPU, device_ids=[0], max_batch=self.batch_size - ) - runtime_cfg_builder.max_length(2048) - runtime_cfg_builder.prefill_cache(False) - self.runtime_cfg = runtime_cfg_builder.build() - - gen_cfg_updates = { - "temperature": 0.7, - "top_k": 20, - "top_p": 0.9, - "seed": 1234, - "max_length": 1024, - "repetition_penalty": 1.05, - "length_penalty": 1.0, - } - self.gen_cfg_builder = model_loader.create_reference_generation_config_builder( - self.runtime_cfg - ) - self.gen_cfg_builder.update(gen_cfg_updates) - - # build model - self.engine.build_model_from_config_struct(self.runtime_cfg) - self.assertEqual( - self.engine.start_model(self.runtime_cfg.model_name), AsStatus.ALLSPARK_SUCCESS - ) - - 
self.run_allspark_continuous_batch(requests) - - self.assertEqual( - self.engine.stop_model(self.runtime_cfg.model_name), AsStatus.ALLSPARK_SUCCESS - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tools/detect-lora-limit.py b/tools/detect-lora-limit.py new file mode 100644 index 00000000..2f145a21 --- /dev/null +++ b/tools/detect-lora-limit.py @@ -0,0 +1,190 @@ +''' + Copyright (c) Alibaba, Inc. and its affiliates. + @file detect-lora-limit.py +''' +import os, pdb +import multiprocessing +import sys +import torch + +from dashinfer import allspark +from dashinfer.allspark import AsStatus, AsModelConfig +from dashinfer.allspark.engine import TargetDevice +import argparse + + +global arg + + +def check_proc(lora_num, runtime_cfg_dict): + global args + model_name = runtime_cfg_dict['model_name'] + + print('trying lora_num=', lora_num) + runtime_cfg = AsModelConfig(model_name, runtime_cfg_dict['model_path'], runtime_cfg_dict['weights_path'], runtime_cfg_dict['compute_unit'], args.max_length, args.max_batch) + runtime_cfg.lora_max_rank = args.lora_max_rank + runtime_cfg.lora_max_num = lora_num + + ret = AsStatus.ALLSPARK_SUCCESS + try: + engine = allspark.Engine() + engine.install_model(runtime_cfg) + ret = engine.start_model(model_name) + engine.stop_model(model_name) + engine.release_model(model_name) + except BaseException as e: + print(str(e)) + return False + + if ret != AsStatus.ALLSPARK_SUCCESS: + print('start_model Error!', ret) + return False + print('lora_num=', lora_num, 'passed!') + return True + +def child_process(read_fd, write_fd, lora_num, runtime_cfg_dict): + os.close(read_fd) + ret = check_proc(lora_num, runtime_cfg_dict) + os.write(write_fd, str(ret).encode()) + os.close(write_fd) + return ret + + +def multi_process_check(lora_num, runtime_cfg_dict): + read_fd, write_fd = os.pipe() + pid = os.fork() + if pid < 0: + print('fork failed, exit!') + os._exit(1) + if pid == 0: # child process + child_process(read_fd, write_fd, lora_num, 
runtime_cfg_dict) + os._exit(0) + + # father process + os.close(write_fd) + child_ret = None + try: + data = os.read(read_fd, 100) + if data: + child_ret = data.decode().strip() + os.waitpid(pid, 0) + except: + child_ret = None + print('child crash!') + os.waitpid(pid, 0) + os.close(read_fd) + return False + if child_ret is None or child_ret != 'True': + os.close(read_fd) + return False + os.close(read_fd) + return True + + +def binary_search(min_num, max_num, runtime_cfg_dict): + left = min_num + right = max_num + + while left <= right: + mid = (left + right) // 2 + passed = multi_process_check(mid, runtime_cfg_dict) + if passed: + left = mid + 1 + else: + right = mid - 1 + return mid if passed else (mid -1) + + +def save_as_model(runtime_cfg_dict): + global args + pid = os.fork() + if pid < 0: + print('fork failed, exit!') + os._exit(1) + if pid > 0: # father process + os.waitpid(pid, 0) + print('=============================================save OK') + return + + # child process + + # specified by user cmdline args: + init_quant = args.quantization + device_list = list(range(args.parallel)) + + base_model_dir = runtime_cfg_dict['base_model_dir'] + output_model_dir = runtime_cfg_dict['output_model_dir'] + model_name = runtime_cfg_dict['model_name'] + in_memory = runtime_cfg_dict['in_memory'] + weight_only_quant = runtime_cfg_dict['weight_only_quant'] + + model_loader = allspark.HuggingFaceModel(base_model_dir, model_name, user_set_data_type="float16", in_memory_serialize=in_memory, trust_remote_code=True) + engine = allspark.Engine() + if in_memory: + (model_loader.load_model() + .read_model_config() + .serialize_to_memory(engine, enable_quant=init_quant, weight_only_quant=weight_only_quant, lora_cfg={}, + customized_quant_config= {"quant_method": "instant_quant", "weight_format": "int8"},) + .free_model()) + else: + (model_loader.load_model() + .read_model_config() + .serialize_to_path(engine, output_model_dir, enable_quant=init_quant, 
weight_only_quant=weight_only_quant, + skip_if_exists=True, lora_cfg={}, + customized_quant_config= {"quant_method": "instant_quant", "weight_format": "int8"}, + ) + .free_model()) + engine = None + runtime_cfg_builder = model_loader.create_reference_runtime_config_builder(model_name, TargetDevice.CUDA, + device_list, max_batch=args.max_batch) + runtime_cfg_builder.max_length(args.max_length).lora_max_rank(args.lora_max_rank) + runtime_cfg = runtime_cfg_builder.build() + runtime_cfg_dict['model_name'] = runtime_cfg.model_name + runtime_cfg_dict['model_path'] = runtime_cfg.model_path + runtime_cfg_dict['weights_path'] = runtime_cfg.weights_path + runtime_cfg_dict['engine_max_length'] = runtime_cfg.engine_max_length + runtime_cfg_dict['engine_max_batch'] = runtime_cfg.engine_max_batch + runtime_cfg_dict['compute_unit'] = runtime_cfg.compute_unit + os._exit(0) + + +def main(): + global args + os.environ['ALLSPARK_DISABLE_WARMUP'] = '0' + parser = argparse.ArgumentParser(description="") + parser.add_argument("-m", "--base_model_dir", type=str, required=True, help="dir of base model") + parser.add_argument("-r", "--lora_max_rank", type=int, required=True, help="lora max rank") + parser.add_argument("-q", "--quantization", action="store_true", help="true if toggled, which means quantizing base model to A16W8") + parser.add_argument("-p", "--parallel", type=int, required=True, help="num of GPU cards") + parser.add_argument("-b", "--max_batch", type=int, required=True, help="max batchsize") + parser.add_argument("-l", "--max_length", type=int, required=True, help="max length") + args = parser.parse_args() + print(args) + + # you can also change these parameters manually: + in_memory = True + weight_only_quant = True + + shared_manager = multiprocessing.Manager() + runtime_cfg_dict = shared_manager.dict() + runtime_cfg_dict['base_model_dir'] = args.base_model_dir # 基模目录 + runtime_cfg_dict['output_model_dir'] = "model_output.lora-detect" # in_memory==False时,.asparam模型的目录 + 
runtime_cfg_dict['model_name'] = "my_test_model" #.asparam模型名字 + runtime_cfg_dict['in_memory'] = in_memory + runtime_cfg_dict['weight_only_quant'] = weight_only_quant + + save_as_model(runtime_cfg_dict) + print(runtime_cfg_dict) + + + # 二分搜索的范围 + lower_lora_num = 1 + upper_lora_num = 100 + result = binary_search(lower_lora_num, upper_lora_num, runtime_cfg_dict) + print('Final detection result: lora_max_num=', result) + + if runtime_cfg_dict['in_memory']: + os.unlink(runtime_cfg_dict['weights_path']) + os.unlink(runtime_cfg_dict['model_path']) + +main() diff --git a/tools/model_scope_batch_convert.py b/tools/model_scope_batch_convert.py index fada9a55..c0fc3bee 100644 --- a/tools/model_scope_batch_convert.py +++ b/tools/model_scope_batch_convert.py @@ -41,6 +41,9 @@ def downlaod_ms_model_and_convert(modelscope_name, ms_version, output_base_folde # like change to engine max length to a smaller value runtime_cfg_builder.max_length(2048) + # if you want to change kv cache span size, valid value is 16, 32, 64, 128; default is 32 + # runtime_cfg_builder.kv_cache_span_size(128) + # like enable int8 kv-cache or int4 kv cache rather than fp16 kv-cache # runtime_cfg_builder.kv_cache_mode(AsCacheMode.AsCacheQuantI8) diff --git a/tools/tokenizer_tool_for_qwen.py b/tools/tokenizer_tool_for_qwen.py index 1aed13fa..c6e50755 100644 --- a/tools/tokenizer_tool_for_qwen.py +++ b/tools/tokenizer_tool_for_qwen.py @@ -2,7 +2,7 @@ Copyright (c) Alibaba, Inc. and its affiliates. @file tokenizer_tool_for_qwen.py ''' -from pyhie import allspark +from dashinfer import allspark import modelscope import argparse