diff --git a/.vscode/launch.json b/.vscode/launch.json index d7bf3daf1..92a096002 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -4,7 +4,7 @@ "name": "ops_cast_test", "type": "cppdbg", "request": "launch", - "program": "${workspaceFolder}/build/ark/ops_cast_test", + "program": "${workspaceFolder}/build/ark/executor_test", "args": [], "stopAtEntry": false, "cwd": "${fileDirname}", diff --git a/.vscode/settings.json b/.vscode/settings.json index 640196a66..1a376c337 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -10,4 +10,105 @@ "cmake.ctestArgs": [ "--verbose" ], + "files.associations": { + "ostream": "cpp", + "stdexcept": "cpp", + "string": "cpp", + "iosfwd": "cpp", + "memory": "cpp", + "stack": "cpp", + "any": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "cfenv": "cpp", + "chrono": "cpp", + "cinttypes": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "codecvt": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "condition_variable": "cpp", + "csignal": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "future": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ranges": "cpp", + "semaphore": "cpp", + "span": "cpp", + "sstream": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "typeinfo": "cpp", + "valarray": "cpp", + "variant": "cpp", + "__nullptr": "cpp", + "__hash_table": "cpp", + "__split_buffer": "cpp", + "__tree": "cpp", + "queue": "cpp", + "__locale": "cpp", + "*.ipp": "cpp", + "strstream": "cpp", + "typeindex": "cpp", + "locale": "cpp", + "__node_handle": "cpp", + "__threading_support": "cpp", + "__functional_03": "cpp", + "filesystem": "cpp", + "__bit_reference": "cpp", + "__config": "cpp", + "__debug": "cpp", + "version": "cpp", + "__functional_base": "cpp", + "__memory": "cpp", + "*.ci": "c", + "ark_kernels.h": "c" + }, + "cmake.configureOnOpen": true, } diff --git a/ark/CMakeLists.txt b/ark/CMakeLists.txt index 7c360ee37..f0c52352c 100644 --- a/ark/CMakeLists.txt +++ b/ark/CMakeLists.txt @@ -1,9 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc) -file(GLOB_RECURSE UT_SOURCES CONFIGURE_DEPENDS *_test.cc *_test.cu) -file(GLOB_RECURSE UT_COMMON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unittest/*.cc) +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp) +file(GLOB_RECURSE UT_SOURCES CONFIGURE_DEPENDS *_test.cpp) +file(GLOB_RECURSE UT_COMMON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unittest/*.cpp) list(REMOVE_ITEM SOURCES ${UT_SOURCES} ${UT_COMMON_SOURCES}) if(USE_ROCM) diff --git a/ark/bfloat16.cc b/ark/bfloat16.cpp similarity index 100% rename from ark/bfloat16.cc rename to ark/bfloat16.cpp diff --git a/ark/bfloat16.h b/ark/bfloat16.h index da57348be..83a1bcb7d 100644 --- a/ark/bfloat16.h +++ b/ark/bfloat16.h @@ -144,6 +144,8 @@ struct alignas(2) bfloat16_t { int mantissa() const { return int(raw() & 0x7f); } }; +using bf16 = bfloat16_t; + /// Assignment from half_t template <> bfloat16_t& bfloat16_t::operator=(bfloat16_t const& x); diff --git a/ark/bfloat16_test.cc b/ark/bfloat16_test.cpp similarity index 99% rename from ark/bfloat16_test.cc rename to ark/bfloat16_test.cpp index 9c53deaf3..f64a41d8d 100644 --- a/ark/bfloat16_test.cc +++ b/ark/bfloat16_test.cpp @@ -3,7 +3,6 @@ #include "bfloat16.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_bfloat16() { @@ -246,7 +245,6 @@ ark::unittest::State test_bfloat16_error() { } int main() { - ark::init(); UNITTEST(test_bfloat16); UNITTEST(test_bfloat16_error); return 0; diff --git a/ark/codegen/codegen.cpp b/ark/codegen/codegen.cpp new file mode 100644 index 000000000..81f60a080 --- /dev/null +++ b/ark/codegen/codegen.cpp @@ -0,0 +1,432 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "codegen.hpp" + +#include <sstream> + +#include "env.h" +#include "file_io.h" +#include "logging.h" +#include "model/model_data_type.hpp" +#include "model/model_op.hpp" +#include "model/model_tensor.hpp" +#include "nlohmann/json.hpp" +#include "range.hpp" + +static std::string replace( + const std::string &template_str, + const std::map<std::string, std::string> &replacements) { + std::string result = template_str; + for (const auto &kv : replacements) { + size_t pos = 0; + while ((pos = result.find(kv.first, pos)) != std::string::npos) { + result.replace(pos, kv.first.length(), kv.second); + pos += kv.second.length(); + } + } + return result; +} + +namespace ark { + +class BufferInfo { + public: + BufferInfo(size_t id) : id(id), bytes(0), is_input(true), is_output(true) {} + + // ID of this buffer + const size_t id; + + // Total bytes of this buffer + size_t bytes; + + // True if none of tensors in this buffer is a result tensor or a write + // tensor of a non-virtual Op, i.e., this buffer is an input buffer + bool is_input; + + // True if none of tensors in this buffer is a read tensor of a non-virtual + // Op, i.e., this buffer is an output buffer + bool is_output; + + // IDs of tensors in this buffer + std::set<size_t> tensor_ids; + + // IDs of tasks that read/write from/to this buffer + std::set<size_t> task_ids; +}; + +class SyncStateInfo { + public: + SyncStateInfo() { + static size_t next_id = 0; + id = next_id++; + } + + size_t id; +}; + +class CodeGenerator::Impl { + public: + Impl(const std::string &plan, const std::string &name); + ~Impl() = default; + + private: + void plan_memory(const nlohmann::json &plan); + + std::string def_op(const nlohmann::json &op_json, size_t task_id, + size_t op_idx); + + std::string def_task(const nlohmann::json &task_json); + + std::string task_seq(size_t
proc_b, size_t proc_e, size_t proc_s, + size_t proc_cur, size_t task_b, size_t task_e, + size_t task_s, size_t task_gran, size_t num_slots, + size_t slot_num_warps, size_t slot_sram_bytes, + size_t task_id); + + std::string resource_group(const nlohmann::json &rg_json, + const nlohmann::json &task_infos, + const Range<size_t> &proc_range); + + protected: + friend class CodeGenerator; + + std::string name_; + size_t num_procs_; + size_t num_warps_per_proc_; + std::map<size_t, std::shared_ptr<BufferInfo>> buffer_id_to_info_; + std::map<size_t, size_t> buffer_id_to_offset_; + std::map<size_t, size_t> tensor_id_to_offset_; + size_t total_bytes_; + std::string code_; +}; + +CodeGenerator::Impl::Impl(const std::string &plan, const std::string &name) + : name_(name) { + auto j = nlohmann::json::parse(plan); + this->plan_memory(j); + + num_procs_ = j["NumProcessors"]; + num_warps_per_proc_ = j["NumWarpsPerProcessor"]; + + std::stringstream definitions_ss; + for (auto &task_json : j["TaskInfos"]) { + definitions_ss << this->def_task(task_json); + } + + std::map<Range<size_t>, SyncStateInfo> sync_state_info; + + std::stringstream body_ss; + size_t pg_idx = 0; + size_t num_pgs = j["ProcessorGroups"].size(); + for (auto &pg : j["ProcessorGroups"]) { + Range<size_t> proc_range(pg["ProcessorRange"][0], + pg["ProcessorRange"][1]); + + for (auto &rg : pg["ResourceGroups"]) { + body_ss << this->resource_group(rg, j["TaskInfos"], proc_range); + } + + if (pg_idx + 1 < num_pgs) { + // sync pg + size_t begin = *proc_range.begin(); + size_t end = *proc_range.end(); + if (begin == 0) { + body_ss << " if (blockIdx.x < " << end << ") {"; + } else if (begin + 1 == end) { + body_ss << " if (blockIdx.x == " << begin << ") {"; + } else { + body_ss << " if (blockIdx.x >= " << begin + << " && blockIdx.x < " << end << ") {"; + } + size_t state_id = sync_state_info[proc_range].id; + body_ss << " sync_gpu<" << end - begin << ">(ARK_LOOP_SYNC_STATE_" + << state_id << "); }\n"; + } + } + + for (auto &kv : sync_state_info) { + definitions_ss << "__device__ sync::State ARK_LOOP_SYNC_STATE_" + << kv.second.id << ";\n"; + } + + const std::string &ark_root = get_env().path_root_dir; + const std::string &template_path = + ark_root + "/include/kernels/kernel_template.in"; + std::string template_code = read_file(template_path); + std::map<std::string, std::string> replacements = { + {"@NUM_BLOCKS@", std::to_string(num_procs_)}, + {"@NUM_WARPS_PER_BLOCK@", std::to_string(num_warps_per_proc_)}, + {"@DEFINITIONS@", definitions_ss.str()}, + {"@BODY@", body_ss.str()}, + {"@NAME@", name_}, + }; + code_ = replace(template_code, replacements); +} + +void CodeGenerator::Impl::plan_memory(const nlohmann::json &plan) { + auto get_or_create_buffer_info = [&](size_t buffer_id) { + if (buffer_id_to_info_.find(buffer_id) == buffer_id_to_info_.end()) { + auto buf_info = std::make_shared<BufferInfo>(buffer_id); + buffer_id_to_info_[buffer_id] = buf_info; + return buf_info; + } + return buffer_id_to_info_[buffer_id]; + }; + + auto tensor_stride_bytes = [](const nlohmann::json &tns) { + Dims strides(tns["Strides"].get<std::vector<DimType>>()); + size_t nelems = strides.size(); + return nelems * ModelDataT::from_name(tns["DataType"])->bytes(); + }; + + for (auto &task_info : plan["TaskInfos"]) { + for (auto &op : task_info["Ops"]) { + for (auto &tns : op["ReadTensors"]) { + auto buf_info = get_or_create_buffer_info(tns["BufferId"]); + buf_info->bytes = + std::max(buf_info->bytes, tensor_stride_bytes(tns)); + buf_info->is_output = false; + buf_info->tensor_ids.insert(tns["Id"].get<size_t>()); + buf_info->task_ids.insert(task_info["Id"].get<size_t>()); + } + for (auto &tns : op["WriteTensors"]) { + auto buf_info =
get_or_create_buffer_info(tns["BufferId"]); + buf_info->bytes = + std::max(buf_info->bytes, tensor_stride_bytes(tns)); + buf_info->is_input = false; + buf_info->tensor_ids.insert(tns["Id"].get<size_t>()); + buf_info->task_ids.insert(task_info["Id"].get<size_t>()); + } + for (auto &tns : op["ResultTensors"]) { + auto buf_info = get_or_create_buffer_info(tns["BufferId"]); + buf_info->bytes = + std::max(buf_info->bytes, tensor_stride_bytes(tns)); + buf_info->is_input = false; + buf_info->tensor_ids.insert(tns["Id"].get<size_t>()); + buf_info->task_ids.insert(task_info["Id"].get<size_t>()); + } + } + } + + // TODO: improve memory planning + size_t offset = 0; + for (auto &kv : buffer_id_to_info_) { + buffer_id_to_offset_[kv.first] = offset; + for (auto &tns_id : kv.second->tensor_ids) { + tensor_id_to_offset_[tns_id] = offset; + } + offset += kv.second->bytes; + } + total_bytes_ = offset; +} + +std::string CodeGenerator::Impl::def_op(const nlohmann::json &op_json, + size_t task_id, size_t op_idx) { + auto op = ModelOp::deserialize(op_json); + auto impl_name = op->impl_name(op_json["Config"]); + auto impl_args = op->impl_args(op_json["Config"]); + std::stringstream ss; + ss << "__forceinline__ __device__ void t" << task_id << "_o" << op_idx + << "("; + size_t arg_idx = 0; + for (auto &arg : impl_args) { + if (arg.type_name() == "TENSOR") { + auto tns = arg.value<ModelTensorRef>(); + ss << tns->data_type()->type_str() << "*"; + } else { + ss << arg.type_str(); + } + ss << " _" << arg_idx++ << ", "; + } + ss << "int _idx, int _spw) {\n  " << impl_name << "("; + for (size_t i = 0; i < impl_args.size(); ++i) { + ss << "_" << i << ", "; + } + ss << "_idx, _spw);\n}\n"; + return ss.str(); +} + +std::string CodeGenerator::Impl::def_task(const nlohmann::json &task_json) { + std::stringstream ss; + size_t op_idx = 0; + for (auto &op_json : task_json["Ops"]) { + ss << this->def_op(op_json, task_json["Id"], op_idx++); + } + ss << "__noinline__ __device__ void t" << task_json["Id"] + << "(char* _buf, int _idx, int _spw) {\n"; + op_idx = 0; + for (auto &op_json : task_json["Ops"]) { + auto op = ModelOp::deserialize(op_json); + auto impl_args = op->impl_args(op_json["Config"]); + ss << "  t" << task_json["Id"] << "_o" << op_idx++ << "("; + for (size_t i = 0; i < impl_args.size(); ++i) { + auto &arg = impl_args[i]; + if (arg.type_name() == "TENSOR") { + auto tns = arg.value<ModelTensorRef>(); + ss << "(" << tns->data_type()->type_str() << "*)&_buf[" + << tensor_id_to_offset_[tns->id()] << "]"; + } else { + ss << arg.serialize()[arg.type_name()]; + } + ss << ", "; + } + ss << "_idx, _spw);\n"; + } + ss << "}\n"; + return ss.str(); +} + +std::string CodeGenerator::Impl::task_seq( + size_t proc_b, size_t proc_e, size_t proc_s, size_t proc_cur, size_t task_b, + size_t task_e, size_t task_s, size_t task_gran, size_t num_slots, + size_t slot_num_warps, size_t slot_sram_bytes, size_t task_id) { + std::stringstream ss; + ss << "task_seq<" << proc_b << ", " << proc_e << ", " << proc_s << ", " + << proc_cur << ", " << task_b << ", " << task_e << ", " << task_s << ", " + << task_gran << ", " << num_slots << ", " << slot_num_warps << ", " + << slot_sram_bytes << ", t" << task_id << ">(_buf);\n"; + return ss.str(); +} + +std::string CodeGenerator::Impl::resource_group( + const nlohmann::json &rg_json, const nlohmann::json &task_infos, + const Range<size_t> &proc_range) { + Range<size_t> rg_proc_range(rg_json["ProcessorRange"][0], + rg_json["ProcessorRange"][1]); + if (*rg_proc_range.begin() < *proc_range.begin() || + *rg_proc_range.end() > *proc_range.end()) { + ERR(SchedulerError, "invalid processor
range of resource group"); + } + Range rg_warp_range(rg_json["WarpRange"][0], + rg_json["WarpRange"][1]); + Range rg_sram_range(rg_json["SramRange"][0], + rg_json["SramRange"][1]); + auto warp_iter = rg_warp_range.begin(); + auto sram_iter = rg_sram_range.begin(); + size_t total_warps = rg_warp_range.size(); + size_t total_sram = rg_sram_range.size(); + size_t proc_cur = *rg_proc_range.begin(); + size_t proc_b = *rg_proc_range.begin(); + size_t proc_e = *rg_proc_range.end(); + size_t proc_s = rg_proc_range.step(); + size_t n_procs = rg_proc_range.size(); + std::stringstream ss; + for (auto &tg : rg_json["TaskGroups"]) { + size_t task_id = tg["TaskId"]; + auto &task_info = task_infos[task_id]; + Range task_range(tg["TaskRange"][0], tg["TaskRange"][1]); + size_t task_gran = tg["Granularity"]; + size_t num_warps_per_task = task_info["NumWarps"]; + size_t sram_bytes_per_task = task_info["SramBytes"]; + // number of concurrent tasks per processor + size_t n_slots; + if (sram_bytes_per_task > 0) { + n_slots = std::min(total_warps / num_warps_per_task, + total_sram / sram_bytes_per_task); + } else { + n_slots = total_warps / num_warps_per_task; + } + if (n_slots == 0) { + ERR(SchedulerError, "not enough resources for task group"); + } + + size_t task_b = *task_range.begin(); + size_t task_e = *task_range.end(); + size_t task_s = task_range.step(); + size_t n_tasks = task_range.size(); + + size_t slot_n_warps = num_warps_per_task; + size_t slot_n_sram = total_sram / n_slots; + size_t sram_per_warp = slot_n_sram / slot_n_warps; + + // + // Distribute tasks to processors. + // + // A sequence [b, e, s] means the range starts from `b`, ends at + // `e - 1`, and the step size is `s`. + // + // Processor ID sequence: [proc_b, proc_e, proc_s], total `n_procs` + // Task ID sequence: [task_b, task_e, task_s], total `n_tasks` + // + // The distribution starts from the processor ID `proc_cur` and wraps + // around (`proc_cur - proc_b` is always a multiple of `proc_s`). + // If `task_gran` is 1, the distribution is round-robin; otherwise, + // the distribution assigns `task_gran` consequent tasks to each + // processor, as long as there are enough tasks. + // We distribute tasks from smaller task IDs to larger task IDs. + // Therefore, the `t`-th assigned task ID of the processor ID + // `(proc_cur + proc_s*p)%n_procs` is (p in range [0, n_procs-1]): + // + // ``` + // task_b + task_s*( + // p*task_gran + + // t/task_gran*task_gran*n_procs + + // t%task_gran + // ) + // ``` + // + // where the division is integer division. + // + // Within a single processor, `n_slots` consequent tasks are + // distributed to warps and SRAMs. Specifically, say that + // "k-th slot" refers to the set of warps `k * slot_n_warps` ~ + // `(k+1) * slot_n_warps - 1` and SRAMs `k * slot_n_sram` ~ + // `(k+1) * slot_n_sram - 1`, then the `t`-th task is assigned to + // the `t%n_slots`-th slot. 
+ // + // Therefore, the `i`-th assigned task ID of the processor ID + // `(proc_cur + p)%n_procs` and the `k`-th slot is (p in range + // [0, n_procs-1], k in range [0, n_slots-1]) the same as the above + // formula with `t` replaced by `k + i*n_slots`: + // + // ``` + // task_b + task_s*( + // p*task_gran + + // (k + i*n_slots)/task_gran*task_gran*n_procs + + // (k + i*n_slots)%task_gran + // ) + // ``` + // + // The corresponding CUDA code is generated as follows, where + // `blockIdx.x` is the processor ID: + // + // ``` + // if ((blockIdx.x >= proc_b) && + // (blockIdx.x < proc_e) && + // ((blockIdx.x - proc_b) % proc_s == 0)) { + // size_t p = ((blockIdx.x + gridDim.x - proc_cur) % gridDim.x) / + // proc_s; size_t k = threadIdx.x / warp_size / slot_n_warps; size_t + // task_id_base = task_b + task_s*p*task_gran; for (size_t t = k; ; t + // += n_slots) { + // size_t task_id = task_id_base + task_s*( + // t/task_gran*task_gran*n_procs + t%task_gran + // ); + // if (task_id >= task_e) break; + // task_func(_buf, task_id, sram_per_warp); + // } + // __syncthreads(); + // } + // ``` + ss << "  "; + ss << this->task_seq(proc_b, proc_e, proc_s, proc_cur, task_b, task_e, + task_s, task_gran, n_slots, slot_n_warps, + slot_n_sram, task_id); + } + return ss.str(); +} + +CodeGenerator::CodeGenerator(const std::string &plan, const std::string &name) + : impl_(std::make_shared<Impl>(plan, name)) {} + +std::string CodeGenerator::code() const { return impl_->code_; } + +size_t CodeGenerator::num_procs() const { return impl_->num_procs_; } + +size_t CodeGenerator::num_warps_per_proc() const { + return impl_->num_warps_per_proc_; +} + +size_t CodeGenerator::total_memory_bytes() const { return impl_->total_bytes_; } + +} // namespace ark diff --git a/ark/codegen/codegen.hpp b/ark/codegen/codegen.hpp new file mode 100644 index 000000000..387afd645 --- /dev/null +++ b/ark/codegen/codegen.hpp @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_CODEGEN_HPP_ +#define ARK_CODEGEN_HPP_ + +#include <memory> +#include <string> + +namespace ark { + +class CodeGenerator { + public: + CodeGenerator(const std::string &plan, + const std::string &name = "ark_kernel"); + + ~CodeGenerator() = default; + + std::string code() const; + + size_t num_procs() const; + + size_t num_warps_per_proc() const; + + size_t total_memory_bytes() const; + + private: + class Impl; + std::shared_ptr<Impl> impl_; +}; + +} // namespace ark + +#endif // ARK_CODEGEN_HPP_ diff --git a/ark/cpu_timer.cc b/ark/cpu_timer.cpp similarity index 100% rename from ark/cpu_timer.cc rename to ark/cpu_timer.cpp diff --git a/ark/dims.cc b/ark/dims.cpp similarity index 73% rename from ark/dims.cc rename to ark/dims.cpp index 385f58b28..bb57ea27d 100644 --- a/ark/dims.cc +++ b/ark/dims.cpp @@ -1,19 +1,22 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#include "ark/dims.hpp" + #include -#include "include/ark.h" +#include "error.hpp" #include "logging.h" +#include "nlohmann/json.hpp" namespace ark { // Construct with given four dimensions.
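// e.g., Dims(1, 64, 128, 256) represents the shape <1, 64, 128, 256>.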
Dims::Dims(DimType d0, DimType d1, DimType d2, DimType d3) { - this->data[0] = d0; - this->data[1] = d1; - this->data[2] = d2; - this->data[3] = d3; + data_[0] = d0; + data_[1] = d1; + data_[2] = d2; + data_[3] = d3; if (this->is_invalid()) { ERR(InvalidUsageError, "invalid dims given: <", d0, ", ", d1, ", ", d2, ", ", d3, ">"); @@ -26,7 +29,7 @@ Dims::Dims(const Dims &dims_) { ERR(InvalidUsageError, "invalid dims given"); } for (int i = 0; i < DIMS_LEN; ++i) { - this->data[i] = dims_.data[i]; + data_[i] = dims_.data_[i]; } } @@ -51,25 +54,24 @@ Dims::Dims(const std::vector<DimType> &vec) { } else if (v < 0) { invalid_seen = true; } - this->data[i] = v; + data_[i] = v; } for (; i < DIMS_LEN; ++i) { - this->data[i] = NO_DIM; + data_[i] = NO_DIM; } } // Return the volume of dimensions. If the dimensions are invalid, return -1. DimType Dims::size() const { - const DimType *v = this->data; - if (v[0] == NO_DIM) { + if (data_[0] == NO_DIM) { return -1; } - DimType ret = v[0]; + DimType ret = data_[0]; for (int i = 1; i < DIMS_LEN; ++i) { - if (v[i] == NO_DIM) { + if (data_[i] == NO_DIM) { break; } else { - ret *= v[i]; + ret *= data_[i]; } } return ret; @@ -77,10 +79,9 @@ DimType Dims::size() const { // Return the number of valid dimensions. int Dims::ndims() const { - const DimType *v = this->data; int ret = 0; for (; ret < DIMS_LEN; ++ret) { - if (v[ret] == NO_DIM) { + if (data_[ret] == NO_DIM) { break; } } @@ -89,23 +90,33 @@ int Dims::ndims() const { // Return a new Dims object with 4 valid dimensions by prepending 1s. Dims Dims::dims4() const { - const DimType *v = this->data; int nd = this->ndims(); Dims ret; for (int i = 0; i < DIMS_LEN - nd; ++i) { - ret.data[i] = 1; + ret.data_[i] = 1; } for (int i = 0; i < nd; ++i) { - ret.data[DIMS_LEN - nd + i] = v[i]; + ret.data_[DIMS_LEN - nd + i] = data_[i]; } return ret; } +// Return true if all valid dimensions are zero. +bool Dims::is_zeros() const { + if (this->is_invalid()) { + return false; + } + for (int i = 0; i < DIMS_LEN; ++i) { + if (data_[i] == NO_DIM) break; + if (data_[i] != 0) return false; + } + return true; +} + // Return true if the dimensions are empty. bool Dims::is_no_dim() const { - const DimType *v = this->data; for (int i = 0; i < DIMS_LEN; ++i) { - if (v[i] != NO_DIM) { + if (data_[i] != NO_DIM) { return false; } } @@ -116,16 +127,15 @@ bool Dims::is_no_dim() const { bool Dims::is_invalid() const { // NO_DIM should not appear before a valid dimension.
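// e.g., <4, NO_DIM, 2, NO_DIM> is invalid (NO_DIM precedes a valid dim), // while <4, 2, NO_DIM, NO_DIM> and an all-NO_DIM Dims are not.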
bool invalid_seen = false; - const DimType *v = this->data; for (int i = 0; i < DIMS_LEN; ++i) { if (invalid_seen) { - if (v[i] != NO_DIM) { + if (data_[i] != NO_DIM) { return true; } } else { - if (v[i] == NO_DIM) { + if (data_[i] == NO_DIM) { invalid_seen = true; - } else if (v[i] < 0) { + } else if (data_[i] < 0) { return true; } } @@ -133,6 +143,17 @@ return false; } +std::vector<DimType> Dims::vector() const { + std::vector<DimType> ret; + for (int i = 0; i < DIMS_LEN; ++i) { + if (data_[i] == NO_DIM) { + break; + } + ret.push_back(data_[i]); + } + return ret; +} + void Dims::insert(int idx, DimType dim) { int nd = this->ndims(); if (nd >= DIMS_LEN) { @@ -145,9 +166,9 @@ idx += nd + 1; } for (int i = nd; i > idx; --i) { - this->data[i] = this->data[i - 1]; + data_[i] = data_[i - 1]; } - this->data[idx] = dim; + data_[idx] = dim; } DimType Dims::erase(int idx) { @@ -158,20 +179,14 @@ if (idx < 0) { idx += nd; } - DimType ret = this->data[idx]; + DimType ret = data_[idx]; for (int i = idx; i < nd - 1; ++i) { - this->data[i] = this->data[i + 1]; + data_[i] = data_[i + 1]; } - this->data[nd - 1] = NO_DIM; + data_[nd - 1] = NO_DIM; return ret; } -std::string Dims::serialize() const { - std::stringstream ss; - ss << *this; - return ss.str(); -} - DimType &Dims::operator[](int idx) { int nd = this->ndims(); if (idx >= nd || -idx > nd) { @@ -180,7 +195,7 @@ if (idx < 0) { idx += nd; } - return this->data[idx]; + return data_[idx]; } const DimType &Dims::operator[](int idx) const { @@ -191,12 +206,12 @@ if (idx < 0) { idx += nd; } - return this->data[idx]; + return data_[idx]; } bool operator==(const Dims &a, const Dims &b) { for (int i = 0; i < DIMS_LEN; ++i) { - if (a.data[i] != b.data[i]) { + if (a.data_[i] != b.data_[i]) { return false; } } @@ -209,14 +224,12 @@ std::ostream &operator<<(std::ostream &os, const Dims &dims) { if (dims.is_invalid()) { ERR(InvalidUsageError, "invalid dims given"); } - os << '<'; - if (dims.data[0] != NO_DIM) { - os << dims.data[0]; - for (int i = 1; i < DIMS_LEN; ++i) { - if (dims.data[i] == NO_DIM) { - break; - } - os << ", " << dims.data[i]; + int ndims = dims.ndims(); + os << "<"; + if (ndims > 0) { + os << dims[0]; + for (int i = 1; i < ndims; ++i) { + os << ", " << dims[i]; } } os << '>'; diff --git a/ark/dims_test.cc b/ark/dims_test.cpp similarity index 99% rename from ark/dims_test.cc rename to ark/dims_test.cpp index f7cfb819f..a4e4aa087 100644 --- a/ark/dims_test.cc +++ b/ark/dims_test.cpp @@ -1,9 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#include "ark/dims.hpp" + #include -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_dims_basic() { @@ -180,7 +181,6 @@ ark::unittest::State test_dims_ostream() { } int main() { - ark::init(); UNITTEST(test_dims_basic); UNITTEST(test_dims_no_dim); UNITTEST(test_dims_zero); diff --git a/ark/env.cc b/ark/env.cpp similarity index 94% rename from ark/env.cc rename to ark/env.cpp index 3cf90c2a1..3cdcb36fd 100644 --- a/ark/env.cc +++ b/ark/env.cpp @@ -19,7 +19,7 @@ #define DEFAULT_ARK_DISABLE_GRAPH_OPT false #define DEFAULT_ARK_IGNORE_BINARY_CACHE false #define DEFAULT_ARK_SHM_NAME_PREFIX "ark."
-#define DEFAULT_ARK_ENFORCE_KERNEL_CODE_PATH "" +#define DEFAULT_ARK_ENFORCE_PLAN_PATH "" #define DEFAULT_ARK_MSCCLPP_PORT 50051 template <typename T> @@ -75,8 +75,8 @@ Env::Env() { this->shm_name_prefix = env("ARK_SHM_NAME_PREFIX", DEFAULT_ARK_SHM_NAME_PREFIX); // - this->enforce_kernel_code_path = env( - "ARK_ENFORCE_KERNEL_CODE_PATH", DEFAULT_ARK_ENFORCE_KERNEL_CODE_PATH); + this->enforce_plan_path = env("ARK_ENFORCE_PLAN_PATH", + DEFAULT_ARK_ENFORCE_PLAN_PATH); // Get the port number of MSCCLPP. this->mscclpp_port = env("ARK_MSCCLPP_PORT", DEFAULT_ARK_MSCCLPP_PORT); } diff --git a/ark/env.h b/ark/env.h index 677b4eaa4..2b86704e6 100644 --- a/ark/env.h +++ b/ark/env.h @@ -35,8 +35,8 @@ struct Env { bool ignore_binary_cache; // Prefix of shared memory file names. std::string shm_name_prefix; - // Enforce to compile a specific kernel code file. - std::string enforce_kernel_code_path; + // Enforce to compile a specific plan file. - std::string enforce_plan_path; + std::string enforce_plan_path; // MSCCL++ bootstrap port. int mscclpp_port; }; diff --git a/ark/error.hpp b/ark/error.hpp new file mode 100644 index 000000000..6e7b77024 --- /dev/null +++ b/ark/error.hpp @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_ERROR_HPP_ +#define ARK_ERROR_HPP_ + +#include <stdexcept> +#include <string> + +namespace ark { + +#define REGISTER_ERROR_TYPE(_name) \ + class _name : public std::runtime_error { \ + public: \ + _name(const std::string &msg) : std::runtime_error(msg) {} \ + }; + +REGISTER_ERROR_TYPE(InvalidUsageError) +REGISTER_ERROR_TYPE(ModelError) +REGISTER_ERROR_TYPE(SchedulerError) +REGISTER_ERROR_TYPE(ExecutorError) +REGISTER_ERROR_TYPE(SystemError) +REGISTER_ERROR_TYPE(GpuError) +REGISTER_ERROR_TYPE(RuntimeError) +REGISTER_ERROR_TYPE(UnitTestError) + +} // namespace ark + +#endif // ARK_ERROR_HPP_ diff --git a/ark/executor.cc b/ark/executor.cc index f9487dd71..a1805e9d7 100644 --- a/ark/executor.cc +++ b/ark/executor.cc @@ -34,7 +34,6 @@ class Executor::Impl { std::shared_ptr<GpuContext> ctx_; std::unique_ptr<BaseScheduler> sched_; std::unique_ptr<GpuLoopKernel> glk_; - std::shared_ptr<GpuStream> stream_; }; Executor::Impl::Impl(int rank, int world_size, Model &model, @@ -47,7 +46,6 @@ Executor::Impl::Impl(int rank, int world_size, Model &model, sched_->schedule(); ctx_ = sched_->create_context(); const GpuManager::Info &ginfo = ctx_->get_gpu_manager()->info(); - stream_ = ctx_->get_gpu_manager()->create_stream(); glk_ = std::make_unique<GpuLoopKernel>( ctx_, name, sched_->gen_code(), ginfo.num_sm, num_warps_per_sm, (unsigned int)ginfo.smem_block_total); diff --git a/ark/executor/executor.cpp b/ark/executor/executor.cpp new file mode 100644 index 000000000..03031f06f --- /dev/null +++ b/ark/executor/executor.cpp @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license.
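+ +// Executor::Impl below is a thin wrapper around GpuLoopKernel: it derives +// the local GPU ID from the rank and builds the kernel from the given plan +// string, or from an enforced plan file when ARK_ENFORCE_PLAN_PATH is set.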
+ +#include "ark/executor.hpp" + +#include "codegen/codegen.hpp" +#include "env.h" +#include "file_io.h" +#include "gpu/gpu_loop_kernel.h" +#include "logging.h" + +namespace ark { + +class Executor::Impl { + public: + Impl(int rank, int world_size, const std::string &plan, + const std::string &name); + ~Impl() = default; + + void compile(); + void launch(); + void run(int iter); + void wait(); + float stop(); + + private: + const int rank_; + const int world_size_; + int gpu_id_; + + std::shared_ptr ctx_; + std::unique_ptr glk_; +}; + +Executor::Impl::Impl(int rank, int world_size, const std::string &plan, + const std::string &name) + : rank_(rank), world_size_(world_size) { + gpu_id_ = rank_ % get_env().num_ranks_per_host; + auto gpu_mgr = GpuManager::get_instance(gpu_id_); + size_t smem_block_total = + static_cast(gpu_mgr->info().smem_block_total); + + auto &plan_path = get_env().enforce_plan_path; + if (!plan_path.empty()) { + LOG(INFO, "Enforce executor plan path: ", plan_path); + glk_ = std::make_unique(gpu_id_, read_file(plan_path), + name, smem_block_total); + } else { + glk_ = std::make_unique(gpu_id_, plan, name, + smem_block_total); + } +} + +void Executor::Impl::compile() { glk_->compile(); } + +void Executor::Impl::launch() { + glk_->load(); + glk_->launch(false); +} + +void Executor::Impl::run(int iter) { glk_->run(iter); } + +void Executor::Impl::wait() { glk_->wait(); } + +float Executor::Impl::stop() { + glk_->stop(); + return glk_->get_elapsed_msec(); +} + +Executor::Executor(int rank, int world_size, const std::string &plan, + const std::string &name) + : impl_(std::make_unique(rank, world_size, plan, name)) {} + +Executor::~Executor() = default; + +void Executor::compile() { impl_->compile(); } + +void Executor::launch() { impl_->launch(); } + +void Executor::run(int iter) { impl_->run(iter); } + +void Executor::wait() { impl_->wait(); } + +float Executor::stop() { return impl_->stop(); } + +} // namespace ark diff --git a/ark/executor/executor_test.cpp b/ark/executor/executor_test.cpp new file mode 100644 index 000000000..0ab06bc43 --- /dev/null +++ b/ark/executor/executor_test.cpp @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/executor.hpp" + +#include "ark/model.hpp" +#include "codegen/codegen.hpp" +#include "gpu/gpu_context.h" +#include "model/model_data_type.hpp" +#include "nlohmann/json.hpp" +#include "unittest/unittest_utils.h" + +ark::unittest::State test_executor_scale() { + ark::Model m; + ark::ModelTensorRef input = m.tensor({32}, ark::FP32); + ark::ModelTensorRef output = m.scale(input, 0.7); + + auto comp = m.compress(); + auto serialized = comp.serialize(2); + UNITTEST_LOG(serialized); + + auto comp_json = nlohmann::json::parse(serialized); + std::map buf_id_to_bytes; + for (auto &tns : comp_json["Tensors"]) { + size_t nelems; + if (tns.contains("Strides")) { + nelems = + ark::Dims(std::vector(tns["Strides"])).size(); + } else { + nelems = ark::Dims(std::vector(tns["Shape"])).size(); + UNITTEST_LOG("Shape: ", tns["Shape"].dump(), " ? 
", nelems); + } + size_t bytes = + nelems * ark::ModelDataT::from_name(tns["DataType"])->bytes(); + if (buf_id_to_bytes.find(tns["BufferId"]) != buf_id_to_bytes.end()) { + buf_id_to_bytes[tns["BufferId"]] = + std::max(buf_id_to_bytes[tns["BufferId"]], bytes); + } else { + buf_id_to_bytes[tns["BufferId"]] = bytes; + } + } + + nlohmann::json j; + j["NumProcessors"] = 1; + j["NumWarpsPerProcessor"] = 1; + + j["TaskInfos"] = {nlohmann::json()}; + j["TaskInfos"][0]["Id"] = 0; + j["TaskInfos"][0]["NumWarps"] = 1; + j["TaskInfos"][0]["SramBytes"] = 0; + j["TaskInfos"][0]["Ops"] = {nlohmann::json()}; + j["TaskInfos"][0]["Ops"][0]["Type"] = "Scale"; + j["TaskInfos"][0]["Ops"][0]["Name"] = "scale"; + j["TaskInfos"][0]["Ops"][0]["IsVirtual"] = false; + j["TaskInfos"][0]["Ops"][0]["ReadTensors"] = {comp_json["Tensors"][0]}; + j["TaskInfos"][0]["Ops"][0]["WriteTensors"] = {comp_json["Tensors"][1]}; + j["TaskInfos"][0]["Ops"][0]["ResultTensors"] = {comp_json["Tensors"][2]}; + j["TaskInfos"][0]["Ops"][0]["Args"] = { + {"Factor", {"FLOAT", 0.699999988079071}}}; + j["TaskInfos"][0]["Ops"][0]["Config"] = nlohmann::json(); + j["TaskInfos"][0]["Ops"][0]["Config"]["NumWarps"] = 1; + j["TaskInfos"][0]["Ops"][0]["Config"]["Tile"] = {1, 32}; + + j["ProcessorGroups"] = {nlohmann::json()}; + j["ProcessorGroups"][0]["ProcessorRange"] = {0, 1}; + j["ProcessorGroups"][0]["ResourceGroups"] = {nlohmann::json()}; + j["ProcessorGroups"][0]["ResourceGroups"][0]["ProcessorRange"] = {0, 1}; + j["ProcessorGroups"][0]["ResourceGroups"][0]["WarpRange"] = {0, 1}; + j["ProcessorGroups"][0]["ResourceGroups"][0]["SramRange"] = {0, 1}; + j["ProcessorGroups"][0]["ResourceGroups"][0]["TaskGroups"] = { + nlohmann::json()}; + j["ProcessorGroups"][0]["ResourceGroups"][0]["TaskGroups"][0]["TaskId"] = 0; + j["ProcessorGroups"][0]["ResourceGroups"][0]["TaskGroups"][0]["TaskRange"] = + {0, 1}; + j["ProcessorGroups"][0]["ResourceGroups"][0]["TaskGroups"][0] + ["Granularity"] = 1; + + auto ctx = ark::GpuContext::get_context(0, 1); + std::map> buf_id_to_buf; + for (auto &kv : buf_id_to_bytes) { + auto buf = ctx->allocate_buffer(kv.second, 1); + buf_id_to_buf[kv.first] = buf; + UNITTEST_LOG("Allocated buffer ", kv.first, ": offset ", + buf->get_offset(), ", bytes ", buf->get_bytes(), " ? 
", + kv.second); + } + ctx->freeze(); + + std::map tns_id_to_offset; + for (auto &tns : comp_json["Tensors"]) { + auto buf = buf_id_to_buf[tns["BufferId"]]; + auto offset = buf->get_offset(); + tns_id_to_offset[tns["Id"]] = offset; + UNITTEST_LOG("Tensor ", tns["Id"], ": offset ", offset); + } + + j["Context"] = nlohmann::json(); + j["Context"]["TensorIdToOffset"] = nlohmann::json(); + for (auto &kv : tns_id_to_offset) { + j["Context"]["TensorIdToOffset"][std::to_string(kv.first)] = kv.second; + } + + UNITTEST_LOG(j.dump(2)); + + ark::Executor exe(0, 1, j.dump(), "executor_test"); + // ark::CodeGenerator codegen(j.dump()); + // UNITTEST_LOG(codegen.code()); + exe.compile(); + exe.launch(); + exe.run(1); + exe.stop(); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_executor_scale); + return 0; +} diff --git a/ark/file_io.cc b/ark/file_io.cpp similarity index 98% rename from ark/file_io.cc rename to ark/file_io.cpp index d564d97fb..76bb9983e 100644 --- a/ark/file_io.cc +++ b/ark/file_io.cpp @@ -7,7 +7,6 @@ #include #include -#include "include/ark.h" #include "logging.h" namespace fs = std::filesystem; diff --git a/ark/file_io_test.cc b/ark/file_io_test.cpp similarity index 98% rename from ark/file_io_test.cc rename to ark/file_io_test.cpp index 9b69c0cde..3e56f3a42 100644 --- a/ark/file_io_test.cc +++ b/ark/file_io_test.cpp @@ -7,7 +7,6 @@ #include #include "env.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_is_exist() { @@ -101,7 +100,6 @@ ark::unittest::State test_read_write_file() { } int main() { - ark::init(); UNITTEST(test_is_exist); UNITTEST(test_is_dir); UNITTEST(test_is_file); diff --git a/ark/gpu/gpu.h b/ark/gpu/gpu.h index 2f1eba3ba..1d117e939 100644 --- a/ark/gpu/gpu.h +++ b/ark/gpu/gpu.h @@ -21,7 +21,7 @@ constexpr auto alias = cuda_const; #define ARK_GPU_DEFINE_FUNC_ALIAS(alias, cuda_func, rocm_func) \ template \ - inline auto alias(Args &&... args) { \ + inline auto alias(Args &&...args) { \ return cuda_func(std::forward(args)...); \ } @@ -35,7 +35,7 @@ constexpr auto alias = rocm_const; #define ARK_GPU_DEFINE_FUNC_ALIAS(alias, cuda_func, rocm_func) \ template \ - inline auto alias(Args &&... 
args) { \ + inline auto alias(Args &&...args) { \ return rocm_func(std::forward<Args>(args)...); \ } diff --git a/ark/gpu/gpu_buffer.cc b/ark/gpu/gpu_buffer.cpp similarity index 100% rename from ark/gpu/gpu_buffer.cc rename to ark/gpu/gpu_buffer.cpp diff --git a/ark/gpu/gpu_comm_sw.cc b/ark/gpu/gpu_comm_sw.cpp similarity index 99% rename from ark/gpu/gpu_comm_sw.cc rename to ark/gpu/gpu_comm_sw.cpp index 209a02c8b..5aa85a346 100644 --- a/ark/gpu/gpu_comm_sw.cc +++ b/ark/gpu/gpu_comm_sw.cpp @@ -13,7 +13,6 @@ #include "env.h" #include "gpu/gpu_logging.h" #include "gpu/gpu_manager.h" -#include "include/ark.h" #include "ipc/ipc_hosts.h" #include "ipc/ipc_socket.h" diff --git a/ark/gpu/gpu_compile.cc b/ark/gpu/gpu_compile.cpp similarity index 99% rename from ark/gpu/gpu_compile.cc rename to ark/gpu/gpu_compile.cpp index 3a5911870..88eb56e98 100644 --- a/ark/gpu/gpu_compile.cc +++ b/ark/gpu/gpu_compile.cpp @@ -18,12 +18,11 @@ #include #include +#include "ark/random.hpp" #include "cpu_timer.h" #include "env.h" #include "file_io.h" #include "gpu/gpu_logging.h" -#include "include/ark.h" -#include "random.h" #define ARK_DEBUG_KERNEL 0 diff --git a/ark/gpu/gpu_context.cc b/ark/gpu/gpu_context.cpp similarity index 100% rename from ark/gpu/gpu_context.cc rename to ark/gpu/gpu_context.cpp diff --git a/ark/gpu/gpu_context_test.cc b/ark/gpu/gpu_context_test.cpp similarity index 99% rename from ark/gpu/gpu_context_test.cc rename to ark/gpu/gpu_context_test.cpp index 27f796494..737353d1d 100644 --- a/ark/gpu/gpu_context_test.cc +++ b/ark/gpu/gpu_context_test.cpp @@ -5,7 +5,6 @@ #include -#include "include/ark.h" #include "unittest/unittest_utils.h" // Test initializing and destroying GpuContext @@ -178,7 +177,6 @@ ark::unittest::State test_gpu_context_remote() { } int main() { - ark::init(); UNITTEST(test_gpu_context_basic); UNITTEST(test_gpu_context_buffer_free); UNITTEST(test_gpu_context_buffer_alloc); diff --git a/ark/gpu/gpu_event.cc b/ark/gpu/gpu_event.cpp similarity index 100% rename from ark/gpu/gpu_event.cc rename to ark/gpu/gpu_event.cpp diff --git a/ark/gpu/gpu_kernel.cc b/ark/gpu/gpu_kernel.cc deleted file mode 100644 index 4b99bbfcf..000000000 --- a/ark/gpu/gpu_kernel.cc +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license.
- -#include "gpu/gpu_kernel.h" - -#include -#include - -#include "gpu/gpu.h" -#include "gpu/gpu_compile.h" -#include "gpu/gpu_logging.h" - -namespace ark { - -GpuKernel::GpuKernel( - std::shared_ptr ctx, const std::string& codes, - const std::array& block_dim, const std::array& grid_dim, - size_t smem_bytes, const std::string& kernel_name, - std::initializer_list, size_t>> args) - : ctx_(ctx), - codes_(codes), - block_dim_(block_dim), - grid_dim_(grid_dim), - smem_bytes_(smem_bytes), - kernel_name_(kernel_name), - params_ptr_(args.size(), nullptr), - args_(args.size(), nullptr) { - if (kernel_name_.size() == 0) { - ERR(InvalidUsageError, "Invalid kernel name: ", kernel_name_); - } - int idx = 0; - for (auto& pair : args) { - std::shared_ptr ptr = - std::shared_ptr(new uint8_t[pair.second]); - assert(ptr != nullptr); - if (pair.first != nullptr) { - std::memcpy(ptr.get(), pair.first.get(), pair.second); - } - // make sure the shared_ptr is not released - this->args_[idx] = ptr; - this->params_ptr_[idx++] = ptr.get(); - } -} - -void GpuKernel::compile() { - auto manager = ctx_->get_gpu_manager(); - int max_reg_per_block = manager->info().max_registers_per_block; - int max_reg_per_thread = manager->info().max_registers_per_thread; - int max_reg_cnt = - max_reg_per_block / (block_dim_[0] * block_dim_[1] * block_dim_[2]); - if (max_reg_cnt >= max_reg_per_thread) { - max_reg_cnt = max_reg_per_thread - 1; - } - bin_ = gpu_compile({codes_}, manager->info().arch, max_reg_cnt); - GLOG_DRV(gpuModuleLoadData(&module_, bin_.c_str())); - GLOG_DRV(gpuModuleGetFunction(&function_, module_, kernel_name_.c_str())); - - int static_smem_size_bytes; - GLOG_DRV(gpuFuncGetAttribute(&static_smem_size_bytes, - gpuFuncAttributeSharedSizeBytes, function_)); - int dynamic_smem_size_bytes = smem_bytes_ - static_smem_size_bytes; - GLOG_DRV(gpuFuncSetAttribute(function_, - gpuFuncAttributeMaxDynamicSharedSizeBytes, - dynamic_smem_size_bytes)); -} - -void GpuKernel::launch(std::shared_ptr stream) { - if (!this->is_compiled()) { - ERR(InvalidUsageError, "Kernel is not compiled yet."); - } - ctx_->get_gpu_manager()->launch(function_, grid_dim_, block_dim_, - smem_bytes_, stream, - this->params_ptr_.data(), nullptr); -} - -} // namespace ark diff --git a/ark/gpu/gpu_kernel.cpp b/ark/gpu/gpu_kernel.cpp new file mode 100644 index 000000000..743322d3a --- /dev/null +++ b/ark/gpu/gpu_kernel.cpp @@ -0,0 +1,78 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include "gpu/gpu_kernel.h" + +#include +#include + +#include "gpu/gpu.h" +#include "gpu/gpu_compile.h" +#include "gpu/gpu_logging.h" + +namespace ark { + +GpuKernel::GpuKernel(int gpu_id, const std::string& code, + const std::array& block_dim, + const std::array& grid_dim, size_t smem_bytes, + const std::string& kernel_name, + std::initializer_list> args) { + this->init(gpu_id, code, block_dim, grid_dim, smem_bytes, kernel_name, + args); +} + +void GpuKernel::init(int gpu_id, const std::string& code, + const std::array& block_dim, + const std::array& grid_dim, size_t smem_bytes, + const std::string& kernel_name, + std::initializer_list> args) { + gpu_manager_ = GpuManager::get_instance(gpu_id); + code_ = code; + block_dim_ = block_dim; + grid_dim_ = grid_dim; + smem_bytes_ = smem_bytes; + kernel_name_ = kernel_name; + params_ptr_.resize(args.size()); + args_.resize(args.size()); + if (kernel_name_.size() == 0) { + ERR(InvalidUsageError, "Invalid kernel name: ", kernel_name_); + } + size_t idx = 0; + for (auto& pair : args) { + args_[idx].reset(new uint8_t[pair.second]); + std::memcpy(args_[idx].get(), &(pair.first), pair.second); + params_ptr_[idx] = static_cast(args_[idx].get()); + idx++; + } +} + +void GpuKernel::compile() { + int max_reg_per_block = gpu_manager_->info().max_registers_per_block; + int max_reg_per_thread = gpu_manager_->info().max_registers_per_thread; + int max_reg_cnt = + max_reg_per_block / (block_dim_[0] * block_dim_[1] * block_dim_[2]); + if (max_reg_cnt >= max_reg_per_thread) { + max_reg_cnt = max_reg_per_thread - 1; + } + bin_ = gpu_compile({code_}, gpu_manager_->info().arch, max_reg_cnt); + GLOG_DRV(gpuModuleLoadData(&module_, bin_.c_str())); + GLOG_DRV(gpuModuleGetFunction(&function_, module_, kernel_name_.c_str())); + + int static_smem_size_bytes; + GLOG_DRV(gpuFuncGetAttribute(&static_smem_size_bytes, + gpuFuncAttributeSharedSizeBytes, function_)); + int dynamic_smem_size_bytes = smem_bytes_ - static_smem_size_bytes; + GLOG_DRV(gpuFuncSetAttribute(function_, + gpuFuncAttributeMaxDynamicSharedSizeBytes, + dynamic_smem_size_bytes)); +} + +void GpuKernel::launch(std::shared_ptr stream) { + if (!this->is_compiled()) { + ERR(InvalidUsageError, "Kernel is not compiled yet."); + } + gpu_manager_->launch(function_, grid_dim_, block_dim_, smem_bytes_, stream, + params_ptr_.data(), nullptr); +} + +} // namespace ark diff --git a/ark/gpu/gpu_kernel.h b/ark/gpu/gpu_kernel.h index 18149fc3f..ab08c4156 100644 --- a/ark/gpu/gpu_kernel.h +++ b/ark/gpu/gpu_kernel.h @@ -14,19 +14,24 @@ namespace ark { class GpuKernel { public: - GpuKernel(std::shared_ptr ctx, const std::string& codes, + GpuKernel() {} + GpuKernel(int gpu_id, const std::string& codes, const std::array& block_dim, const std::array& grid_dim, size_t smem_bytes, const std::string& kernel_name, - std::initializer_list, size_t>> - args = {}); + std::initializer_list> args = {}); + void init(int gpu_id, const std::string& codes, + const std::array& block_dim, + const std::array& grid_dim, size_t smem_bytes, + const std::string& kernel_name, + std::initializer_list> args = {}); void compile(); void launch(std::shared_ptr stream); protected: - std::shared_ptr ctx_; - std::string codes_; + std::shared_ptr gpu_manager_; + std::string code_; std::array block_dim_; std::array grid_dim_; int smem_bytes_; @@ -35,7 +40,7 @@ class GpuKernel { gpuModule module_; gpuFunction function_ = nullptr; std::vector params_ptr_; - std::vector> args_; + std::vector> args_; bool is_compiled() const { return function_ != nullptr; } }; 
diff --git a/ark/gpu/gpu_kernel_test.cc b/ark/gpu/gpu_kernel_test.cc deleted file mode 100644 index 3230f6326..000000000 --- a/ark/gpu/gpu_kernel_test.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include "gpu/gpu_kernel.h" - -#include "gpu/gpu_loop_kernel.h" -#include "include/ark.h" -#include "unittest/unittest_utils.h" - -const std::string void_kernel = "extern \"C\" __global__ void kernel() {}"; - -ark::unittest::State test_gpu_kernel() { - auto ctx = ark::GpuContext::get_context(0, 1); - ark::GpuKernel kernel(ctx, void_kernel, {1, 1, 1}, {1, 1, 1}, 0, "kernel"); - kernel.compile(); - return ark::unittest::SUCCESS; -} - -// -const std::string test_kernel_loop_void = - "__device__ void ark_loop_body(char *_buf, int _iter) {\n" - " // Do nothing. Print iteration counter.\n" - " if (threadIdx.x == 0 && blockIdx.x == 0) {\n" - " if (_iter % 50 == 49) {\n" - " printf(\".\\n\");\n" - " } else {\n" - " printf(\".\");\n" - " }\n" - " }\n" - "}\n"; - -ark::unittest::State test_gpu_loop_kernel() { - auto ctx = ark::GpuContext::get_context(0, 1); - ctx->freeze(); - - ark::GpuLoopKernel glk{ctx, - "test_kernel_loop_void", - {test_kernel_loop_void}, - ctx->get_gpu_manager()->info().num_sm, - 1, - 0}; - glk.compile(); - glk.load(); - - glk.launch(ctx->get_gpu_manager()->create_stream()); - glk.run(100); - glk.stop(); - - return ark::unittest::SUCCESS; -} - -int main() { - ark::init(); - UNITTEST(test_gpu_kernel); - UNITTEST(test_gpu_loop_kernel); - return 0; -} diff --git a/ark/gpu/gpu_kernel_test.cpp b/ark/gpu/gpu_kernel_test.cpp new file mode 100644 index 000000000..7e88db966 --- /dev/null +++ b/ark/gpu/gpu_kernel_test.cpp @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "gpu/gpu_kernel.h" + +#include "gpu/gpu_loop_kernel.h" +#include "unittest/unittest_utils.h" + +const std::string void_kernel = "extern \"C\" __global__ void kernel() {}"; + +ark::unittest::State test_gpu_kernel() { + ark::GpuKernel kernel(0, void_kernel, {1, 1, 1}, {1, 1, 1}, 0, "kernel"); + kernel.compile(); + return ark::unittest::SUCCESS; +} + +// +// const std::string test_kernel_loop_void = +// "__device__ void ark_loop_body(char *_buf, int _iter) {\n" +// " // Do nothing. 
Print iteration counter.\n" +// "    if (threadIdx.x == 0 && blockIdx.x == 0) {\n" +// "        if (_iter % 50 == 49) {\n" +// "            printf(\".\\n\");\n" +// "        } else {\n" +// "            printf(\".\");\n" +// "        }\n" +// "    }\n" +// "}\n"; + +// ark::unittest::State test_gpu_loop_kernel() { +//     int num_sm = ark::GpuManager::get_instance(0)->info().num_sm; +//     ark::GpuLoopKernel glk(0, "test_kernel_loop_void", test_kernel_loop_void, +//                            static_cast<size_t>(num_sm), 1, 0, 0); +//     glk.compile(); +//     glk.load(); + +//     glk.launch(); +//     glk.run(100); +//     glk.stop(); + +//     return ark::unittest::SUCCESS; +// } + +int main() { + UNITTEST(test_gpu_kernel); + // UNITTEST(test_gpu_loop_kernel); + return 0; +} diff --git a/ark/gpu/gpu_logging.h b/ark/gpu/gpu_logging.h index ac2a4abc6..b14435b8b 100644 --- a/ark/gpu/gpu_logging.h +++ b/ark/gpu/gpu_logging.h @@ -5,7 +5,6 @@ #define ARK_GPU_LOGGING_H_ #include "gpu/gpu.h" -#include "include/ark.h" #include "logging.h" #define GLOG(cmd) \ diff --git a/ark/gpu/gpu_loop_kernel.cc b/ark/gpu/gpu_loop_kernel.cc deleted file mode 100644 index 791a22b5a..000000000 --- a/ark/gpu/gpu_loop_kernel.cc +++ /dev/null @@ -1,278 +0,0 @@ -#include "gpu/gpu_loop_kernel.h" - -#include - -#include "env.h" -#include "file_io.h" -#include "gpu/gpu.h" -#include "gpu/gpu_event.h" -#include "gpu/gpu_logging.h" - -#define MAX_LOOP_COUNTER 10000000 - -#if defined(ARK_CUDA) -#include <cuda/atomic> -static int atomicLoadRelaxed(int* ptr) { - return cuda::atomic_ref<int, cuda::thread_scope_system>{*ptr}.load( - cuda::memory_order_relaxed); -} -static void atomicStoreRelaxed(int* ptr, int val) { - cuda::atomic_ref<int, cuda::thread_scope_system>{*ptr}.store( - val, cuda::memory_order_relaxed); -} -#elif defined(ARK_ROCM) -static int atomicLoadRelaxed(int* ptr) { - return __atomic_load_n(ptr, __ATOMIC_RELAXED); -} -static void atomicStoreRelaxed(int* ptr, int val) { - __atomic_store_n(ptr, val, __ATOMIC_RELAXED); -} -#endif // defined(ARK_ROCM) - -namespace ark { - -GpuLoopKernel::GpuLoopKernel(std::shared_ptr<GpuContext> ctx, - const std::string& name, - const std::vector<std::string>& codes_body, - int num_sm, int num_warp, unsigned int smem_bytes) - : GpuKernel( - ctx, {}, - {num_warp * ctx->get_gpu_manager()->info().threads_per_warp, 1, 1}, - {num_sm, 1, 1}, (smem_bytes < 4) ?
4 : smem_bytes, name, - {{nullptr, sizeof(GpuPtr)}}), - timer_begin_(ctx->get_gpu_manager()->create_event()), - timer_end_(ctx->get_gpu_manager()->create_event()) { - flag_ = ctx->get_gpu_manager()->malloc_host( - sizeof(int), gpuHostAllocMapped | gpuHostAllocWriteCombined); - *(int**)params_ptr_[0] = (int*)flag_->ref(); - - auto& code_path = get_env().enforce_kernel_code_path; - if (!code_path.empty()) { - LOG(INFO, "Enforce kernel code path: ", code_path); - codes_ = std::move(read_file(code_path)); - } else if (codes_body.size() > 0) { - const std::string* ark_loop_body_code = nullptr; - for (auto& code : codes_body) { - if (code.find("ark_loop_body") != std::string::npos) { - ark_loop_body_code = &code; - break; - } - } - assert(ark_loop_body_code != nullptr); - - std::stringstream ss; - // clang-format off - ss << - "// THIS KERNEL IS MACHINE-GENERATED BY ARK.\n" - "#define ARK_THREADS_PER_BLOCK " << block_dim_[0] << "\n" - "__device__ int _ITER = 0;\n" - "#include \"ark_kernels.h\"\n" - "using namespace ark;\n" - "__device__ sync::State " ARK_LSS_NAME ";\n" - "__device__ char *" ARK_BUF_NAME ";\n" - << *ark_loop_body_code << - "extern \"C\" __global__ __launch_bounds__(" << block_dim_[0] << ", 1)\n" - "void " << kernel_name_ << "(int *_it)\n" - "{\n" - " char *_buf = " ARK_BUF_NAME ";\n" - " int *shared_mem = (int *)_ARK_SMEM;\n" - " for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) {\n" - " shared_mem[i] = 0;\n" - " }\n" - " for (;;) {\n" - " if (threadIdx.x == 0 && blockIdx.x == 0) {\n" - " int iter;\n" - " while ((iter = atomicLoadRelaxed(_it)) == 0) {}\n" - " _ITER = iter;\n" - " }\n" - " sync_gpu<" << num_sm << ">(" ARK_LSS_NAME ");\n" - " if (_ITER < 0) {\n" - " return;\n" - " }\n" - " for (int _i = 0; _i < _ITER; ++_i) {\n" - " ark_loop_body(_buf, _i);\n" - " sync_gpu<" << num_sm << ">(" ARK_LSS_NAME ");\n" - " }\n" - " if (threadIdx.x == 0 && blockIdx.x == 0) {\n" - " atomicStoreRelaxed(_it, 0);\n" - " }\n" - " sync_gpu<" << num_sm << ">(" ARK_LSS_NAME ");\n" - " }\n" - "}\n"; - // clang-format on - codes_ = std::move(ss.str()); - } -} - -void GpuLoopKernel::load() { - if (!is_compiled()) { - ERR(InvalidUsageError, "Need to compile first before initialization."); - } - if (stream_ != nullptr) { - // Wait until previous works finish. - wait(); - return; - } - // Initialize global variables in the loop kernel. 
- std::shared_ptr<GpuManager> manager = ctx_->get_gpu_manager(); - void* buf_ptr_val = ctx_->get_data_memory()->ref(); - GpuPtr lss_ptr_addr; - GpuPtr buf_ptr_addr; - size_t tmp = 0; - GLOG_DRV(gpuModuleGetGlobal(&lss_ptr_addr, &tmp, module_, ARK_LSS_NAME)); - GLOG_DRV(gpuModuleGetGlobal(&buf_ptr_addr, &tmp, module_, ARK_BUF_NAME)); - std::array<int, 4> data = {0, 0, 0, 0}; - manager->memcpy_htod((void*)lss_ptr_addr, 0, data.data(), 0, - sizeof(int) * data.size()); - manager->memcpy_htod((void*)buf_ptr_addr, 0, &buf_ptr_val, 0, - sizeof(GpuPtr)); - // TODO: remove this hack - GpuPtr lss_0_ptr_addr; - GpuPtr lss_1_ptr_addr; - gpuDrvError ret = - gpuModuleGetGlobal(&lss_0_ptr_addr, &tmp, module_, ARK_LSS_NAME "_0"); - if (ret == gpuDrvSuccess) { - manager->memcpy_htod((void*)lss_0_ptr_addr, 0, data.data(), 0, - sizeof(int) * data.size()); - } else if (ret != gpuErrorNotFound) { - GLOG_DRV(ret); - } - ret = gpuModuleGetGlobal(&lss_1_ptr_addr, &tmp, module_, ARK_LSS_NAME "_1"); - if (ret == gpuDrvSuccess) { - manager->memcpy_htod((void*)lss_1_ptr_addr, 0, data.data(), 0, - sizeof(int) * data.size()); - } else if (ret != gpuErrorNotFound) { - GLOG_DRV(ret); - } - // set the data buffer pointers of remote gpus - int nrph = get_env().num_ranks_per_host; - int nodes_id = ctx_->gpu_id() / nrph; - // only set the GPU remote data buf pointers of the GPUs on the same node - for (int i = nodes_id * nrph; - i < (nodes_id + 1) * nrph && i < ctx_->world_size(); i++) { - void* data_buf_value = ctx_->get_data_memory(i)->ref(); - if (data_buf_value == 0) { - continue; - } - GpuPtr data_buf_ptr; - std::string data_buf_name = ARK_BUF_NAME + std::to_string(i); - gpuDrvError _e = gpuModuleGetGlobal(&data_buf_ptr, &tmp, module_, - data_buf_name.c_str()); - if (_e == gpuErrorNotFound) { - LOG(DEBUG, "global variable ", data_buf_name, " not found"); - continue; - } - LOG(DEBUG, data_buf_name, " data_buf_ptr=", std::hex, data_buf_ptr, - " data_buf_value=", data_buf_value); - manager->memcpy_htod((void*)data_buf_ptr, 0, &data_buf_value, 0, - sizeof(GpuPtr)); - } - - std::shared_ptr<GpuCommSw> comm = ctx_->get_comm_sw(); - if (comm->get_proxy_channels_num() > 0) { - GpuPtr channel_addr; - GLOG_DRV(gpuModuleGetGlobal(&channel_addr, &tmp, module_, - "_ARK_PROXY_CHANS")); - const void* chans_ref = comm->get_proxy_channels_ref(); - size_t chans_bytes = comm->get_proxy_channels_bytes(); - manager->memcpy_htod((void*)channel_addr, 0, - const_cast<void*>(chans_ref), 0, chans_bytes); - } - if (comm->get_sm_channels_num() > 0) { - GpuPtr channel_addr; - GLOG_DRV( - gpuModuleGetGlobal(&channel_addr, &tmp, module_, "_ARK_SM_CHANS")); - const void* chans_ref = comm->get_sm_channels_ref(); - size_t chans_bytes = comm->get_sm_channels_bytes(); - manager->memcpy_htod((void*)channel_addr, 0, - const_cast<void*>(chans_ref), 0, chans_bytes); - } -} - -void GpuLoopKernel::launch(std::shared_ptr<GpuStream> stream, - bool disable_timing) { - elapsed_msec_ = -1; - if (!is_compiled()) { - ERR(InvalidUsageError, "Need to compile first before initialization."); - } else if (stream == nullptr) { - ERR(InvalidUsageError, "Given an invalid stream."); - } else if (stream_ != nullptr) { - if (stream_ == stream) { - LOG(WARN, "Ignore launching twice."); - return; - } else { - ERR(InvalidUsageError, "This loop kernel is already running."); - } - } - if (!disable_timing) { - timer_begin_->record(stream); - } - - ctx_->get_comm_sw()->launch_request_loop(); - - // Initialize loop flags.
- atomicStoreRelaxed(flag_->ref(), 0); - GpuKernel::launch(stream); - stream_ = stream; - if (!disable_timing) { - timer_end_->record(stream); - is_recording_ = true; - } -} - -void GpuLoopKernel::run(int iter) { - if (iter > 0) { - while (atomicLoadRelaxed(flag_->ref()) > 0) { - } - atomicStoreRelaxed(flag_->ref(), iter); - } -} - -bool GpuLoopKernel::poll() { return atomicLoadRelaxed(flag_->ref()) <= 0; } - -void GpuLoopKernel::wait() { - int cnt = MAX_LOOP_COUNTER; - while (atomicLoadRelaxed(flag_->ref()) > 0) { - if (--cnt > 0) { - continue; - } - // Check if the kernel encountered an error. - gpuError res = stream_->query(); - if (res == gpuSuccess) { - if (atomicLoadRelaxed(flag_->ref()) > 0) { - LOG(WARN, "Stream is finished but the loop flag is still set."); - break; - } else { - LOG(WARN, - "wait() is delayed by a stream query. Regarding " - "timing measurements may be inaccurate."); - break; - } - } else if (res == gpuErrorNotReady) { - cnt = MAX_LOOP_COUNTER; - } else { - GLOG(res); - } - } -} - -void GpuLoopKernel::stop() { - wait(); - atomicStoreRelaxed(flag_->ref(), -1); - stream_->sync(); - if (is_recording_) { - elapsed_msec_ = timer_end_->elapsed_msec(*timer_begin_); - is_recording_ = false; - } - stream_ = nullptr; - ctx_->get_comm_sw()->stop_request_loop(); -} - -float GpuLoopKernel::get_elapsed_msec() const { - if (is_recording_) { - ERR(InvalidUsageError, "Need to stop the kernel first."); - } - return elapsed_msec_; -} - -} // namespace ark diff --git a/ark/gpu/gpu_loop_kernel.cpp b/ark/gpu/gpu_loop_kernel.cpp new file mode 100644 index 000000000..32610c20b --- /dev/null +++ b/ark/gpu/gpu_loop_kernel.cpp @@ -0,0 +1,246 @@ +#include "gpu/gpu_loop_kernel.h" + +#include + +#include "codegen/codegen.hpp" +#include "env.h" +#include "file_io.h" +#include "gpu/gpu.h" +#include "gpu/gpu_event.h" +#include "gpu/gpu_logging.h" + +#define MAX_LOOP_COUNTER 10000000 + +#if defined(ARK_CUDA) +#include <cuda/atomic> +static int atomicLoadRelaxed(int* ptr) { + return cuda::atomic_ref<int, cuda::thread_scope_system>{*ptr}.load( + cuda::memory_order_relaxed); +} +static void atomicStoreRelaxed(int* ptr, int val) { + cuda::atomic_ref<int, cuda::thread_scope_system>{*ptr}.store( + val, cuda::memory_order_relaxed); +} +#elif defined(ARK_ROCM) +static int atomicLoadRelaxed(int* ptr) { + return __atomic_load_n(ptr, __ATOMIC_RELAXED); +} +static void atomicStoreRelaxed(int* ptr, int val) { + __atomic_store_n(ptr, val, __ATOMIC_RELAXED); +} +#endif // defined(ARK_ROCM) + +namespace ark { + +class GpuLoopKernel::Impl { + public: + Impl(int gpu_id, const std::string& plan, const std::string& name, + size_t smem_bytes); + ~Impl() = default; + + protected: + friend class GpuLoopKernel; + + std::shared_ptr<CodeGenerator> codegen_; + std::shared_ptr<GpuEvent> timer_begin_; + std::shared_ptr<GpuEvent> timer_end_; + std::shared_ptr<GpuMemory> buffer_; + std::shared_ptr<GpuHostMemory> flag_; + std::shared_ptr<GpuStream> stream_; +}; + +GpuLoopKernel::Impl::Impl(int gpu_id, const std::string& plan, + const std::string& name, + [[maybe_unused]] size_t smem_bytes) { + auto gpu_manager = GpuManager::get_instance(gpu_id); + codegen_ = std::make_shared<CodeGenerator>(plan, name); + timer_begin_ = gpu_manager->create_event(); + timer_end_ = gpu_manager->create_event(); + buffer_ = gpu_manager->malloc(codegen_->total_memory_bytes()); + flag_ = gpu_manager->malloc_host( + sizeof(int), gpuHostAllocMapped | gpuHostAllocWriteCombined); + stream_ = gpu_manager->create_stream(); +} + +GpuLoopKernel::GpuLoopKernel(int gpu_id, const std::string& plan, + const std::string& name, size_t smem_bytes) + : GpuKernel(), + impl_(std::make_shared<Impl>(gpu_id, plan, name, smem_bytes)) {
auto gpu_manager = GpuManager::get_instance(gpu_id); + int threads_per_block = + static_cast<int>(impl_->codegen_->num_warps_per_proc() * + gpu_manager->info().threads_per_warp); + int num_sm = static_cast<int>(impl_->codegen_->num_procs()); + int *flag = impl_->flag_->ref(); + this->init(gpu_id, impl_->codegen_->code(), {threads_per_block, 1, 1}, + {num_sm, 1, 1}, (smem_bytes < 4) ? 4 : smem_bytes, name, + {{flag, sizeof(flag)}}); +} + +void GpuLoopKernel::load() { + if (!is_compiled()) { + ERR(InvalidUsageError, "Need to compile first before initialization."); + } + if (is_launched_) { + // Wait until previous works finish. + wait(); + return; + } + // Initialize global variables in the loop kernel. + void* buf_ptr_val = impl_->buffer_->ref(); + GpuPtr lss_ptr_addr; + GpuPtr buf_ptr_addr; + size_t tmp = 0; + GLOG_DRV(gpuModuleGetGlobal(&lss_ptr_addr, &tmp, module_, ARK_LSS_NAME)); + GLOG_DRV(gpuModuleGetGlobal(&buf_ptr_addr, &tmp, module_, ARK_BUF_NAME)); + std::array<int, 4> data = {0, 0, 0, 0}; + gpu_manager_->memcpy_htod((void*)lss_ptr_addr, 0, data.data(), 0, + sizeof(int) * data.size()); + gpu_manager_->memcpy_htod((void*)buf_ptr_addr, 0, &buf_ptr_val, 0, + sizeof(GpuPtr)); + // TODO: remove this hack + GpuPtr lss_0_ptr_addr; + GpuPtr lss_1_ptr_addr; + gpuDrvError ret = + gpuModuleGetGlobal(&lss_0_ptr_addr, &tmp, module_, ARK_LSS_NAME "_0"); + if (ret == gpuDrvSuccess) { + gpu_manager_->memcpy_htod((void*)lss_0_ptr_addr, 0, data.data(), 0, + sizeof(int) * data.size()); + } else if (ret != gpuErrorNotFound) { + GLOG_DRV(ret); + } + ret = gpuModuleGetGlobal(&lss_1_ptr_addr, &tmp, module_, ARK_LSS_NAME "_1"); + if (ret == gpuDrvSuccess) { + gpu_manager_->memcpy_htod((void*)lss_1_ptr_addr, 0, data.data(), 0, + sizeof(int) * data.size()); + } else if (ret != gpuErrorNotFound) { + GLOG_DRV(ret); + } + // set the data buffer pointers of remote gpus + // int nrph = get_env().num_ranks_per_host; + // int nodes_id = gpu_manager_->get_gpu_id() / nrph; + // // only set the GPU remote data buf pointers of the GPUs on the same node + // for (int i = nodes_id * nrph; + // i < (nodes_id + 1) * nrph && i < ctx_->world_size(); i++) { + // void* data_buf_value = ctx_->get_data_memory(i)->ref(); + // if (data_buf_value == 0) { + // continue; + // } + // GpuPtr data_buf_ptr; + // std::string data_buf_name = ARK_BUF_NAME + std::to_string(i); + // gpuDrvError _e = gpuModuleGetGlobal(&data_buf_ptr, &tmp, module_, + // data_buf_name.c_str()); + // if (_e == gpuErrorNotFound) { + // LOG(DEBUG, "global variable ", data_buf_name, " not found"); + // continue; + // } + // LOG(DEBUG, data_buf_name, " data_buf_ptr=", std::hex, data_buf_ptr, + // " data_buf_value=", data_buf_value); + // gpu_manager_->memcpy_htod((void*)data_buf_ptr, 0, &data_buf_value, 0, + // sizeof(GpuPtr)); + // } + + // std::shared_ptr<GpuCommSw> comm = ctx_->get_comm_sw(); + // if (comm->get_proxy_channels_num() > 0) { + // GpuPtr channel_addr; + // GLOG_DRV(gpuModuleGetGlobal(&channel_addr, &tmp, module_, + // "_ARK_PROXY_CHANS")); + // const void* chans_ref = comm->get_proxy_channels_ref(); + // size_t chans_bytes = comm->get_proxy_channels_bytes(); + // gpu_manager_->memcpy_htod((void*)channel_addr, 0, + // const_cast<void*>(chans_ref), 0, chans_bytes); + // } + // if (comm->get_sm_channels_num() > 0) { + // GpuPtr channel_addr; + // GLOG_DRV( + // gpuModuleGetGlobal(&channel_addr, &tmp, module_, + // "_ARK_SM_CHANS")); + // const void* chans_ref = comm->get_sm_channels_ref(); + // size_t chans_bytes = comm->get_sm_channels_bytes(); + // 
gpu_manager_->memcpy_htod((void*)channel_addr, 0, + // const_cast<void*>(chans_ref), 0, chans_bytes); + // } +} + +void GpuLoopKernel::launch(bool disable_timing) { + elapsed_msec_ = -1; + if (!is_compiled()) { + ERR(InvalidUsageError, "Need to compile first before initialization."); + } else if (is_launched_) { + LOG(WARN, "Ignore launching twice."); + return; + } + if (!disable_timing) { + impl_->timer_begin_->record(impl_->stream_); + } + + // ctx_->get_comm_sw()->launch_request_loop(); + + // Initialize loop flags. + atomicStoreRelaxed(impl_->flag_->ref(), 0); + GpuKernel::launch(impl_->stream_); + if (!disable_timing) { + impl_->timer_end_->record(impl_->stream_); + is_recording_ = true; + } + is_launched_ = true; +} + +void GpuLoopKernel::run(int iter) { + if (iter > 0) { + while (atomicLoadRelaxed(impl_->flag_->ref()) > 0) { + } + atomicStoreRelaxed(impl_->flag_->ref(), iter); + } +} + +bool GpuLoopKernel::poll() { + return atomicLoadRelaxed(impl_->flag_->ref()) <= 0; +} + +void GpuLoopKernel::wait() { + int cnt = MAX_LOOP_COUNTER; + while (atomicLoadRelaxed(impl_->flag_->ref()) > 0) { + if (--cnt > 0) { + continue; + } + // Check if the kernel encountered an error. + gpuError res = impl_->stream_->query(); + if (res == gpuSuccess) { + if (atomicLoadRelaxed(impl_->flag_->ref()) > 0) { + LOG(WARN, "Stream is finished but the loop flag is still set."); + break; + } else { + LOG(WARN, + "wait() is delayed by a stream query. Resulting " + "timing measurements may be inaccurate."); + break; + } + } else if (res == gpuErrorNotReady) { + cnt = MAX_LOOP_COUNTER; + } else { + GLOG(res); + } + } +} + +void GpuLoopKernel::stop() { + wait(); + atomicStoreRelaxed(impl_->flag_->ref(), -1); + impl_->stream_->sync(); + if (is_recording_) { + elapsed_msec_ = impl_->timer_end_->elapsed_msec(*impl_->timer_begin_); + is_recording_ = false; + } + is_launched_ = false; + // ctx_->get_comm_sw()->stop_request_loop(); +} + +float GpuLoopKernel::get_elapsed_msec() const { + if (is_recording_) { + ERR(InvalidUsageError, "Need to stop the kernel first."); + } + return elapsed_msec_; +} + +} // namespace ark diff --git a/ark/gpu/gpu_loop_kernel.h b/ark/gpu/gpu_loop_kernel.h index 514dfa746..ec9beb626 100644 --- a/ark/gpu/gpu_loop_kernel.h +++ b/ark/gpu/gpu_loop_kernel.h @@ -15,11 +15,10 @@ namespace ark { class GpuLoopKernel : public GpuKernel { public: - GpuLoopKernel(std::shared_ptr<GpuContext> ctx, const std::string &name, - const std::vector<std::string> &codes, int num_sm, - int num_warp, unsigned int smem_bytes); + GpuLoopKernel(int gpu_id, const std::string &plan, const std::string &name, + size_t smem_bytes); - void launch(std::shared_ptr<GpuStream> stream, bool disable_timing = true); + void launch(bool disable_timing = true); void load(); void run(int iter = 1); bool poll(); @@ -29,13 +28,10 @@ class GpuLoopKernel : public GpuKernel { float get_elapsed_msec() const; private: - std::shared_ptr<GpuEvent> timer_begin_; - std::shared_ptr<GpuEvent> timer_end_; + class Impl; + std::shared_ptr<Impl> impl_; - int threads_per_warp_ = -1; - std::shared_ptr<GpuHostMemory> flag_ = nullptr; - - std::shared_ptr<GpuStream> stream_ = nullptr; + bool is_launched_ = false; bool is_recording_ = false; float elapsed_msec_ = -1; }; diff --git a/ark/gpu/gpu_manager.cc b/ark/gpu/gpu_manager.cpp similarity index 100% rename from ark/gpu/gpu_manager.cc rename to ark/gpu/gpu_manager.cpp diff --git a/ark/gpu/gpu_memory.cc b/ark/gpu/gpu_memory.cpp similarity index 100% rename from ark/gpu/gpu_memory.cc rename to ark/gpu/gpu_memory.cpp
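For orientation, the host-side flag protocol implemented above reduces to the following sketch. This is hypothetical driver code, not part of this diff: `plan_json` is a placeholder for a serialized plan string, and compilation through the GpuKernel base class is assumed to have already happened, since load() checks is_compiled().

    // Sketch only; names match the GpuLoopKernel API introduced in this diff.
    ark::GpuLoopKernel kernel(/*gpu_id=*/0, plan_json, "loop_kernel",
                              /*smem_bytes=*/0);  // clamped to >= 4 internally
    kernel.load();                            // write ARK_LSS/ARK_BUF device globals
    kernel.launch(/*disable_timing=*/false);  // start the persistent kernel, begin timing
    kernel.run(100);                          // request 100 iterations via the host-mapped flag
    kernel.wait();                            // spin until the flag drops to <= 0 (or the stream errors)
    kernel.stop();                            // flag := -1, sync the stream, finish timing
    float ms = kernel.get_elapsed_msec();     // only valid once stop() has run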
diff --git a/ark/gpu/gpu_stream.cc b/ark/gpu/gpu_stream.cpp similarity index 100% rename from ark/gpu/gpu_stream.cc rename to ark/gpu/gpu_stream.cpp diff --git a/ark/half.cc b/ark/half.cpp similarity index 100% rename from ark/half.cc rename to ark/half.cpp diff --git a/ark/half.h b/ark/half.h index 6be1a29b1..1820bb3d3 100644 --- a/ark/half.h +++ b/ark/half.h @@ -234,6 +234,8 @@ struct alignas(2) half_t { int mantissa() const { return int(storage & 0x3ff); } }; +using fp16 = half_t; + /// Assignment from half_t template <> half_t& half_t::operator=(half_t const& x); diff --git a/ark/half_test.cc b/ark/half_test.cpp similarity index 99% rename from ark/half_test.cc rename to ark/half_test.cpp index a94c21759..2a73a942d 100644 --- a/ark/half_test.cc +++ b/ark/half_test.cpp @@ -3,7 +3,6 @@ #include "half.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_half() { @@ -245,7 +244,6 @@ ark::unittest::State test_half_error() { } int main() { - ark::init(); UNITTEST(test_half); UNITTEST(test_half_error); return 0; diff --git a/ark/include/ark.h b/ark/include/ark.hpp similarity index 85% rename from ark/include/ark.h rename to ark/include/ark.hpp index 9b77122e4..cc096c457 100644 --- a/ark/include/ark.h +++ b/ark/include/ark.hpp @@ -11,10 +11,10 @@ #define ARK_PATCH 0 #define ARK_VERSION (ARK_MAJOR * 10000 + ARK_MINOR * 100 + ARK_PATCH) -#include "ark/dims.h" -#include "ark/error.h" -#include "ark/executor.h" -#include "ark/model.h" +#include "ark/dims.hpp" +#include "ark/error.hpp" +// #include "ark/executor.hpp" +#include "ark/model.hpp" namespace ark { diff --git a/ark/include/ark/dims.h b/ark/include/ark/dims.hpp similarity index 76% rename from ark/include/ark/dims.h rename to ark/include/ark/dims.hpp index c15d7537e..ffda8a649 100644 --- a/ark/include/ark/dims.h +++ b/ark/include/ark/dims.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_DIMS_H -#define ARK_DIMS_H +#ifndef ARK_DIMS_HPP +#define ARK_DIMS_HPP #include #include @@ -11,16 +11,17 @@ namespace ark { // Data type for dimension. -typedef long long int DimType; +typedef int64_t DimType; // DIMS_LEN is the maximum number of dimensions of a tensor. If a tensor // has less than DIMS_LEN dimensions, the remaining dimensions will be NO_DIM. -enum { DIMS_LEN = 4, NO_DIM = -1 }; +constexpr DimType NO_DIM = -1; +constexpr DimType DIMS_LEN = 4; // Up-to-`DIMS_LEN`-dimensional vector. class Dims { private: - DimType data[DIMS_LEN]; + DimType data_[DIMS_LEN]; public: // Construct with given four dimensions. @@ -39,16 +40,22 @@ class Dims { int ndims() const; // Return a new Dims object with 4 valid dimensions by prepending 1s. Dims dims4() const; + // Return true if all valid dimensions are zero. + bool is_zeros() const; // Return true if the dimensions are empty. bool is_no_dim() const; // Return true if the dimensions are invalid. bool is_invalid() const; + // Return a vector of valid dimensions. + std::vector<DimType> vector() const; // Insert a dimension at the given index. void insert(int idx, DimType dim); // Erase the dimension at the given index and return the erased dimension.
DimType erase(int idx); - std::string serialize() const; + std::string serialize(int indent = -1) const; + + static Dims deserialize(const std::string &serialized); DimType &operator[](int idx); @@ -58,10 +65,10 @@ class Dims { friend bool operator==(const Dims &a, const Dims &b); friend bool operator!=(const Dims &a, const Dims &b); - - friend std::ostream &operator<<(std::ostream &os, const Dims &dims); }; +std::ostream &operator<<(std::ostream &os, const Dims &dims); + } // namespace ark -#endif // ARK_DIMS_H +#endif // ARK_DIMS_HPP diff --git a/ark/include/ark/error.h b/ark/include/ark/error.h deleted file mode 100644 index 2326c47f4..000000000 --- a/ark/include/ark/error.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#ifndef ARK_ERROR_H -#define ARK_ERROR_H - -#include -#include - -namespace ark { - -class InvalidUsageError : public std::runtime_error { - public: - InvalidUsageError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class ModelError : public std::runtime_error { - public: - ModelError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class SchedulerError : public std::runtime_error { - public: - SchedulerError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class ExecutorError : public std::runtime_error { - public: - ExecutorError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class SystemError : public std::runtime_error { - public: - SystemError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class GpuError : public std::runtime_error { - public: - GpuError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class RuntimeError : public std::runtime_error { - public: - RuntimeError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class UnitTestError : public std::runtime_error { - public: - UnitTestError(const std::string &msg) : std::runtime_error(msg) {} -}; - -} // namespace ark - -#endif // ARK_ERROR_H diff --git a/ark/include/ark/executor.h b/ark/include/ark/executor.hpp similarity index 76% rename from ark/include/ark/executor.h rename to ark/include/ark/executor.hpp index 4bfa3a9dc..e0419ad83 100644 --- a/ark/include/ark/executor.h +++ b/ark/include/ark/executor.hpp @@ -1,10 +1,11 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_EXECUTOR_H -#define ARK_EXECUTOR_H +#ifndef ARK_EXECUTOR_HPP +#define ARK_EXECUTOR_HPP -#include "model.h" +#include +#include namespace ark { @@ -12,18 +13,24 @@ namespace ark { class Executor { public: /// Constructor. - Executor(int rank, int world_size, Model &model, const std::string &name, - int num_warps_per_sm = 16); + Executor(int rank, int world_size, const std::string &plan, + const std::string &name = "DefaultExecutor"); + ~Executor(); + /// Compile the model. This must be called before `launch()`. void compile(); + /// Launch the model (not running yet). This must be called after /// `compile()`. void launch(); + /// Run the model for `iter` iterations. void run(int iter); + /// Wait for the previous run to finish. void wait(); + /// Stop the model and return the elapsed time in milliseconds. /// Once this is called, we need to call `launch()` again to run the model /// again. 
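Taken together with ark::init() from the new init.hpp below, the intended lifecycle of this Executor interface is roughly the following. This is a minimal sketch, not code from this diff: `plan` is a placeholder for a serialized plan string, stop() is assumed to return the elapsed milliseconds as its comment states, and note that ark.hpp still has this header commented out at this point.

    #include "ark/executor.hpp"
    #include "ark/init.hpp"

    int main() {
        ark::init();
        std::string plan = "...";  // placeholder: a serialized plan
        ark::Executor exe(/*rank=*/0, /*world_size=*/1, plan);
        exe.compile();             // must precede launch()
        exe.launch();              // resident but idle until run()
        exe.run(/*iter=*/10);
        exe.wait();
        float elapsed = exe.stop();  // launch() is required again afterwards
        return 0;
    }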
@@ -36,4 +43,4 @@ class Executor { } // namespace ark -#endif // ARK_EXECUTOR_H +#endif // ARK_EXECUTOR_HPP diff --git a/ark/include/ark/init.hpp b/ark/include/ark/init.hpp new file mode 100644 index 000000000..00382f747 --- /dev/null +++ b/ark/include/ark/init.hpp @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_INIT_HPP +#define ARK_INIT_HPP + +namespace ark { + +/// Initialize the ARK runtime. +/// +/// This function should be called by the user before any other functions are +/// called. It is safe to call this function multiple times. +void init(); + +} // namespace ark + +#endif // ARK_INIT_HPP diff --git a/ark/include/ark/model.h b/ark/include/ark/model.h deleted file mode 100644 index a3bbc08cd..000000000 --- a/ark/include/ark/model.h +++ /dev/null @@ -1,1125 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#ifndef ARK_MODEL_H -#define ARK_MODEL_H - -#include -#include -#include -#include -#include - -#include "dims.h" - -namespace ark { - -class Tensor; -class CodeGenerator; -class BaseScheduler; -class SchedOp; -class Model; - -/// Type of tensor data. -class TensorType { - private: - const std::string name_; - const int bytes_; - const std::string type_str_; - - public: - TensorType(const std::string &name = "none", int bytes = 0, - const std::string &type_str = "void *"); - - bool operator==(const TensorType &other) const; - bool operator!=(const TensorType &other) const; - - int bytes() const; - const std::string &name() const; - const std::string &type_str() const; -}; - -const TensorType NONE; - -std::ostream &operator<<(std::ostream &os, const TensorType &type); - -#define REGISTER_TENSOR_TYPE(_type_name, _bytes, _type_str) \ - class TensorType_##_type_name : public TensorType { \ - public: \ - TensorType_##_type_name() \ - : TensorType{#_type_name, _bytes, _type_str} {} \ - }; \ - const TensorType_##_type_name _type_name; - -REGISTER_TENSOR_TYPE(FP32, 4, "fp32") -REGISTER_TENSOR_TYPE(FP16, 2, "fp16") -REGISTER_TENSOR_TYPE(BF16, 2, "bf16") -REGISTER_TENSOR_TYPE(INT32, 4, "i32") -REGISTER_TENSOR_TYPE(UINT32, 4, "ui32") -REGISTER_TENSOR_TYPE(INT8, 1, "i8") -REGISTER_TENSOR_TYPE(UINT8, 1, "ui8") -REGISTER_TENSOR_TYPE(BYTE, 1, "unsigned char") - -class GpuBuffer; -// TensorBuf refers to a data array that can be shared by multiple tensors. -class TensorBuf { - public: - TensorBuf(const DimType &bytes = 0, int id = -1); - TensorBuf(const TensorBuf &) = default; - - size_t get_buf_offset() const; - - DimType bytes; - int id; - bool immutable = true; - - protected: - std::shared_ptr buf = nullptr; - - friend class Tensor; - friend class DefaultScheduler; -}; - -/// Tensor is a view of a TensorBuf. -/// -/// Illustration of a single axis of a tensor: -/// -/// 0 off ldim -/// |------------|-------------shape-------------|---------------------------| -/// ^ <-----------------------------> ^ -/// | data range of this tensor | -/// +------------------------------------------+-----------+ -/// | -/// We call these "padding". -/// -class Tensor { - public: - /// Tensor constructor. - Tensor(const Dims &shape, const TensorType &type, TensorBuf *buf, - const Dims &ldims, const Dims &offs, const Dims &pads, bool exported, - int imported_rank, int id, const std::string &name); - Tensor(const Tensor &) = default; - - /// Copy contiguous data from a host buffer to the given tensor's (possibly - /// non-contiguous) data range. 
- /// - /// For example, say the tensor is a 2D float tensor with shape [2, 3], - /// ldims [2, 4], offs [0, 0], and pads [1, 1], then the data in the host - /// buffer is 0, 1, ..., 5. After writing, the data in the tensor will be: - /// - /// [[0, 1, 2, ?], - /// [3, 4, 5, ?]] - /// - /// where ? means the original unmodified value. - /// - /// @param buf The host buffer to copy from. The buffer must be large enough - /// to hold the data. - /// - void write(const void *buf); - - /// Copy (possibly non-contiguous) data from a tensor on GPU to a contiguous - /// host buffer. - /// - /// The given number of bytes is copied, in order of appearance on the - /// memory. This function assumes that @p buf is large enough to hold the - /// data. For example, say the tensor is a 2D float tensor with shape [2, - /// 3], ldims [2, 4], offs [0, 0], and pads [1, 1], then the data in the - /// tensor is: - /// - /// [[0, 1, 2, 3], - /// [4, 5, 6, 7]] - /// - /// After read, the data in the host buffer will be 0, 1, 2, 4, 5, 6. - /// - /// @param buf The host buffer to copy to. The buffer must be large enough - /// to hold the data. If @p buf is nullptr, a new buffer will be allocated. - /// @return The host buffer that holds the data. - /// - void *read(void *buf = nullptr); - - /// Copy all the underlying buffer data (including padding) to a contiguous - /// host buffer. - /// - /// This function is mainly for debugging purposes. - /// - /// @param buf The host buffer to copy to. The buffer must be large enough - /// to hold the data. If @p buf is nullptr, a new buffer will be allocated. - /// @return The host buffer that holds the data. - /// - void *read_raw(void *buf = nullptr); - - /// Set all bytes of the tensor buffer to 0. - void clear(); - - /// Offset to the element [i0][i1][i2][i3] of this tensor in the TensorBuf. - /// @param i0, i1, i2, i3 The indices of the element. - /// @return The offset in the number of elements. - DimType offset(DimType i0 = 0, DimType i1 = 0, DimType i2 = 0, - DimType i3 = 0) const; - - /// Number of elements in the tensor excluding padding. - /// @return The number of elements. - DimType size() const; - - /// Number of dimensions of the tensor. - /// @return The number of dimensions. - int ndims() const; - - /// Number of bytes of each element in the tensor. - /// @return The number of bytes. - int type_bytes() const; - - /// Number of bytes in the tensor's data range. - /// @return The number of bytes. - DimType shape_bytes() const; - - /// Equivalent as the number of bytes of the underlying @ref TensorBuf. - /// @return The number of bytes. - DimType ldims_bytes() const; - - /// Offset in bytes. - /// @param i0, i1, i2, i3 The indices of the element. - /// @return The offset in bytes. - DimType offset_bytes(DimType i0 = 0, DimType i1 = 0, DimType i2 = 0, - DimType i3 = 0) const; - - /// Checks if the tensor has the actually memory allocated. - /// @return True if the tensor has the memory allocated. - bool is_alloced() const; - - /// Checks if the tensor's data range is sequential in memory. - /// @return True if the tensor is sequential in memory. - bool is_sequential() const; - - /// TensorBuf that this tensor is associated with - TensorBuf *buf; - /// Data type of each element in the tensor - TensorType type; - /// Shape of the tensor - Dims shape; - /// Leading dimensions of the underlying data array - Dims ldims; - /// Offset of the tensor in the underlying data array - Dims offs; - /// Unit dimensions of the underlying data array. 
ldims[x] should be always - /// divided by pads[x]. - Dims pads; - /// Whether this tensor is local and accessed by remote devices. - bool exported; - /// If `imported_rank` is non-negative, the tensor is imported from another - /// rank and don't need to allocate a TensorBuf for it. - int imported_rank; - /// Unique id of this tensor - int id; - /// Name of this tensor - const std::string name; - - protected: - bool update_pads(const Dims &tile, const Tensor *ref_tensor = nullptr, - const Dims &ref_orig_ldims = {}); - - friend class DefaultScheduler; - friend class SchedOp; -}; - -/// Type of operator argument. -struct OpArgType { - OpArgType(size_t id, const std::string &name) : id(id), name(name) {} - size_t id; - std::string name; -}; - -bool operator==(const OpArgType &lhs, const OpArgType &rhs); - -bool operator!=(const OpArgType &lhs, const OpArgType &rhs); - -std::ostream &operator<<(std::ostream &os, const OpArgType &type); - -const OpArgType OP_ARG_INT(0, "int"); -const OpArgType OP_ARG_INT64(1, "int64"); -const OpArgType OP_ARG_UINT64(2, "uint64"); -const OpArgType OP_ARG_BOOL(3, "bool"); -const OpArgType OP_ARG_FLOAT(4, "float"); -const OpArgType OP_ARG_DIMS(5, "dims"); -const OpArgType OP_ARG_TENSOR(6, "tensor"); - -/// Stores an arbitrary type of argument given to an operator. -struct OpArg { - OpArg(int arg); - OpArg(long long int arg); - OpArg(uint64_t arg); - OpArg(bool arg); - OpArg(float arg); - OpArg(const Dims &arg); - OpArg(Tensor *arg); - OpArg(const OpArg &); - ~OpArg(); - - void get(int *arg) const; - void get(long long int *arg) const; - void get(uint64_t *arg) const; - void get(bool *arg) const; - void get(float *arg) const; - void get(Dims *arg) const; - void get(Tensor **arg) const; - - OpArgType type; - void *val; - - friend bool operator<(const OpArg &oa1, const OpArg &oa2); - friend bool operator==(const OpArg &oa1, const OpArg &oa2); -}; - -/// Stores a list of @ref OpArg. -class OpArgs { - public: - OpArgs(const std::vector &args = {}); - OpArgs(const OpArgs &) = default; - ~OpArgs(){}; - - OpArgs &operator=(const OpArgs &opargs); - - void put(const OpArg &arg); - - void get(int *arg, size_t idx) const; - void get(long long int *arg, size_t idx) const; - void get(uint64_t *arg, size_t idx) const; - void get(bool *arg, size_t idx) const; - void get(float *arg, size_t idx) const; - void get(Dims *arg, size_t idx) const; - void get(Tensor **arg, size_t idx) const; - - const std::vector &get_args() const; - - protected: - std::vector args; - - friend class Op; - friend bool operator<(const OpArgs &opargs1, const OpArgs &opargs2); - friend bool operator==(const OpArgs &opargs1, const OpArgs &opargs2); - friend bool operator!=(const OpArgs &opargs1, const OpArgs &opargs2); -}; - -/// Type of @ref Op. 
-struct OpType { - OpType(size_t id, const std::string &name) : id(id), name(name) {} - const size_t id; - std::string name; -}; - -bool operator==(const OpType &lhs, const OpType &rhs); - -const OpType OP_UNKNOWN(0, "unknown"); -const OpType OP_TENSOR(1, "tensor"); -const OpType OP_REFER(2, "refer"); -const OpType OP_RESHAPE(3, "reshape"); -const OpType OP_MERGE(4, "merge"); -const OpType OP_REDUCE_E_SUM(5, "reduce_e_sum"); -const OpType OP_REDUCE_E_MEAN(6, "reduce_e_mean"); -const OpType OP_REDUCE_E_MAX(7, "reduce_e_max"); -const OpType OP_REDUCE_W_SUM(8, "reduce_w_sum"); -const OpType OP_REDUCE_W_MEAN(9, "reduce_w_mean"); -const OpType OP_REDUCE_W_MAX(10, "reduce_w_max"); -const OpType OP_LAYERNORM(11, "layernorm"); -const OpType OP_SCALE(12, "scale"); -const OpType OP_RELU(13, "relu"); -const OpType OP_COPY(14, "copy"); -const OpType OP_GELU(15, "gelu"); -const OpType OP_SIGMOID(16, "sigmoid"); -const OpType OP_EXP(17, "exp"); -const OpType OP_SQRT(18, "sqrt"); -const OpType OP_RSQRT(19, "rsqrt"); -const OpType OP_MATMUL(20, "matmul"); -const OpType OP_MAX_POOL(21, "max_pool"); -const OpType OP_ADD(22, "add"); -const OpType OP_SUB(23, "sub"); -const OpType OP_MUL(24, "mul"); -const OpType OP_DIV(25, "div"); -const OpType OP_ROPE(26, "rope"); -const OpType OP_IM2COL(27, "im2col"); -const OpType OP_TRANSPOSE(28, "transpose"); -const OpType OP_SEND(29, "send"); -const OpType OP_SEND_DONE(30, "send_done"); -const OpType OP_RECV(31, "recv"); -const OpType OP_EMBEDDING(32, "embedding"); -const OpType OP_DEVICE_SYNC(33, "device_sync"); -const OpType OP_READ_AND_REDUCE(34, "read_and_reduce"); -const OpType OP_GATHER_FROM_PEERS(35, "gather_from_peers"); -const OpType OP_CAST(36, "cast"); -const OpType OP_PUT_PACKET(37, "put_packet"); -const OpType OP_REDUCE_AND_WRITE_PACKET(38, "reduce_and_write_packet"); -const OpType OP_GET_FROM_PACKET(39, "get_from_packet"); - -/// Type of hardware architecture support. -typedef enum { - OP_ARCH_UNKNOWN = 0, - OP_ARCH_CUDA_60 = 0x1, - OP_ARCH_CUDA_70 = 0x2, - OP_ARCH_CUDA_80 = 0x4, - OP_ARCH_CUDA_90 = 0x8, - OP_ARCH_CUDA_ANY = 0x0f, - OP_ARCH_ROCM_90A = 0x10, - OP_ARCH_ROCM_942 = 0x20, - OP_ARCH_ROCM_ANY = 0xf0, - OP_ARCH_ANY = -1, -} OpArchType; - -OpArchType op_arch_from_string(const std::string &arch); - -/// 2-dimensional op tile -struct OpTile { - DimType x; - DimType y; -}; - -/// Configurations for execution of a @ref Op. -struct OpConfig { - int num_warps = 0; - int smem_bytes = 0; - std::vector input_tiles; - std::vector output_tiles; - bool sync_pre = false; - bool sync_post = false; -}; - -/// Key to find a list of OpConfigs from OpConfigMap. -struct OpConfigKey { - OpArchType arch_type; - std::string prec_type; -}; - -bool operator<(const OpConfigKey &ops1, const OpConfigKey &ops2); - -bool operator==(const OpConfigKey &ops1, const OpConfigKey &ops2); - -/// Map from OpConfigKey to a list of OpConfigs. -class OpConfigMap { - public: - OpConfigMap(std::initializer_list< - std::pair>> - ilist); - ~OpConfigMap(){}; - - const std::vector &get(const OpConfigKey &key) const; - - private: - const std::map> cfg_map; -}; - -/// Operator. -class Op { - public: - /// Construct an operator. - Op() = default; - - /// Construct an operator. - /// @param type the type of the @ref Op. - /// @param prec_type the precision type of the @ref Op. - /// @param inputs the input tensors of the @ref Op, including execution - /// dependencies. - /// @param output_refs the output reference tensors of the @ref Op. Output - /// tensors are created based on these references. 
- /// @param args the arguments of the @ref Op. - /// @param name the name of the @ref Op. - /// @param cfg_map the configuration map of the @ref Op - /// @param gran_lev the granularity level of the @ref Op. Larger values - /// should indicate finer-grained Ops. If it is -1, the granularity level - /// will be automatically determined by the scheduler. - /// @param force_inline whether to force inline the kernel of @ref Op. - Op(const OpType &type, const std::string &prec_type, - const std::vector &inputs, - const std::vector &output_refs, const OpArgs &args, - const std::string &name, const OpConfigMap *cfg_map = nullptr, - int gran_lev = -1, bool force_inline = false); - - /// Construct an operator. - Op(const Op &) = default; - - /// Destruct the operator. - ~Op(){}; - - /// Return the kernel function name of the operator. Includes the template - /// arguments of the kernel, if any. - /// @param cfg the configuration of the operator. - /// @return the kernel function name of the operator. - std::string function_name(const OpConfig &) const; - - /// Return the kernel function's runtime arguments of the operator. - /// @param cfg the configuration of the operator. - /// @return the runtime arguments of the kernel function. - OpArgs function_call_args(const OpConfig &) const; - - /// Returns true if the operator is virtual (i.e., performs no computation). - bool is_virtual() const; - - /// Returns true if the operator is a communication operator. - bool is_comm() const; - - /// Type of the operator. - OpType type; - /// Precision type of the operator. - std::string prec_type; - /// The input tensors of the operator. - std::vector inputs; - /// The output tensors of the operator. - std::vector outputs; - /// The reference tensors of the output tensors. - std::vector output_refs; - /// Additional arguments of the operator. - OpArgs args; - /// Name of the operator. - std::string name; - /// Map from OpConfigKey to a list of OpConfigs. - const OpConfigMap *cfg_map; - /// Granularity level of the operator. - int gran_lev; - /// Force inlining of the operator kernel. - bool force_inline; - - friend bool operator<(const Op &op1, const Op &op2); - friend bool operator==(const Op &op1, const Op &op2); - - protected: - static std::string function_name(const std::string &kernel_name, - const OpArgs &template_args); -}; - -/// List all operator classes below. 
- -class ArithmeticOp : public Op { - public: - ArithmeticOp(const OpType &type, const std::string &prec_type, - Tensor *input, Tensor *other, Tensor *output, - const std::string &name); - - protected: - std::string function_name(const OpConfig &cfg, - const std::string &type) const; -}; - -class AddOp : public ArithmeticOp { - public: - AddOp(const std::string &prec_type, Tensor *input, Tensor *other, - Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class SubOp : public ArithmeticOp { - public: - SubOp(const std::string &prec_type, Tensor *input, Tensor *other, - Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class MulOp : public ArithmeticOp { - public: - MulOp(const std::string &prec_type, Tensor *input, Tensor *other, - Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class DivOp : public ArithmeticOp { - public: - DivOp(const std::string &prec_type, Tensor *input, Tensor *other, - Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class MathOp : public Op { - public: - MathOp(const OpType &type, const std::string &prec_type, Tensor *input, - Tensor *output, const std::string &name); - - protected: - std::string function_name(const OpConfig &cfg, - const std::string &type) const; -}; - -class GeluOp : public MathOp { - public: - GeluOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ExpOp : public MathOp { - public: - ExpOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReluOp : public MathOp { - public: - ReluOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class RsqrtOp : public MathOp { - public: - RsqrtOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class SigmoidOp : public MathOp { - public: - SigmoidOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class SqrtOp : public MathOp { - public: - SqrtOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class RopeOp : public Op { - public: - RopeOp(const std::string &prec_type, Tensor *input, Tensor *other, - Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class Im2colOp : public Op { - public: - Im2colOp(const std::string &prec_type, Tensor *input, Tensor *output, - int kernel_height, int kernel_width, int stride_height, - int stride_width, int pad_height, int pad_width, - int dilation_height, int dilation_width, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class LayernormOp : public Op { - public: - LayernormOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class MatmulOp : public Op { - public: - MatmulOp(const std::string &prec_type, Tensor *mat_a, Tensor *mat_b, - Tensor *mat_y, 
Dims nca, Dims ncb, Dims problem_size, - Dims leading_dims, bool is_column_a, bool is_column_b, - const std::string &name, int gran_lev); - std::string function_name(const OpConfig &cfg) const; -}; - -class MaxPoolOp : public Op { - public: - MaxPoolOp(const std::string &prec_type, Tensor *input, Tensor *output, - DimType kernel_size, DimType stride, const std::string &name); -}; - -class ReduceOp : public Op { - public: - ReduceOp(const OpType &type, const std::string &prec_type, - const std::vector &inputs, - const std::vector &outputs, const OpArgs &args, - const std::string &name, const OpConfigMap *cfg_map, int gran_lev); - - protected: - std::string function_name(const OpConfig &cfg, - const std::string &type) const; -}; - -class ReduceWSumOp : public ReduceOp { - public: - ReduceWSumOp(const std::string &prec_type, Tensor *input, Tensor *output, - int axis, bool keepdims, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReduceESumOp : public ReduceOp { - public: - ReduceESumOp(const std::string &prec_type, Tensor *input, Tensor *output, - int axis, bool keepdims, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReduceWMaxOp : public ReduceOp { - public: - ReduceWMaxOp(const std::string &prec_type, Tensor *input, Tensor *output, - int axis, bool keepdims, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReduceEMaxOp : public ReduceOp { - public: - ReduceEMaxOp(const std::string &prec_type, Tensor *input, Tensor *output, - int axis, bool keepdims, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReduceWMeanOp : public ReduceOp { - public: - ReduceWMeanOp(const std::string &prec_type, Tensor *input, Tensor *output, - int axis, bool keepdims, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReduceEMeanOp : public ReduceOp { - public: - ReduceEMeanOp(const std::string &prec_type, Tensor *input, Tensor *output, - int axis, bool keepdims, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class CopyOp : public Op { - public: - CopyOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReshapeOp : public Op { - public: - ReshapeOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); -}; - -class ScaleOp : public Op { - public: - ScaleOp(const std::string &prec_type, Tensor *input, Tensor *output, - float val, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &) const; -}; - -class SendOp : public Op { - public: - SendOp(const std::string &prec_type, Tensor *input, Tensor *recvbuf, - int sid, int rank, int dst_rank, size_t bytes, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; - // The args determined by the scheduler. 
- OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class RecvOp : public Op { - public: - RecvOp(const std::string &prec_type, Tensor *output, int sid, int rank, - int src_rank, size_t bytes, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class SendDoneOp : public Op { - public: - SendDoneOp(const std::string &prec_type, Tensor *input, int rank, - int dst_rank, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class DeviceSyncOp : public Op { - public: - DeviceSyncOp(const std::string &prec_type, Tensor *input, Tensor *output, - int nranks, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class ReadAndReduceOp : public Op { - public: - ReadAndReduceOp(const std::string &prec_type, Tensor *local_buf, - Tensor *cal_region_local, std::vector remote_bufs, - int sid, int rank, int npeers, size_t offset, size_t bytes, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class GatherFromPeersOp : public Op { - public: - GatherFromPeersOp(const std::string &prec_type, Tensor *local_buf, - Tensor *trans_region_local, - std::vector remote_bufs, int sid, int rank, - int npeers, size_t stride, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class PutPacketOp : public Op { - public: - PutPacketOp(const std::string &prec_type, Tensor *input, - Tensor *local_tmp_buf, Tensor *recv_buf, int id, int rank, - int dst_rank, size_t dst_offset, int flag, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class ReduceAndWritePacketOp : public Op { - public: - ReduceAndWritePacketOp(const std::string &prec_type, - std::vector inputs, Tensor *output, int id, - int rank, int npeers, size_t elems_per_rank, - size_t scratch_offset, size_t remote_dst_offset, - int flag, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class GetFromPacketOp : public Op { - public: - GetFromPacketOp(const std::string &prec_type, Tensor *input, Tensor *output, - size_t src_offset, size_t dst_offset, size_t npackets, - int flag, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class TensorOp : public Op { - public: - TensorOp(const std::vector &deps, Tensor *output, - const std::string &name); -}; - -class TransposeOp : public Op { - public: - TransposeOp(const std::string &prec_type, Tensor *input, Tensor *output, - int tp_type, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class EmbeddingOp : public Op { - public: - EmbeddingOp(const std::string &prec_type, Tensor *input, Tensor *weight, - Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class CastOp : public Op { - public: - CastOp(Tensor *input, Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -/// A node of @ref Model. 
-class OpNode { - public: - /// Construct an empty @ref OpNode. - OpNode(){}; - - /// Destruct an @ref OpNode. - ~OpNode(){}; - - /// The list of @ref Op that this @ref OpNode contains. Sorted in the - /// execution order. - std::vector ops; - - /// The list of @ref OpNode that depends on this @ref OpNode. - std::set users; - - /// The list of @ref OpNode that this @ref OpNode depends on. - std::set producers; - - /// Remove this @ref OpNode from the graph. - void remove_self(); - - /// Get the name of this @ref OpNode. - std::string get_name() const; -}; - -class Model { - public: - // Constructors. - Model(int rank_ = 0); - Model(const Model &) = delete; - Model &operator=(const Model &) = delete; - - ~Model(); - - /// Verify if this model is valid. - /// @return true if the model is valid, false otherwise. - bool verify() const; - - void create_nodes(); - void clear_nodes(); - - /// Get the @ref OpNode list. - /// @return The @ref OpNode list. - const std::list> &get_nodes() const; - - /// Break a @ref OpNode into two @ref OpNode. - /// - /// The original node will have the first @p op_idx ops, and the new node - /// will have the rest. - /// - /// @param node The @ref OpNode to break. - /// @param op_idx The index of the first op in the new @ref OpNode. - /// @return The new @ref OpNode. - OpNode *break_node(OpNode *node, int op_idx); - - /// Check dependencies between two @ref OpNode. - /// - /// @param node1 The first @ref OpNode. - /// @param node2 The second @ref OpNode. - /// @return True if @p node1 depends on @p node2. - bool depends_on(OpNode *node1, OpNode *node2) const; - - std::string serialize(int indent = -1) const; - - /// Returns a tensor object. - /// - /// @param shape Shape of the tensor, where the data of interest is. - /// @param ttype Type of the tensor data. - /// @param buf The @ref TensorBuf that holds the entire data including the - /// padding. - /// @param ldims Leading dimensions (ldim) of the tensor, which may be - /// different from the shape. @p ldims can be considered as the actual shape - /// of the underlying data buffer (@ref TensorBuf). - /// @param offs Offsets of the tensor. The data of interest starts at - /// @p offs and ends at @p offs + @p shape. - /// @param pads If a dimension of @p pads is set to larger than 1, the - /// corresponding ldim will be set to the minimum multiple of @p pads that - /// is larger than or equal to the previous ldim. Padding is accumulated - /// across all tensors that share the same @ref TensorBuf. For example, if - /// one tensor sets the last dimension of @p pads to 2, and another tensor - /// sets the last dimension of @p pads to 3, then the corresponding ldim - /// will be the minimum multiple of 2x3=6 that is larger than or equal to - /// the corresponding dimension of @p offs + @p shape. - /// @param exported Whether the tensor is exported to other processes. This - /// should be set to true if the tensor is used as an input or output of a - /// remote process. - /// @param imported_rank The rank of the process that exports the tensor. - /// If @p imported_rank is set to a non-negative value, the tensor will be - /// considered as a remote tensor, hence no memory will be allocated for it - /// on the local. @p imported_rank should be set to -1 if the tensor resides - /// on the local. - /// @param name Name of the tensor. - /// @return Pointer to a tensor object. 
- /// - Tensor *tensor(const Dims &shape, const TensorType &ttype, - TensorBuf *buf = nullptr, const Dims &ldims = {}, - const Dims &offs = {}, const Dims &pads = {}, - const std::vector &deps = {}, - bool exported = false, int imported_rank = -1, - const std::string &name = "tensor"); - - Tensor *reshape(Tensor *input, const Dims &shape, bool allowzero = false, - Tensor *output = nullptr, - const std::string &name = "reshape"); - Tensor *reshape(Tensor *input, const std::initializer_list &shape, - bool allowzero = false, Tensor *output = nullptr, - const std::string &name = "reshape"); - // Reshape `input` to `shape`. If one dimension of `shape` is -1, it will be - // inferred from the `input`. If one dimension of `shape` is 0, by default - // (`allowzero` is false), that dimension is unchanged from the - // corresponding one of `input`. If `allowzero` is true, that dimension is - // set to 0, which means that the reshaped tensor is an empty tensor, i.e., - // `input` should also be an empty tensor. If `allowzero` is true, `shape` - // should not include both 0 and -1 at the same time. If `shape` is an empty - // vector, `input` will be converted to a scalar. - Tensor *reshape(Tensor *input, const std::vector &shape, - bool allowzero = false, Tensor *output = nullptr, - const std::string &name = "reshape"); - // Returns an identical tensor of `input` with execution dependencies - // `deps`. - Tensor *identity(Tensor *input, const std::vector &deps = {}, - const std::string &name = "identity"); - - // Shard `input` along `axis` into `dim_per_shard`-dimensional shards. - std::vector sharding(Tensor *input, DimType axis, - DimType dim_per_shard, - const std::string &name = "sharding"); - // Performs reduction along the `axis` of the `input` tensor and stores the - // result in `output`. - // Currently, only reduction along the last dimension is supported. - template - Tensor *reduce(Tensor *input, int axis, bool keepdims = true, - Tensor *output = nullptr, - const std::string &name = "reduce"); - Tensor *reduce_sum(Tensor *input, int axis, bool keepdims = true, - Tensor *output = nullptr, - const std::string &name = "reduce_sum"); - Tensor *reduce_mean(Tensor *input, int axis, bool keepdims = true, - Tensor *output = nullptr, - const std::string &name = "reduce_mean"); - Tensor *reduce_max(Tensor *input, int axis, bool keepdims = true, - Tensor *output = nullptr, - const std::string &name = "reduce_max"); - // Applies layer normalization to the `input` tensor and returns the - // normalized tensor as `output`. - Tensor *layernorm(Tensor *input, Tensor *output = nullptr, - const std::string &name = "layernorm"); - // Transposes the `input` tensor according to the given `perm` permutation. - // For example, transpose(input, {0, 1 ,3, 2}) will swap the last two - // dimensions of the input tensor. Currently, only 4D tensors are supported. - Tensor *transpose(Tensor *input, Dims perm, Tensor *output = nullptr, - const std::string &name = "transpose"); - // Performs matrix multiplication between the `input` tensor and another - // `other` tensor, storing the result in `output`. - Tensor *matmul(Tensor *input, Tensor *other, Tensor *output = nullptr, - DimType splitk = 1, bool trans_input = false, - bool trans_other = false, const std::string &name = "matmul", - int gran_lev = -1); - // Implements the 'im2col' method for 2D convolution layers, which takes an - // `input` tensor and reshapes it to a 2D matrix by extracting image patches - // from the input tensor based on the provided parameters. 
- Tensor *im2col(Tensor *input, int kernel_height, int kernel_width, - int stride_height, int stride_width, int pad_height, - int pad_width, int dilation_height, int dilation_width, - Tensor *output = nullptr, - const std::string &name = "im2col"); - // Applies max-pooling on the `input` tensor using `kernel_size` and - // `stride`, reducing its spatial size. The output shape is calculated based - // on the input tensor's shape and the stride value as follows: {is[0], - // (is[1] + stride - 1) / stride, (is[2] + stride - 1) / stride, is[3]}, - // where 'is' represents the input tensor's shape. - Tensor *max_pool(Tensor *input, DimType kernel_size, DimType stride, - Tensor *output = nullptr, - const std::string &name = "max_pool"); - // Multiplies the `input` tensor by a scalar `val`, element-wise. - Tensor *scale(Tensor *input, float val, Tensor *output = nullptr, - const std::string &name = "scale"); - // - template - Tensor *math(Tensor *input, Tensor *output = nullptr, - const std::string &name = "math"); - // Calculates the exponential of the `input` tensor, element-wise. - Tensor *exp(Tensor *input, Tensor *output = nullptr, - const std::string &name = "exp"); - // Calculates the square root of the `input` tensor, element-wise. - Tensor *sqrt(Tensor *input, Tensor *output = nullptr, - const std::string &name = "sqrt"); - // Calculates the reverse square root of the `input` tensor, element-wise. - Tensor *rsqrt(Tensor *input, Tensor *output = nullptr, - const std::string &name = "rsqrt"); - // ReLU activation - Tensor *relu(Tensor *input, Tensor *output = nullptr, - const std::string &name = "relu"); - // Copy the `input` tensor to `output` tensor - Tensor *copy(Tensor *input, Tensor *output = nullptr, - const std::string &name = "copy"); - // Applies the Gaussian Error Linear Unit (GELU) activation function to the - // `input` tensor, element-wise. GELU is a smooth approximation of the - // rectifier function and is widely used in deep learning models. - Tensor *gelu(Tensor *input, Tensor *output = nullptr, - const std::string &name = "gelu"); - // Sigmoid activation - Tensor *sigmoid(Tensor *input, Tensor *output = nullptr, - const std::string &name = "sigmoid"); - // Performs rotary position embedding (RoPE) on the `input` tensor - Tensor *rope(Tensor *input, Tensor *other, Tensor *output = nullptr, - const std::string &name = "rope"); - // Template for broadcated arithmetic operators. - template - Tensor *arithmetic(Tensor *input, Tensor *other, Tensor *output = nullptr, - const std::string &name = "arithmeitc"); - // Performs an element-wise addition operator between the `input` tensor - // and the `other` tensor - Tensor *add(Tensor *input, Tensor *other, Tensor *output = nullptr, - const std::string &name = "add"); - // Performs an element-wise subtraction operator between the `input` tensor - // and the `other` tensor - Tensor *sub(Tensor *input, Tensor *other, Tensor *output = nullptr, - const std::string &name = "sub"); - // Performs an element-wise multiplication operator between the `input` - // tensor and the `other` tensor, - Tensor *mul(Tensor *input, Tensor *other, Tensor *output = nullptr, - const std::string &name = "mul"); - // Performs an element-wise division operator between the `input` - // tensor and the `other` tensor, - Tensor *div(Tensor *input, Tensor *other, Tensor *output = nullptr, - const std::string &name = "div"); - /// Sends a tensor to a destination rank (@p dst_rank). 
Multiple tensors can - /// be sent to the same rank,so an identifier `id` is required to - /// distinguish the tensor. Each 'send' operator must have a corresponding - /// 'recv' operator that have the same id in another rank's model. - /// - /// @param input - /// @param id - /// @param dst_rank Rank of the GPU to send to. - /// @param bytes - /// @param name - /// @return - Tensor *send(Tensor *input, int sid, int dst_rank, std::size_t bytes = 0, - const std::string &name = "send"); - // Blocks the execution until the corresponding 'send' operator with the - // specified `id` is completed. - Tensor *send_done(Tensor *input, int sid, int dst_rank, - const std::string &name = "send_done"); - // Receives a tensor from a source rank (@p src_rank), identified by the - // `id` parameter. Blocks the execution until the corresponding 'recv' - // operator is completed. - Tensor *recv(int sid, int src_rank, std::size_t bytes = 0, - Tensor *output = nullptr, const std::string &name = "recv"); - // - Tensor *put_packet(Tensor *input, Tensor *local_tmp_buf, Tensor *recv_buf, - int id, int rank, int dst_rank, size_t dst_offset, - int flag, const std::string &name = "put_packet"); - // Performs an all-reduce operator across all ranks, aggregating the input - // tensors. Takes the `input` tensor, the current GPU's rank, and the - // total number of ranks `rank_num`. - Tensor *all_reduce(Tensor *input, int rank, int rank_num, - Tensor *output = nullptr, - const std::string &name = "all_reduce"); - // Performs an all-gather operator across all ranks, aggregating the input - // tensors. Takes the `input` tensor, the current GPU's rank, and the - // total number of ranks `rank_num`. Returns a vector of tensors, each - // containing the aggregated data from all ranks. - std::vector all_gather(Tensor *input, int rank, int rank_num, - const std::vector &output = {}, - const std::string &name = "all_gather"); - /// Embedding layer. - Tensor *embedding(Tensor *input, Tensor *weight, Tensor *output = nullptr, - const std::string &name = "embedding"); - /// Tensor type casting. 
- Tensor *cast(Tensor *input, const TensorType &ttype, - Tensor *output = nullptr, const std::string &name = "cast"); - - // sync across multi devices - Tensor *device_sync(Tensor *input, int npeers, - const std::string &name = "device_sync"); - - // local reduce scatter - Tensor *local_reduce_scatter( - Tensor *input, int gpu_id, int ngpus_per_node, - const std::string &name = "local_reduce_scatter"); - - // local all gather - Tensor *local_all_gather(Tensor *input, int gpu_id, int ngpus_per_node, - int axis = 0, - const std::string &name = "local_all_gather"); - // read data from remote and reduce to current buffer - Tensor *read_and_reduce(Tensor *input, int sid, int npeers, size_t offset, - size_t bytes, - const std::string &name = "read_and_reduce"); - // gather from peers - Tensor *gather_from_peers(Tensor *input, Tensor *tile, int sid, int npeers, - size_t chunkBytes, - const std::string &name = "gather_from_peers"); - - Tensor *local_all_reduce(Tensor *input, int gpu_id, int gpu_num, - const std::string &name = "local_all_reduce"); - Tensor *local_all_reduce_packet( - Tensor *input, int gpu_id, int gpu_num, - const std::string &name = "local_all_reduce_packet"); - - Tensor *reduce_and_write_packet( - Tensor *input, Tensor *scratch, Tensor *output, - const std::vector<Tensor *> &remote_peer_bufs, int id, int rank, - int npeers, size_t elems_per_rank, size_t scratch_offset, - size_t remote_dst_offset, int flag, - const std::string &name = "reduce_and_write_packet"); - Tensor *get_packet(Tensor *input, Tensor *output, size_t src_offset, - size_t dst_offset, size_t npackets, int flag, - const std::string &name = "get_packet"); - - protected: - class Impl; - friend class DefaultScheduler; - - private: - std::unique_ptr<Impl> impl; -}; - -} // namespace ark - -#endif // ARK_MODEL_H diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp new file mode 100644 index 000000000..bc9fa63b1 --- /dev/null +++ b/ark/include/ark/model.hpp @@ -0,0 +1,315 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_MODEL_HPP +#define ARK_MODEL_HPP + +#include +#include + +#include "dims.hpp" +#include "model_graph.hpp" +#include "model_ref.hpp" + +namespace ark { + +class ModelDataT; +using ModelDataType = std::shared_ptr<ModelDataT>; + +extern const ModelDataType NONE; +extern const ModelDataType FP32; +extern const ModelDataType FP16; +extern const ModelDataType BF16; +extern const ModelDataType INT32; +extern const ModelDataType UINT32; +extern const ModelDataType INT8; +extern const ModelDataType UINT8; +extern const ModelDataType BYTE; + +class Model : public ModelGraph { + private: + int rank_; + + public: + Model(int rank = 0) : rank_(rank) {} + Model(const Model &other) : ModelGraph(other), rank_(other.rank()) {} + ~Model() {} + + Model &operator=(const Model &other) = default; + + int rank() const { return rank_; } + + Model compress() const; + + /// Returns a tensor object. + /// + /// @param shape Shape of the tensor, where the data of interest is. + /// @param data_type Type of the tensor data. + /// @param strides Leading dimensions (ldim) of the tensor, which may be + /// different from the shape. @p strides can be considered as the actual + /// shape of the underlying data buffer (@ref TensorBuf). + /// @param offsets Offsets of the tensor. The data of interest starts at + /// @p offsets and ends at @p offsets + @p shape.
+ /// @param pads If a dimension of @p pads is set to larger than 1, the + /// corresponding ldim will be set to the minimum multiple of @p pads that + /// is larger than or equal to the previous ldim. Padding is accumulated + /// across all tensors that share the same @ref TensorBuf. For example, if + /// one tensor sets the last dimension of @p pads to 2, and another tensor + /// sets the last dimension of @p pads to 3, then the corresponding ldim + /// will be the minimum multiple of 2x3=6 that is larger than or equal to + /// the corresponding dimension of @p offsets + @p shape. + /// @param exported Whether the tensor is exported to other processes. This + /// should be set to true if the tensor is used as an input or output of a + /// remote process. + /// @param imported_rank The rank of the process that exports the tensor. + /// If @p imported_rank is set to a non-negative value, the tensor will be + /// considered as a remote tensor, hence no memory will be allocated for it + /// locally. @p imported_rank should be set to -1 if the tensor resides + /// locally. + /// @param name Name of the tensor. + /// @return Pointer to a tensor object. + /// + ModelTensorRef tensor(const Dims &shape, ModelDataType data_type, + const Dims &strides = {}, const Dims &offsets = {}, + const Dims &pads = {}, bool exported = false, + int imported_rank = -1, const std::string &name = ""); + + ModelTensorRef refer(ModelTensorRef input, const Dims &shape = {}, + const Dims &strides = {}, const Dims &offsets = {}, + const Dims &pads = {}, const std::string &name = ""); + + ModelTensorRef reshape(ModelTensorRef input, const Dims &shape, + bool allowzero = false, + ModelTensorRef output = nullptr, + const std::string &name = ""); + ModelTensorRef reshape(ModelTensorRef input, + const std::initializer_list<DimType> &shape, + bool allowzero = false, + ModelTensorRef output = nullptr, + const std::string &name = ""); + // Reshape `input` to `shape`. If one dimension of `shape` is -1, it will be + // inferred from the `input`. If one dimension of `shape` is 0, by default + // (`allowzero` is false), that dimension is unchanged from the + // corresponding one of `input`. If `allowzero` is true, that dimension is + // set to 0, which means that the reshaped tensor is an empty tensor, i.e., + // `input` should also be an empty tensor. If `allowzero` is true, `shape` + // should not include both 0 and -1 at the same time. If `shape` is an empty + // vector, `input` will be converted to a scalar. + ModelTensorRef reshape(ModelTensorRef input, + const std::vector<DimType> &shape, + bool allowzero = false, + ModelTensorRef output = nullptr, + const std::string &name = ""); + // Returns an identical tensor of `input` with execution dependencies + // `deps`. + ModelTensorRef identity(ModelTensorRef input, + const std::vector<ModelTensorRef> &deps = {}, + const std::string &name = ""); + + // Shard `input` along `axis` into `dim_per_shard`-dimensional shards. + std::vector<ModelTensorRef> sharding(ModelTensorRef input, DimType axis, + DimType dim_per_shard, + const std::string &name = "");
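To make the pads rule above concrete, a hypothetical example (numbers invented for illustration): a tensor of shape {2, 10} created with pads {1, 4} forces the last leading dimension up to the smallest multiple of 4 that is >= 10, so the underlying buffer is laid out as if the shape were {2, 12}.

    ark::Model model;
    ark::ModelTensorRef t = model.tensor(
        /*shape=*/{2, 10}, ark::FP32,
        /*strides=*/{}, /*offsets=*/{}, /*pads=*/{1, 4});
    // The buffer behind `t` spans 2 x 12 floats; elements [i][10..11] are padding.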
+    template <typename ReduceOpT>
+    ModelTensorRef reduce(ModelTensorRef input, int axis, bool keepdims = true,
+                          ModelTensorRef output = nullptr,
+                          const std::string &name = "");
+    ModelTensorRef reduce_sum(ModelTensorRef input, int axis,
+                              bool keepdims = true,
+                              ModelTensorRef output = nullptr,
+                              const std::string &name = "");
+    ModelTensorRef reduce_mean(ModelTensorRef input, int axis,
+                               bool keepdims = true,
+                               ModelTensorRef output = nullptr,
+                               const std::string &name = "");
+    ModelTensorRef reduce_max(ModelTensorRef input, int axis,
+                              bool keepdims = true,
+                              ModelTensorRef output = nullptr,
+                              const std::string &name = "");
+    // Applies layer normalization to the `input` tensor and returns the
+    // normalized tensor as `output`.
+    ModelTensorRef layernorm(ModelTensorRef input,
+                             ModelTensorRef output = nullptr,
+                             const std::string &name = "");
+    // Transposes the `input` tensor according to the given `perm`
+    // permutation. For example, transpose(input, {0, 1, 3, 2}) will swap the
+    // last two dimensions of the input tensor. Currently, only 4D tensors are
+    // supported.
+    ModelTensorRef transpose(ModelTensorRef input, Dims perm,
+                             ModelTensorRef output = nullptr,
+                             const std::string &name = "");
+    // Performs matrix multiplication between the `input` tensor and the
+    // `other` tensor, storing the result in `output`.
+    ModelTensorRef matmul(ModelTensorRef input, ModelTensorRef other,
+                          ModelTensorRef output = nullptr,
+                          bool trans_input = false, bool trans_other = false,
+                          const std::string &name = "");
+    // Implements the 'im2col' method for 2D convolution layers, which takes
+    // an `input` tensor and reshapes it to a 2D matrix by extracting image
+    // patches from the input tensor based on the provided parameters.
+    ModelTensorRef im2col(ModelTensorRef input, int kernel_height,
+                          int kernel_width, int stride_height,
+                          int stride_width, int pad_height, int pad_width,
+                          int dilation_height, int dilation_width,
+                          ModelTensorRef output = nullptr,
+                          const std::string &name = "");
+    // Applies max-pooling on the `input` tensor using `kernel_size` and
+    // `stride`, reducing its spatial size. The output shape is calculated
+    // based on the input tensor's shape and the stride value as follows:
+    // {is[0], (is[1] + stride - 1) / stride, (is[2] + stride - 1) / stride,
+    // is[3]}, where 'is' represents the input tensor's shape.
+    ModelTensorRef max_pool(ModelTensorRef input, DimType kernel_size,
+                            DimType stride, ModelTensorRef output = nullptr,
+                            const std::string &name = "");
+    // Multiplies the `input` tensor by a scalar `val`, element-wise.
+    ModelTensorRef scale(ModelTensorRef input, float val,
+                         ModelTensorRef output = nullptr,
+                         const std::string &name = "");
+    // Applies the element-wise math operator `MathOpT` to the `input` tensor.
+    template <typename MathOpT>
+    ModelTensorRef math(ModelTensorRef input, ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+    // Calculates the exponential of the `input` tensor, element-wise.
+    ModelTensorRef exp(ModelTensorRef input, ModelTensorRef output = nullptr,
+                       const std::string &name = "");
+    // Calculates the square root of the `input` tensor, element-wise.
+    ModelTensorRef sqrt(ModelTensorRef input, ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+    // Calculates the reciprocal square root of the `input` tensor,
+    // element-wise.
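+    // (i.e., rsqrt(x) = 1 / sqrt(x))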
+    ModelTensorRef rsqrt(ModelTensorRef input, ModelTensorRef output = nullptr,
+                         const std::string &name = "");
+    // ReLU activation.
+    ModelTensorRef relu(ModelTensorRef input, ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+    // Copies the `input` tensor to the `output` tensor.
+    ModelTensorRef copy(ModelTensorRef input, ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+    // Applies the Gaussian Error Linear Unit (GELU) activation function to
+    // the `input` tensor, element-wise. GELU is a smooth approximation of the
+    // rectifier function and is widely used in deep learning models.
+    ModelTensorRef gelu(ModelTensorRef input, ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+    // Sigmoid activation.
+    ModelTensorRef sigmoid(ModelTensorRef input,
+                           ModelTensorRef output = nullptr,
+                           const std::string &name = "");
+    // Performs rotary position embedding (RoPE) on the `input` tensor.
+    ModelTensorRef rope(ModelTensorRef input, ModelTensorRef other,
+                        ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+
+    // Performs element-wise addition between the `input` tensor and the
+    // `other` tensor.
+    ModelTensorRef add(ModelTensorRef input, ModelTensorRef other,
+                       ModelTensorRef output = nullptr,
+                       const std::string &name = "");
+    // Performs element-wise subtraction between the `input` tensor and the
+    // `other` tensor.
+    ModelTensorRef sub(ModelTensorRef input, ModelTensorRef other,
+                       ModelTensorRef output = nullptr,
+                       const std::string &name = "");
+    // Performs element-wise multiplication between the `input` tensor and the
+    // `other` tensor.
+    ModelTensorRef mul(ModelTensorRef input, ModelTensorRef other,
+                       ModelTensorRef output = nullptr,
+                       const std::string &name = "");
+    // Performs element-wise division between the `input` tensor and the
+    // `other` tensor.
+    ModelTensorRef div(ModelTensorRef input, ModelTensorRef other,
+                       ModelTensorRef output = nullptr,
+                       const std::string &name = "");
+    /// Sends a tensor to a destination rank (@p dst_rank). Multiple tensors
+    /// can be sent to the same rank, so an identifier @p sid is required to
+    /// distinguish the tensors. Each 'send' operator must have a
+    /// corresponding 'recv' operator that has the same id in another rank's
+    /// model.
+    ///
+    /// @param input Tensor to send.
+    /// @param sid Identifier of this send/recv pair.
+    /// @param dst_rank Rank of the GPU to send to.
+    /// @param bytes Number of bytes to send. If 0, the whole tensor is sent.
+    /// @param name Name of the operator.
+    /// @return
+    ModelTensorRef send(ModelTensorRef input, int sid, int dst_rank,
+                        DimType bytes = 0, const std::string &name = "");
+    // Blocks the execution until the corresponding 'send' operator with the
+    // specified `sid` is completed.
+    ModelTensorRef send_done(ModelTensorRef input, int sid, int dst_rank,
+                             const std::string &name = "");
+    // Receives a tensor from a source rank (@p src_rank), identified by the
+    // `sid` parameter. Blocks the execution until the data is fully received.
+    ModelTensorRef recv(int sid, int src_rank, DimType bytes = 0,
+                        ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+    // Writes `input` as packets into the receive buffer `recv_buf` of rank
+    // `dst_rank` at `dst_offset`, tagging each packet with `flag`.
+    ModelTensorRef put_packet(ModelTensorRef input,
+                              ModelTensorRef local_tmp_buf,
+                              ModelTensorRef recv_buf, int id, int rank,
+                              int dst_rank, size_t dst_offset, int flag,
+                              const std::string &name = "");
+    // Performs an all-reduce operator across all ranks, aggregating the input
+    // tensors. Takes the `input` tensor, the current GPU's rank, and the
+    // total number of ranks `rank_num`.
+    ModelTensorRef all_reduce(ModelTensorRef input, int rank, int rank_num,
+                              ModelTensorRef output = nullptr,
+                              const std::string &name = "");
+    // Performs an all-gather operator across all ranks, aggregating the
+    // input tensors. Takes the `input` tensor, the current GPU's rank, and
+    // the total number of ranks `rank_num`. Returns a vector of tensors,
+    // each containing the aggregated data from all ranks.
+    std::vector<ModelTensorRef> all_gather(
+        ModelTensorRef input, int rank, int rank_num,
+        const std::vector<ModelTensorRef> &output = {},
+        const std::string &name = "");
+    /// Embedding layer.
+    ModelTensorRef embedding(ModelTensorRef input, ModelTensorRef weight,
+                             ModelTensorRef output = nullptr,
+                             const std::string &name = "");
+    /// Tensor type casting.
+    ModelTensorRef cast(ModelTensorRef input, ModelDataType data_type,
+                        ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+
+    // Synchronizes across multiple devices.
+    ModelTensorRef device_sync(ModelTensorRef input, int npeers,
+                               const std::string &name = "");
+
+    // Local reduce-scatter.
+    ModelTensorRef local_reduce_scatter(ModelTensorRef input, int gpu_id,
+                                        int ngpus_per_node,
+                                        const std::string &name = "");
+
+    // Local all-gather.
+    ModelTensorRef local_all_gather(ModelTensorRef input, int gpu_id,
+                                    int ngpus_per_node, int axis = 0,
+                                    const std::string &name = "");
+    // Reads data from remote peers and reduces it into the current buffer.
+    ModelTensorRef read_and_reduce(ModelTensorRef input, int sid, int npeers,
+                                   size_t offset, size_t bytes,
+                                   const std::string &name = "");
+    // Gathers tiles from peers.
+    ModelTensorRef gather_from_peers(ModelTensorRef input, ModelTensorRef tile,
+                                     int sid, int npeers, size_t chunkBytes,
+                                     const std::string &name = "");
+
+    ModelTensorRef local_all_reduce(ModelTensorRef input, int gpu_id,
+                                    int gpu_num, const std::string &name = "");
+    ModelTensorRef local_all_reduce_packet(ModelTensorRef input, int gpu_id,
+                                           int gpu_num,
+                                           const std::string &name = "");
+
+    ModelTensorRef reduce_and_write_packet(
+        ModelTensorRef input, ModelTensorRef scratch, ModelTensorRef output,
+        const std::vector<ModelTensorRef> &remote_peer_bufs, int id, int rank,
+        int npeers, size_t elems_per_rank, size_t scratch_offset,
+        size_t remote_dst_offset, int flag, const std::string &name = "");
+    ModelTensorRef get_packet(ModelTensorRef input, ModelTensorRef output,
+                              size_t src_offset, size_t dst_offset,
+                              size_t npackets, int flag,
+                              const std::string &name = "");
+};
+
+}  // namespace ark
+
+#endif  // ARK_MODEL_HPP
diff --git a/ark/include/ark/model_graph.hpp b/ark/include/ark/model_graph.hpp
new file mode 100644
index 000000000..21ee4c328
--- /dev/null
+++ b/ark/include/ark/model_graph.hpp
@@ -0,0 +1,53 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_MODEL_GRAPH_HPP
+#define ARK_MODEL_GRAPH_HPP
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "model_ref.hpp"
+
+namespace ark {
+
+class ModelGraph {
+   public:
+    ModelGraph();
+
+    ModelGraph(const ModelGraph &other);
+
+    ~ModelGraph();
+
+    ModelGraph &operator=(const ModelGraph &other);
+
+    /// Break a @ref ModelNode into two @ref ModelNode objects.
+    ///
+    /// The original node will have the first @p op_idx ops, and the new node
+    /// will have the rest.
+    ///
+    /// @param node The @ref ModelNode to break.
+    /// @param op_idx The index of the first op in the new @ref ModelNode.
+    /// @return The new @ref ModelNode.
+    ModelNodeRef break_node(ModelNodeRef node, size_t op_idx);
+
+    void compress_nodes();
+
+    bool verify() const;
+
+    std::string serialize(int indent = -1) const;
+
+    /// Get the list of @ref ModelNode in the graph.
+    std::vector<ModelNodeRef> nodes() const;
+
+   protected:
+    friend class Model;
+
+    class Impl;
+    std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace ark
+
+#endif  // ARK_MODEL_GRAPH_HPP
diff --git a/ark/include/ark/model_ref.hpp b/ark/include/ark/model_ref.hpp
new file mode 100644
index 000000000..594a95772
--- /dev/null
+++ b/ark/include/ark/model_ref.hpp
@@ -0,0 +1,25 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_MODEL_REF_HPP
+#define ARK_MODEL_REF_HPP
+
+#include <memory>
+
+namespace ark {
+
+class ModelOp;
+using ModelOpRef = std::shared_ptr<ModelOp>;
+
+class ModelBuffer;
+using ModelBufferRef = std::shared_ptr<ModelBuffer>;
+
+class ModelTensor;
+using ModelTensorRef = std::shared_ptr<ModelTensor>;
+
+class ModelNode;
+using ModelNodeRef = std::shared_ptr<ModelNode>;
+
+}  // namespace ark
+
+#endif  // ARK_MODEL_REF_HPP
diff --git a/ark/include/ark/random.hpp b/ark/include/ark/random.hpp
new file mode 100644
index 000000000..2b1a6d8a8
--- /dev/null
+++ b/ark/include/ark/random.hpp
@@ -0,0 +1,26 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_RANDOM_HPP
+#define ARK_RANDOM_HPP
+
+#include <cstdlib>
+
+namespace ark {
+
+// Set the random seed.
+void srand(int seed = -1);
+
+// Get a random number.
+int rand();
+
+/// Generate a random value.
+template <typename T>
+T rand(float min_val, float max_val) {
+    int mid = RAND_MAX / 2;
+    return T((ark::rand() - mid) / (float)mid * (max_val - min_val) + min_val);
+}
+
+}  // namespace ark
+
+#endif  // ARK_RANDOM_HPP
diff --git a/ark/schedule/schedule.h b/ark/include/ark/schedule.hpp
similarity index 92%
rename from ark/schedule/schedule.h
rename to ark/include/ark/schedule.hpp
index 76465a3e2..0f809e54c 100644
--- a/ark/schedule/schedule.h
+++ b/ark/include/ark/schedule.hpp
@@ -1,14 +1,14 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_SCHEDULE_H_
-#define ARK_SCHEDULE_H_
+#ifndef ARK_SCHEDULE_HPP
+#define ARK_SCHEDULE_HPP
 
 #include
 #include
 #include
 
-#include "range.h"
+#include "range.hpp"
 
 namespace ark {
 
@@ -68,4 +68,4 @@ class Schedule {
 }  // namespace ark
 
-#endif  // ARK_SCHEDULE_H_
+#endif  // ARK_SCHEDULE_HPP
diff --git a/ark/include/ark/version.hpp b/ark/include/ark/version.hpp
new file mode 100644
index 000000000..e8ba54583
--- /dev/null
+++ b/ark/include/ark/version.hpp
@@ -0,0 +1,21 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_VERSION_HPP
+#define ARK_VERSION_HPP
+
+#include <string>
+
+#define ARK_MAJOR 0
+#define ARK_MINOR 5
+#define ARK_PATCH 0
+#define ARK_VERSION (ARK_MAJOR * 10000 + ARK_MINOR * 100 + ARK_PATCH)
+
+namespace ark {
+
+/// Return a version string.
+std::string version();
+
+}  // namespace ark
+
+#endif  // ARK_VERSION_HPP
diff --git a/ark/include/kernels/common/arch.h b/ark/include/kernels/common/arch.h
index e268ad78c..7eff95c7b 100644
--- a/ark/include/kernels/common/arch.h
+++ b/ark/include/kernels/common/arch.h
@@ -32,13 +32,13 @@ DEVICE int warp_id() {
 #if defined(ARK_TARGET_CUDA_ARCH)
 #define ARCH_ALIAS_FUNC(alias, cuda_func, hip_func)    \
     template <typename... Args>                        \
-    inline auto alias(Args &&... args) {               \
+    inline auto alias(Args &&...args) {                \
         return cuda_func(std::forward<Args>(args)...); \
     }
 #elif defined(ARK_TARGET_ROCM_ARCH)
 #define ARCH_ALIAS_FUNC(alias, cuda_func, hip_func)    \
     template <typename... Args>                        \
-    inline auto alias(Args &&... args) {               \
+    inline auto alias(Args &&...args) {                \
         return hip_func(std::forward<Args>(args)...);  \
     }
 #endif
diff --git a/ark/include/kernels/common/static_math.h b/ark/include/kernels/common/static_math.h
index 06a4552cf..b7093b8b7 100644
--- a/ark/include/kernels/common/static_math.h
+++ b/ark/include/kernels/common/static_math.h
@@ -150,6 +150,28 @@ static DEVICE long long int gm(long long int x) {
     return math::div<Divisor>(x) * Divisor;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+
+template <size_t Rhs>
+DEVICE bool geq(size_t x) {
+    return x >= Rhs;
+}
+
+template <>
+DEVICE bool geq<0>(size_t x) {
+    return true;
+}
+
+template <size_t Rhs>
+DEVICE bool le(size_t x) {
+    return x < Rhs;
+}
+
+template <>
+DEVICE bool le<0>(size_t x) {
+    return false;
+}
+
 }  // namespace math
 }  // namespace ark
diff --git a/ark/include/kernels/common/sync.h b/ark/include/kernels/common/sync.h
index 0d07f8ed2..85f7639c9 100644
--- a/ark/include/kernels/common/sync.h
+++ b/ark/include/kernels/common/sync.h
@@ -86,7 +86,7 @@ DEVICE void sync_warps() {
         __syncwarp();
     } else if constexpr (NumWarps == 2) {
         static_assert(
-            ARK_THREADS_PER_BLOCK <= 512,
+            ARK_WARPS_PER_BLOCK <= 16,
             "2-warp barrier is not supported for block sizes larger than 512");
         asm volatile("barrier.sync %0, 64;" ::"r"((threadIdx.x >> 6) + 8));
     } else if constexpr (NumWarps == 4) {
diff --git a/ark/include/kernels/common/vector_type.h b/ark/include/kernels/common/vector_type.h
index e86a50145..379c79db1 100644
--- a/ark/include/kernels/common/vector_type.h
+++ b/ark/include/kernels/common/vector_type.h
@@ -77,8 +77,8 @@ struct IntrinsicCompute1Exists {
     template
     static auto test(...) -> std::false_type;
 
-    static constexpr bool value = decltype(
-        test(type::Constant::zero()))::value;
+    static constexpr bool value = decltype(test(
+        type::Constant::zero()))::value;
 };
 
 template
@@ -90,9 +90,9 @@ struct IntrinsicCompute2Exists {
     template
     static auto test(...) -> std::false_type;
 
-    static constexpr bool value = decltype(
-        test(type::Constant::zero(),
-             type::Constant::zero()))::value;
+    static constexpr bool value = decltype(test(
+        type::Constant::zero(),
+        type::Constant::zero()))::value;
 };
 
 template
@@ -197,11 +197,10 @@ struct DefaultNelemPerThread {
                                           : UnitOutDims::W;
 
     static const int value =
-        (sizeof(OutDataType) <= 2 && ConsecutiveDimLen % 8 == 0)
-            ? 8
-            : (ConsecutiveDimLen % 4 == 0)
-                  ? 4
-                  : (ConsecutiveDimLen % 2 == 0) ? 2 : 1;
+        (sizeof(OutDataType) <= 2 && ConsecutiveDimLen % 8 == 0) ? 8
+        : (ConsecutiveDimLen % 4 == 0)                           ? 4
+        : (ConsecutiveDimLen % 2 == 0)                           ? 2
+                                                                 : 1;
 };
diff --git a/ark/include/kernels/gemm_ck.h b/ark/include/kernels/gemm_ck.h
index 4054f2d37..05a6a23dc 100644
--- a/ark/include/kernels/gemm_ck.h
+++ b/ark/include/kernels/gemm_ck.h
@@ -90,13 +90,15 @@ struct CkGemmConfig::value;
 
     static constexpr auto MXdlPerWave =
         (TileSizeM == 16) ? 1
-                          : (TileSizeM < TileSizeN)
-                                ? 1 << (LogMNXdlPerWave / 2)
-                                : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2);
+        : (TileSizeM < TileSizeN)
+            ? 1 << (LogMNXdlPerWave / 2)
+            : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2);
     static constexpr auto NXdlPerWave = MNXdlPerWave / MXdlPerWave;
 
     static constexpr bool Is_256x256x128 =
@@ -197,13 +199,15 @@ struct CkGemmConfig
         , typename std::conditional, S<1, 0, 2>>::type,
         typename std::conditional, S<1, 0, 2>>::type,
-        (IsColA ? 1 : 2), (!IsColA ? 8 : Is_128x128x64 ? 4 : MXdlPerWave), 8,
-        true, S<4, NumThreads / 4, 1>,
+        (IsColA ? 1 : 2),
+        (!IsColA ? 8
+         : Is_128x128x64 ? 
4 + : MXdlPerWave), + 8, true, S<4, NumThreads / 4, 1>, typename std::conditional, S<0, 2, 1>>::type, typename std::conditional, S<0, 2, 1>>::type, (IsColB ? 2 : 1), - (IsColB ? 8 - : Is_128x32x256 - ? 8 - : (Is_128x32x128 || Is_128x64x128 || Is_128x128x128) - ? 4 - : (Is_128x32x64 || Is_64x32x32) ? 2 : NXdlPerWave), + (IsColB ? 8 + : Is_128x32x256 ? 8 + : (Is_128x32x128 || Is_128x64x128 || Is_128x128x128) ? 4 + : (Is_128x32x64 || Is_64x32x32) ? 2 + : NXdlPerWave), 8, true, 7, 1, 1, LoopSched, PipelineVer>; using ImplXdlCShuffle = @@ -234,16 +240,17 @@ struct CkGemmConfig, S<1, 0, 2>>::type, typename std::conditional, S<1, 0, 2>>::type, (IsColA ? 1 : 2), - (!IsColA ? 8 : (AK1 == 2 || Is_128x128x64) ? 4 : MXdlPerWave), AK1, - (AK1 == 8), S, + (!IsColA ? 8 + : (AK1 == 2 || Is_128x128x64) ? 4 + : MXdlPerWave), + AK1, (AK1 == 8), S, typename std::conditional, S<0, 2, 1>>::type, typename std::conditional, S<0, 2, 1>>::type, (IsColB ? 2 : 1), (IsColB ? 8 - : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || - Is_128x64x128) - ? 4 - : NXdlPerWave), + : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || Is_128x64x128) + ? 4 + : NXdlPerWave), BK1, (BK1 == 8), 1, 1, S<1, (Is_128x128x128 || Is_128x64x128 || Is_128x32x128 || @@ -255,16 +262,17 @@ struct CkGemmConfig; #if (DEBUG_CK != 0) - PrintDeviceGemmXdlCShuffle< - NumThreads, TileSizeM, TileSizeN, 32, AK1, BK1, 32, 32, MXdlPerWave, - NXdlPerWave, - (!IsColA ? 8 : (AK1 == 2 || Is_128x128x64) ? 4 : MXdlPerWave), - (IsColB - ? 8 - : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || Is_128x64x128) - ? 4 - : NXdlPerWave), - 1, 1> + PrintDeviceGemmXdlCShuffle p; #endif // (DEBUG_CK != 0) }; @@ -286,9 +294,9 @@ struct CkGemmConfig::value; static constexpr auto MXdlPerWave = (TileSizeM == 16) ? 1 - : (TileSizeM < TileSizeN) - ? 1 << (LogMNXdlPerWave / 2) - : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2); + : (TileSizeM < TileSizeN) + ? 1 << (LogMNXdlPerWave / 2) + : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2); static constexpr auto NXdlPerWave = MNXdlPerWave / MXdlPerWave; static constexpr bool Is_256x256x128 = @@ -307,7 +315,8 @@ struct CkGemmConfig, S<1, 0, 2>>::type, typename std::conditional, S<1, 0, 2>>::type, (IsColA ? 1 : 2), - (!IsColA ? 8 : (AK1 == 2 || Is_128x128x64) ? 4 : MXdlPerWave), AK1, - (AK1 == 8), S, + (!IsColA ? 8 + : (AK1 == 2 || Is_128x128x64) ? 4 + : MXdlPerWave), + AK1, (AK1 == 8), S, typename std::conditional, S<0, 2, 1>>::type, typename std::conditional, S<0, 2, 1>>::type, (IsColB ? 2 : 1), (IsColB ? 8 - : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || - Is_128x64x128) - ? 4 - : NXdlPerWave), + : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || Is_128x64x128) + ? 4 + : NXdlPerWave), BK1, (BK1 == 8), 1, 1, S<1, (Is_128x128x128 || Is_128x64x128 || Is_128x32x128 || diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in new file mode 100644 index 000000000..13e1751fe --- /dev/null +++ b/ark/include/kernels/kernel_template.in @@ -0,0 +1,65 @@ +// THIS KERNEL IS MACHINE-GENERATED BY ARK. 
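+//
+// The @-delimited placeholders (@NUM_WARPS_PER_BLOCK@, @NUM_BLOCKS@, @NAME@,
+// @DEFINITIONS@, and @BODY@) are substituted by the code generator. task_seq()
+// below runs on the thread blocks whose indices lie in [ProcBegin, ProcEnd)
+// with stride ProcStep; each such block walks its share of task IDs and
+// invokes the generated task function for each of them.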
+#define ARK_WARPS_PER_BLOCK @NUM_WARPS_PER_BLOCK@
+#include "ark_kernels.h"
+using namespace ark;
+
+template <size_t ProcBegin, size_t ProcEnd, size_t ProcStep,
+          size_t ProcCurrent, size_t TaskBegin, size_t TaskEnd,
+          size_t TaskStep, size_t TaskGranularity, size_t NumSlots,
+          size_t SlotNumWarps, size_t SlotSramBytes,
+          void (*task)(char *, size_t, size_t)>
+__forceinline__ __device__ void task_seq(char *_buf) {
+    if (math::geq<ProcBegin>(blockIdx.x) && math::le<ProcEnd>(blockIdx.x) &&
+        ((blockIdx.x - ProcBegin) % ProcStep == 0)) {
+        constexpr size_t SlotNumThreads = SlotNumWarps * Arch::ThreadsPerWarp;
+        constexpr size_t NumProcs = (ProcEnd - ProcBegin + ProcStep - 1) / ProcStep;
+        constexpr size_t SramBytesPerWarp = SlotSramBytes / SlotNumWarps;
+        size_t p = ((blockIdx.x + gridDim.x - ProcCurrent) % gridDim.x) / ProcStep;
+        size_t k = threadIdx.x / SlotNumThreads;
+        size_t task_id_base = TaskBegin + p * TaskStep * TaskGranularity;
+        for (size_t t = k; ; t += NumSlots) {
+            size_t task_id = task_id_base + TaskStep *
+                (t % TaskGranularity + t / TaskGranularity * TaskGranularity * NumProcs);
+            if (task_id >= TaskEnd) break;
+            task(_buf, task_id, SramBytesPerWarp);
+        }
+        __syncthreads();
+    }
+}
+
+__device__ int ARK_ITER = 0;
+__device__ sync::State ARK_LOOP_SYNC_STATE;
+__device__ char *ARK_BUF;
+
+@DEFINITIONS@
+
+__device__ void ark_loop_body(char *_buf, int _iter) {
+@BODY@
+}
+
+extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1)
+void @NAME@(int *_iter) {
+    char *_buf = ARK_BUF;
+    int *shared_mem = (int *)_ARK_SMEM;
+    for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) {
+        shared_mem[i] = 0;
+    }
+    for (;;) {
+        if (threadIdx.x == 0 && blockIdx.x == 0) {
+            int iter;
+            while ((iter = atomicLoadRelaxed(_iter)) == 0) {}
+            ARK_ITER = iter;
+        }
+        sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
+        if (ARK_ITER < 0) return;
+
+        ark_loop_body(_buf, 0);
+        for (int _i = 1; _i < ARK_ITER; ++_i) {
+            sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
+            ark_loop_body(_buf, _i);
+        }
+        if (threadIdx.x == 0 && blockIdx.x == 0) {
+            atomicStoreRelaxed(_iter, 0);
+        }
+        sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
+    }
+}
diff --git a/ark/ark.cc b/ark/init.cpp
similarity index 91%
rename from ark/ark.cc
rename to ark/init.cpp
index 2392dcc83..0e6adcc4f 100644
--- a/ark/ark.cc
+++ b/ark/init.cpp
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "include/ark.h"
+#include "ark/init.hpp"
 
 #include
 #include
 
@@ -15,12 +15,6 @@
 
 namespace ark {
 
-std::string version() {
-    std::stringstream ss;
-    ss << ARK_MAJOR << "." << ARK_MINOR << "." << ARK_PATCH;
-    return ss.str();
-}
-
 void init() {
     LOG(DEBUG, "init ark");
 
diff --git a/ark/ark_test.cc b/ark/init_test.cpp
similarity index 50%
rename from ark/ark_test.cc
rename to ark/init_test.cpp
index 66dd6e8c2..5dede7138 100644
--- a/ark/ark_test.cc
+++ b/ark/init_test.cpp
@@ -1,40 +1,28 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "include/ark.h"
+#include "ark/init.hpp"
 
 #include "file_io.h"
 #include "unittest/unittest_utils.h"
 
-ark::unittest::State test_ark_version() {
-    auto version = ark::version();
-
-    // Check if the version string is in the correct format.
- auto dot1 = version.find('.'); - auto dot2 = version.find('.', dot1 + 1); - UNITTEST_NE(dot1, std::string::npos); - UNITTEST_NE(dot2, std::string::npos); - - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_ark_init() { +ark::unittest::State test_init() { // invalid tmp directory ::setenv("ARK_TMP", "", 1); UNITTEST_THROW(ark::init(), ark::SystemError); // create a tmp directory - ::setenv("ARK_TMP", "/tmp/ark/.test_ark_init", 1); + ::setenv("ARK_TMP", "/tmp/ark/.test_init", 1); ::setenv("ARK_KEEP_TMP", "1", 1); ark::init(); // create a tmp file - ark::write_file("/tmp/ark/.test_ark_init/test", "test"); + ark::write_file("/tmp/ark/.test_init/test", "test"); // clear the tmp directory ::setenv("ARK_KEEP_TMP", "0", 1); ark::init(); - UNITTEST_TRUE(!ark::is_exist("/tmp/ark/.test_ark_init/test")); + UNITTEST_TRUE(!ark::is_exist("/tmp/ark/.test_init/test")); // given tmp directory is not a directory ::setenv("ARK_TMP", "/dev/null", 1); @@ -44,7 +32,6 @@ ark::unittest::State test_ark_init() { } int main() { - UNITTEST(test_ark_version); - UNITTEST(test_ark_init); + UNITTEST(test_init); return 0; } diff --git a/ark/ipc/ipc_coll.cc b/ark/ipc/ipc_coll.cpp similarity index 100% rename from ark/ipc/ipc_coll.cc rename to ark/ipc/ipc_coll.cpp diff --git a/ark/ipc/ipc_coll_test.cc b/ark/ipc/ipc_coll_test.cpp similarity index 96% rename from ark/ipc/ipc_coll_test.cc rename to ark/ipc/ipc_coll_test.cpp index 17ae39326..76adcedec 100644 --- a/ark/ipc/ipc_coll_test.cc +++ b/ark/ipc/ipc_coll_test.cpp @@ -3,7 +3,6 @@ #include "ipc/ipc_coll.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_ipc_coll_allgather() { @@ -31,7 +30,6 @@ ark::unittest::State test_ipc_coll_allgather() { } int main() { - ark::init(); UNITTEST(test_ipc_coll_allgather); return 0; } diff --git a/ark/ipc/ipc_hosts.cc b/ark/ipc/ipc_hosts.cpp similarity index 98% rename from ark/ipc/ipc_hosts.cc rename to ark/ipc/ipc_hosts.cpp index b41c315b0..a0a8bc417 100644 --- a/ark/ipc/ipc_hosts.cc +++ b/ark/ipc/ipc_hosts.cpp @@ -11,7 +11,6 @@ #include "env.h" #include "file_io.h" -#include "include/ark.h" #include "logging.h" namespace ark { diff --git a/ark/ipc/ipc_hosts_test.cc b/ark/ipc/ipc_hosts_test.cpp similarity index 97% rename from ark/ipc/ipc_hosts_test.cc rename to ark/ipc/ipc_hosts_test.cpp index 326f1c74b..410032852 100644 --- a/ark/ipc/ipc_hosts_test.cc +++ b/ark/ipc/ipc_hosts_test.cpp @@ -5,7 +5,6 @@ #include "env.h" #include "file_io.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_ipc_hosts() { @@ -42,7 +41,6 @@ ark::unittest::State test_ipc_hosts_unknown_host() { } int main() { - ark::init(); UNITTEST(test_ipc_hosts); UNITTEST(test_ipc_hosts_unknown_host); return 0; diff --git a/ark/ipc/ipc_lock.cc b/ark/ipc/ipc_lock.cpp similarity index 98% rename from ark/ipc/ipc_lock.cc rename to ark/ipc/ipc_lock.cpp index 50381a2e7..d66e91a61 100644 --- a/ark/ipc/ipc_lock.cc +++ b/ark/ipc/ipc_lock.cpp @@ -5,7 +5,6 @@ #include -#include "include/ark.h" #include "logging.h" namespace ark { diff --git a/ark/ipc/ipc_mem.cc b/ark/ipc/ipc_mem.cpp similarity index 99% rename from ark/ipc/ipc_mem.cc rename to ark/ipc/ipc_mem.cpp index babb4df4a..52bd033b1 100644 --- a/ark/ipc/ipc_mem.cc +++ b/ark/ipc/ipc_mem.cpp @@ -13,7 +13,6 @@ #include "cpu_timer.h" #include "env.h" -#include "include/ark.h" #include "ipc/ipc_shm.h" #include "logging.h" diff --git a/ark/ipc/ipc_mem_test.cc b/ark/ipc/ipc_mem_test.cpp similarity index 99% rename from 
ark/ipc/ipc_mem_test.cc rename to ark/ipc/ipc_mem_test.cpp index c25f90f88..e151b4d2f 100644 --- a/ark/ipc/ipc_mem_test.cc +++ b/ark/ipc/ipc_mem_test.cpp @@ -3,7 +3,6 @@ #include "ipc/ipc_mem.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_ipc_mem_lock_simple() { @@ -153,7 +152,6 @@ ark::unittest::State test_ipc_mem_realloc() { } int main() { - ark::init(); UNITTEST(test_ipc_mem_lock_simple); UNITTEST(test_ipc_mem_lock_many); UNITTEST(test_ipc_mem_finishing); diff --git a/ark/ipc/ipc_shm.cc b/ark/ipc/ipc_shm.cpp similarity index 99% rename from ark/ipc/ipc_shm.cc rename to ark/ipc/ipc_shm.cpp index d00075914..fa8641f74 100644 --- a/ark/ipc/ipc_shm.cc +++ b/ark/ipc/ipc_shm.cpp @@ -16,7 +16,6 @@ #include #include -#include "include/ark.h" #include "logging.h" #define SHM_DIR "/dev/shm/" diff --git a/ark/ipc/ipc_socket.cc b/ark/ipc/ipc_socket.cpp similarity index 99% rename from ark/ipc/ipc_socket.cc rename to ark/ipc/ipc_socket.cpp index 43f64d9b3..6bb0a7b98 100644 --- a/ark/ipc/ipc_socket.cc +++ b/ark/ipc/ipc_socket.cpp @@ -11,7 +11,6 @@ #include -#include "include/ark.h" #include "logging.h" #define MAX_LISTEN_LEN 4096 diff --git a/ark/ipc/ipc_socket_test.cc b/ark/ipc/ipc_socket_test.cpp similarity index 98% rename from ark/ipc/ipc_socket_test.cc rename to ark/ipc/ipc_socket_test.cpp index 26257a235..dd2dc8119 100644 --- a/ark/ipc/ipc_socket_test.cc +++ b/ark/ipc/ipc_socket_test.cpp @@ -4,7 +4,6 @@ #include "ipc/ipc_socket.h" #include "env.h" -#include "include/ark.h" #include "ipc/ipc_hosts.h" #include "logging.h" #include "unittest/unittest_utils.h" @@ -124,7 +123,6 @@ ark::unittest::State test_ipc_socket_no_item() { } int main() { - ark::init(); UNITTEST(test_ipc_socket_simple); UNITTEST(test_ipc_socket_no_item); return ark::unittest::SUCCESS; diff --git a/ark/logging.cc b/ark/logging.cpp similarity index 100% rename from ark/logging.cc rename to ark/logging.cpp diff --git a/ark/logging.h b/ark/logging.h index d84795315..d29793ff7 100644 --- a/ark/logging.h +++ b/ark/logging.h @@ -8,6 +8,8 @@ #include #include +#include "error.hpp" + namespace ark { typedef enum { DEBUG, INFO, WARN, ERROR } LogLevel; diff --git a/ark/math_utils.cc b/ark/math_utils.cpp similarity index 98% rename from ark/math_utils.cc rename to ark/math_utils.cpp index 2f49d870a..3efa2d6a1 100644 --- a/ark/math_utils.cc +++ b/ark/math_utils.cpp @@ -3,7 +3,6 @@ #include "math_utils.h" -#include "include/ark.h" #include "logging.h" namespace ark { diff --git a/ark/math_utils.h b/ark/math_utils.h index cf1759a39..1780876da 100644 --- a/ark/math_utils.h +++ b/ark/math_utils.h @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-#ifndef ARK_MATH_H_ -#define ARK_MATH_H_ +#ifndef ARK_MATH_UTILS_H_ +#define ARK_MATH_UTILS_H_ #include @@ -19,4 +19,4 @@ size_t lcm(size_t a, size_t b); } // namespace math } // namespace ark -#endif // ARK_MATH_H_ +#endif // ARK_MATH_UTILS_H_ diff --git a/ark/math_utils_test.cc b/ark/math_utils_test.cpp similarity index 98% rename from ark/math_utils_test.cc rename to ark/math_utils_test.cpp index d56b6c0d6..21c9f47a8 100644 --- a/ark/math_utils_test.cc +++ b/ark/math_utils_test.cpp @@ -3,7 +3,6 @@ #include "math_utils.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_math() { @@ -102,7 +101,6 @@ ark::unittest::State test_math() { } int main() { - ark::init(); UNITTEST(test_math); return 0; } diff --git a/ark/model.cc b/ark/model.cc index 52c17d016..645b4ded4 100644 --- a/ark/model.cc +++ b/ark/model.cc @@ -19,19 +19,6 @@ namespace ark { -bool operator==(const OpArgType &lhs, const OpArgType &rhs) { - return lhs.id == rhs.id; -} - -bool operator!=(const OpArgType &lhs, const OpArgType &rhs) { - return !(lhs == rhs); -} - -std::ostream &operator<<(std::ostream &os, const OpArgType &type) { - os << type.name; - return os; -} - OpArchType op_arch_from_string(const std::string &arch) { if (arch == "cuda_60") { return OP_ARCH_CUDA_60; @@ -109,216 +96,14 @@ const std::vector &OpConfigMap::get(const OpConfigKey &key) const { return NoneConfigs; } -OpArg::OpArg(int arg) : type{OP_ARG_INT}, val{new int{arg}} { - assert(this->val != nullptr); -} -OpArg::OpArg(DimType arg) : type{OP_ARG_INT64}, val{new DimType{arg}} { - assert(this->val != nullptr); -} -OpArg::OpArg(uint64_t arg) : type{OP_ARG_UINT64}, val{new uint64_t{arg}} { - assert(this->val != nullptr); -} -OpArg::OpArg(bool arg) : type{OP_ARG_BOOL}, val{new bool{arg}} { - assert(this->val != nullptr); -} -OpArg::OpArg(float arg) : type{OP_ARG_FLOAT}, val{new float{arg}} { - assert(this->val != nullptr); -} -OpArg::OpArg(const Dims &arg) : type{OP_ARG_DIMS}, val{new Dims{arg}} { - assert(this->val != nullptr); -} -OpArg::OpArg(Tensor *arg) : type{OP_ARG_TENSOR}, val{arg} { - assert(this->val != nullptr); -} -OpArg::OpArg(const OpArg &arg) : type{arg.type} { - if (this->type == OP_ARG_INT) { - this->val = new int{*(int *)arg.val}; - } else if (this->type == OP_ARG_INT64) { - this->val = new DimType{*(DimType *)arg.val}; - } else if (this->type == OP_ARG_UINT64) { - this->val = new uint64_t{*(uint64_t *)arg.val}; - } else if (this->type == OP_ARG_BOOL) { - this->val = new bool{*(bool *)arg.val}; - } else if (this->type == OP_ARG_FLOAT) { - this->val = new float{*(float *)arg.val}; - } else if (this->type == OP_ARG_DIMS) { - this->val = new Dims{*(Dims *)arg.val}; - } else if (this->type == OP_ARG_TENSOR) { - this->val = arg.val; - } else { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } -} -OpArg::~OpArg() { - if (this->type == OP_ARG_INT) { - delete static_cast(this->val); - } else if (this->type == OP_ARG_INT64) { - delete static_cast(this->val); - } else if (this->type == OP_ARG_UINT64) { - delete static_cast(this->val); - } else if (this->type == OP_ARG_BOOL) { - delete static_cast(this->val); - } else if (this->type == OP_ARG_FLOAT) { - delete static_cast(this->val); - } else if (this->type == OP_ARG_DIMS) { - delete static_cast(this->val); - } else if (this->type == OP_ARG_TENSOR) { - // Do nothing - } -} -void OpArg::get(int *arg) const { - if (this->type != OP_ARG_INT) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = 
*static_cast(this->val); -} - -void OpArg::get(long long int *arg) const { - if (this->type != OP_ARG_INT64) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = *static_cast(this->val); -} - -void OpArg::get(uint64_t *arg) const { - if (this->type != OP_ARG_UINT64) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = *static_cast(this->val); -} - -void OpArg::get(bool *arg) const { - if (this->type != OP_ARG_BOOL) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = *static_cast(this->val); -} - -void OpArg::get(float *arg) const { - if (this->type != OP_ARG_FLOAT) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = *static_cast(this->val); -} - -void OpArg::get(Dims *arg) const { - if (this->type != OP_ARG_DIMS) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = *static_cast(this->val); -} - -void OpArg::get(Tensor **arg) const { - if (this->type != OP_ARG_TENSOR) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = static_cast(this->val); -} - -OpArgs::OpArgs(const std::vector &args) : args{args} {} - -OpArgs &OpArgs::operator=(const OpArgs &opargs) { - if (this != &opargs) { - this->args = opargs.args; - } - return *this; -} - -void OpArgs::put(const OpArg &arg) { this->args.emplace_back(arg); } - -void OpArgs::get(int *arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_INT) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = *static_cast(this->args[idx].val); -} - -void OpArgs::get(long long int *arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_INT64) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = *static_cast(this->args[idx].val); -} - -void OpArgs::get(uint64_t *arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_UINT64) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = *static_cast(this->args[idx].val); -} - -void OpArgs::get(bool *arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_BOOL) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = *static_cast(this->args[idx].val); -} - -void OpArgs::get(float *arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_FLOAT) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = *static_cast(this->args[idx].val); -} - -void OpArgs::get(Dims *arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_DIMS) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = *static_cast(this->args[idx].val); -} 
- -void OpArgs::get(Tensor **arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_TENSOR) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = static_cast(this->args[idx].val); -} - -const std::vector &OpArgs::get_args() const { return this->args; } - bool operator==(const OpType &lhs, const OpType &rhs) { return lhs.id == rhs.id; } Op::Op(const OpType &type_, const std::string &prec_type_, const std::vector &inputs_, - const std::vector &output_refs_, const OpArgs &args_, + const std::vector &output_refs_, + const std::map &args_, const std::string &name_, const OpConfigMap *cfg_map_, int gran_lev_, bool force_inline_) : type{type_}, @@ -422,7 +207,7 @@ std::string Op::function_name(const OpConfig &cfg) const { return ""; } -OpArgs Op::function_call_args(const OpConfig &cfg) const { +std::vector Op::function_call_args(const OpConfig &cfg) const { if (this->type.id == OP_SCALE.id) { return static_cast(this)->function_call_args(cfg); } else if (this->type.id == OP_SEND.id) { @@ -448,11 +233,11 @@ OpArgs Op::function_call_args(const OpConfig &cfg) const { return static_cast(this)->function_call_args( cfg); } else { - OpArgs opargs; + std::vector opargs; std::vector deps = this->outputs; deps.insert(deps.end(), this->inputs.begin(), this->inputs.end()); for (Tensor *tns : deps) { - opargs.put(tns); + opargs.emplace_back(ModelOpArg(tns)); } return opargs; } @@ -461,38 +246,39 @@ OpArgs Op::function_call_args(const OpConfig &cfg) const { } std::string Op::function_name(const std::string &kernel_name, - const OpArgs &template_args) { + const std::vector &template_args) { std::stringstream ss; ss << kernel_name; - size_t num_args = template_args.args.size(); + size_t num_args = template_args.size(); if (num_args == 0) { return ss.str(); } ss << "<"; for (size_t i = 0; i < num_args; ++i) { - auto &arg = template_args.args[i]; - if (arg.type == OP_ARG_INT) { + auto &arg = template_args[i]; + if (arg.type_name() == "INT") { int val; - template_args.get(&val, i); + arg.get_value(&val); ss << val; - } else if (arg.type == OP_ARG_INT64) { + } else if (arg.type_name() == "INT64") { long long int val; - template_args.get(&val, i); + arg.get_value(&val); ss << val; - } else if (arg.type == OP_ARG_UINT64) { + } else if (arg.type_name() == "UINT64") { uint64_t val; - template_args.get(&val, i); + arg.get_value(&val); ss << val; - } else if (arg.type == OP_ARG_BOOL) { + } else if (arg.type_name() == "BOOL") { bool val; - template_args.get(&val, i); + arg.get_value(&val); ss << (val ? 
"true" : "false"); - } else if (arg.type == OP_ARG_FLOAT) { + } else if (arg.type_name() == "FLOAT") { ERR(ModelError, "float template args are not supported"); - } else if (arg.type == OP_ARG_DIMS) { + } else if (arg.type_name() == "DIMS") { Dims val; - template_args.get(&val, i); + arg.get_value(&val); ss << "ark::Vec" << val; + } else { } if (i < num_args - 1) { ss << ", "; @@ -542,8 +328,8 @@ void Model::Impl::destroy_tensor_buf(const TensorBuf *buf) { std::vector Model::Impl::add_op( const OpType type, const std::string &prec_type, const std::vector &inputs, const std::vector &outputs, - const OpArgs &args, const std::string &name, const OpConfigMap *cfg_map, - int gran_lev) { + const std::map &args, const std::string &name, + const OpConfigMap *cfg_map, int gran_lev) { Op op{type, prec_type, inputs, outputs, args, name, cfg_map, gran_lev}; return this->add_op(op); } @@ -1163,6 +949,17 @@ bool Model::Impl::depends_on(OpNode *node1, OpNode *node2) const { return false; } +nlohmann::json to_json(const Dims &dims) { + if (dims.is_invalid()) { + ERR(InvalidUsageError, "invalid dims given"); + } + nlohmann::json j; + for (auto i = 0; i < dims.ndims(); ++i) { + j[i] = dims[i]; + } + return j; +} + nlohmann::json to_json(const TensorBuf &tensor_buf) { nlohmann::json j; j["Id"] = tensor_buf.id; @@ -1173,51 +970,54 @@ nlohmann::json to_json(const TensorBuf &tensor_buf) { nlohmann::json to_json(const Tensor &tensor) { nlohmann::json j; j["Id"] = tensor.id; - j["TensorBuf"] = to_json(*(tensor.buf)); + j["BufferId"] = tensor.buf->id; j["TensorType"] = tensor.type.type_str(); - j["Shape"] = tensor.shape.serialize(); - j["Strides"] = tensor.ldims.serialize(); - j["Offsets"] = tensor.offs.serialize(); + j["Shape"] = to_json(tensor.shape); + if (tensor.shape != tensor.ldims) { + j["Strides"] = to_json(tensor.ldims); + } + if (!tensor.offs.is_zeros()) { + j["Offsets"] = to_json(tensor.offs); + } if (tensor.imported_rank >= 0) { j["ImportedRank"] = tensor.imported_rank; } return j; } -nlohmann::json to_json(const OpArg &op_arg) { +nlohmann::json to_json(const ModelOpArgT &op_arg) { nlohmann::json j; - j["Type"] = op_arg.type.name; - if (op_arg.type == OP_ARG_TENSOR) { + auto type_name = op_arg.type_name(); + if (type_name == "TENSOR") { Tensor *tns; - op_arg.get(&tns); - j["Value"] = tns->id; - } else if (op_arg.type == OP_ARG_FLOAT) { + op_arg.get_value(&tns); + j[type_name] = tns->id; + } else if (type_name == "FLOAT") { float val; - op_arg.get(&val); - j["Value"] = val; - } else if (op_arg.type == OP_ARG_INT) { + op_arg.get_value(&val); + j[type_name] = val; + } else if (type_name == "INT") { int val; - op_arg.get(&val); - j["Value"] = val; - } else if (op_arg.type == OP_ARG_BOOL) { + op_arg.get_value(&val); + j[type_name] = val; + } else if (type_name == "BOOL") { bool val; - op_arg.get(&val); - j["Value"] = val; - } else if (op_arg.type == OP_ARG_INT64) { + op_arg.get_value(&val); + j[type_name] = val; + } else if (type_name == "INT64") { long long int val; - op_arg.get(&val); - j["Value"] = val; - } else if (op_arg.type == OP_ARG_UINT64) { + op_arg.get_value(&val); + j[type_name] = val; + } else if (type_name == "UINT64") { uint64_t val; - op_arg.get(&val); - j["Value"] = val; - } else if (op_arg.type == OP_ARG_DIMS) { + op_arg.get_value(&val); + j[type_name] = val; + } else if (type_name == "DIMS") { Dims dims; - op_arg.get(&dims); - j["Value"] = dims.serialize(); + op_arg.get_value(&dims); + j[type_name] = to_json(dims); } else { - throw std::runtime_error("unexpected OpArg: " + - 
std::string(op_arg.type.name)); + throw std::runtime_error("unexpected OpArg: " + type_name); } return j; } @@ -1225,22 +1025,22 @@ nlohmann::json to_json(const OpArg &op_arg) { nlohmann::json to_json(const Op &op) { nlohmann::json j; j["Type"] = op.type.name; - j["PrecisionType"] = op.prec_type; - j["InputTensors"] = nlohmann::json(); + j["Precision"] = op.prec_type; + j["InputTensorIds"] = nlohmann::json::array(); for (auto tensor : op.inputs) { - j["InputTensors"].emplace_back(to_json(*tensor)); + j["InputTensorIds"].emplace_back(tensor->id); } - j["OutputTensors"] = nlohmann::json(); + j["OutputTensorIds"] = nlohmann::json::array(); for (auto tensor : op.inputs) { - j["OutputTensors"].emplace_back(to_json(*tensor)); + j["OutputTensorIds"].emplace_back(tensor->id); } - j["OutputRefTensors"] = nlohmann::json(); + j["OutputRefTensorIds"] = nlohmann::json::array(); for (auto tensor : op.inputs) { - j["OutputRefTensors"].emplace_back(to_json(*tensor)); + j["OutputRefTensorIds"].emplace_back(tensor->id); } - j["Args"] = nlohmann::json(); - for (auto arg : op.args.get_args()) { - j["Args"].emplace_back(to_json(arg)); + j["Args"] = nlohmann::json::object(); + for (const auto &p : op.args) { + j["Args"][p.first] = to_json(p.second); } return j; } @@ -1249,15 +1049,11 @@ nlohmann::json to_json(const OpNode &node, const std::map &node2id) { nlohmann::json j; j["Id"] = node2id.at(&node); - j["Ops"] = nlohmann::json(); + j["Ops"] = nlohmann::json::array(); for (auto op : node.ops) { j["Ops"].emplace_back(to_json(*op)); } - j["ConsumerNodeIds"] = nlohmann::json(); - for (auto user : node.users) { - j["ConsumerNodeIds"].emplace_back(node2id.at(user)); - } - j["ProducerNodeIds"] = nlohmann::json(); + j["ProducerNodeIds"] = nlohmann::json::array(); for (auto producer : node.producers) { j["ProducerNodeIds"].emplace_back(node2id.at(producer)); } @@ -1271,10 +1067,18 @@ std::string Model::Impl::serialize(int indent) const { node2id[node.get()] = id++; } nlohmann::json j; - j["Nodes"] = nlohmann::json(); + j["Nodes"] = nlohmann::json::array(); for (const auto &node : this->get_nodes()) { j["Nodes"].emplace_back(to_json(*node, node2id)); } + j["TensorBufs"] = nlohmann::json::array(); + for (const auto &tbuf : this->tns_bufs_storage) { + j["TensorBufs"].emplace_back(to_json(*tbuf)); + } + j["Tensors"] = nlohmann::json::array(); + for (const auto &tns : this->tns_storage) { + j["Tensors"].emplace_back(to_json(*tns)); + } return j.dump(indent); } diff --git a/ark/model.h b/ark/model.h index f97aae4ec..63f50d848 100644 --- a/ark/model.h +++ b/ark/model.h @@ -48,7 +48,8 @@ class Model::Impl { const std::string &prec_type, const std::vector &inputs, const std::vector &output_refs, - const OpArgs &args, const std::string &name, + const std::map &args, + const std::string &name, const OpConfigMap *cfg_map, int gran_lev = -1); /// Add a new @ref Op to the model. diff --git a/ark/model/model.cpp b/ark/model/model.cpp new file mode 100644 index 000000000..e3111ad45 --- /dev/null +++ b/ark/model/model.cpp @@ -0,0 +1,14 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/model.hpp" + +namespace ark { + +Model Model::compress() const { + Model model(*this); + model.compress_nodes(); + return model; +} + +} // namespace ark diff --git a/ark/model/model_data_type.cpp b/ark/model/model_data_type.cpp new file mode 100644 index 000000000..a62873ab2 --- /dev/null +++ b/ark/model/model_data_type.cpp @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT license.
+
+#include "model_data_type.hpp"
+
+#include <map>
+
+#include "bfloat16.h"
+#include "half.h"
+#include "logging.h"
+
+namespace ark {
+
+///
+/// NOTE: how to add a new data type
+/// 1. Add an instance using the `MODEL_DATA_TYPE_INSTANCE()` macro.
+/// 2. Add a registration using the `MODEL_DATA_TYPE_REGISTER()` macro.
+/// 3. Expose the symbol in `include/ark/model.hpp`.
+///
+
+#define MODEL_DATA_TYPE_INSTANCE(_name, _type) \
+    extern const ModelDataType _name =         \
+        std::make_shared<ModelDataT>(#_name, #_type, sizeof(_type));
+
+#define MODEL_DATA_TYPE_REGISTER(_name) instances[#_name] = _name;
+
+extern const ModelDataType NONE =
+    std::make_shared<ModelDataT>("NONE", "void", 0);
+MODEL_DATA_TYPE_INSTANCE(FP32, float);
+MODEL_DATA_TYPE_INSTANCE(FP16, fp16);
+MODEL_DATA_TYPE_INSTANCE(BF16, bf16);
+MODEL_DATA_TYPE_INSTANCE(INT32, int32_t);
+MODEL_DATA_TYPE_INSTANCE(UINT32, uint32_t);
+MODEL_DATA_TYPE_INSTANCE(INT8, int8_t);
+MODEL_DATA_TYPE_INSTANCE(UINT8, uint8_t);
+MODEL_DATA_TYPE_INSTANCE(BYTE, char);
+
+const ModelDataType ModelDataT::from_name(const std::string &type_name) {
+    static std::map<std::string, ModelDataType> instances;
+    if (instances.empty()) {
+        MODEL_DATA_TYPE_REGISTER(NONE);
+        MODEL_DATA_TYPE_REGISTER(FP32);
+        MODEL_DATA_TYPE_REGISTER(FP16);
+        MODEL_DATA_TYPE_REGISTER(BF16);
+        MODEL_DATA_TYPE_REGISTER(INT32);
+        MODEL_DATA_TYPE_REGISTER(UINT32);
+        MODEL_DATA_TYPE_REGISTER(INT8);
+        MODEL_DATA_TYPE_REGISTER(UINT8);
+        MODEL_DATA_TYPE_REGISTER(BYTE);
+    }
+    auto it = instances.find(type_name);
+    if (it == instances.end()) {
+        ERR(InvalidUsageError, "Unknown data type: ", type_name);
+    }
+    return it->second;
+}
+
+}  // namespace ark
diff --git a/ark/model/model_data_type.hpp b/ark/model/model_data_type.hpp
new file mode 100644
index 000000000..c7127a34a
--- /dev/null
+++ b/ark/model/model_data_type.hpp
@@ -0,0 +1,40 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_MODEL_DATA_TYPE_HPP_
+#define ARK_MODEL_DATA_TYPE_HPP_
+
+#include <memory>
+#include <string>
+
+#include "named_type.hpp"
+
+namespace ark {
+
+class ModelDataT;
+using ModelDataType = std::shared_ptr<ModelDataT>;
+
+class ModelDataT : public NamedT {
+   public:
+    ModelDataT(const std::string &type_name, const std::string &type_str,
+               size_t bytes)
+        : NamedT(type_name), type_str_(type_str), bytes_(bytes) {}
+
+    ModelDataT(const ModelDataT &) = default;
+
+    const std::string &type_str() const { return type_str_; }
+
+    size_t bytes() const { return bytes_; }
+
+    static const ModelDataType from_name(const std::string &type_name);
+
+   private:
+    std::string type_str_;
+    size_t bytes_;
+};
+
+using ModelDataType = std::shared_ptr<ModelDataT>;
+
+}  // namespace ark
+
+#endif  // ARK_MODEL_DATA_TYPE_HPP_
diff --git a/ark/model/model_graph_impl.cpp b/ark/model/model_graph_impl.cpp
new file mode 100644
index 000000000..c97d048ea
--- /dev/null
+++ b/ark/model/model_graph_impl.cpp
@@ -0,0 +1,521 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "model_graph_impl.hpp"
+
+#include "logging.h"
+#include "model_node.hpp"
+#include "model_tensor.hpp"
+
+#define DEBUG_MODEL_GRAPH 0
+#define MODEL_GRAPH_DEBUG(...) 
\ + do { \ + if (DEBUG_MODEL_GRAPH) { \ + LOG(DEBUG, __VA_ARGS__); \ + } \ + } while (0); + +namespace ark { + +ModelGraph::Impl::Impl(const ModelGraph::Impl &other) { *this = other; } + +ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) { + std::map node_map; + nodes_.clear(); + for (const auto &node : other.nodes_) { + ModelNodeRef new_node = std::make_shared(); + new_node->ops = node->ops; + node_map.emplace(node, new_node); + nodes_.push_back(new_node); + } + for (const auto &node : other.nodes_) { + auto it = node_map.find(node); + if (it == node_map.end()) { + ERR(ModelError, "unexpected error"); + } + ModelNodeRef new_node = it->second; + for (auto &producer : node->producers) { + auto it2 = node_map.find(producer); + if (it2 == node_map.end()) { + ERR(ModelError, "unexpected error"); + } + new_node->producers.push_back(it2->second); + } + for (auto &consumer : node->consumers) { + auto it2 = node_map.find(consumer); + if (it2 == node_map.end()) { + ERR(ModelError, "unexpected error"); + } + new_node->consumers.push_back(it2->second); + } + } + op_to_node_.clear(); + for (const auto &p : other.op_to_node_) { + auto it = node_map.find(p.second); + if (it == node_map.end()) { + ERR(ModelError, "unexpected error"); + } + op_to_node_[p.first] = it->second; + } + tensor_to_producer_op_ = other.tensor_to_producer_op_; + return *this; +} + +ModelNodeRef ModelGraph::Impl::break_node(ModelNodeRef node, size_t op_idx) { + if (op_idx == 0) { + return node; + } + if (op_idx >= node->ops.size()) { + ERR(ModelError, "unexpected error: op_idx out of range"); + } + ModelNodeRef new_node = std::make_shared(); + nodes_.push_back(new_node); + new_node->ops.insert(new_node->ops.end(), node->ops.begin() + op_idx, + node->ops.end()); + for (auto &op : new_node->ops) { + op_to_node_[op] = new_node; + } + new_node->consumers = node->consumers; + new_node->producers.push_back(node); + for (auto &consumer : node->consumers) { + consumer->producers.erase(node); + consumer->producers.push_back(new_node); + } + node->ops.erase(node->ops.begin() + op_idx, node->ops.end()); + node->consumers.clear(); + node->consumers.push_back(new_node); + return new_node; +} + +void ModelGraph::Impl::compress_nodes() { + this->recursive_remove_virtual_nodes(); + this->recursive_merge_nodes(); +} + +bool ModelGraph::Impl::verify() const { + for (auto &node : nodes_) { + if (node->ops.size() == 0) { + LOG(DEBUG, "node has no ops"); + return false; + } + for (auto &op : node->ops) { + if (op_to_node_.find(op) == op_to_node_.end()) { + LOG(DEBUG, "op has not been added to the graph"); + return false; + } + if (op_to_node_.at(op) != node) { + LOG(DEBUG, "op is not in the correct node"); + return false; + } + op->verify(); + for (auto &tns : op->result_tensors()) { + if (tensor_to_producer_op_.find(tns) == + tensor_to_producer_op_.end()) { + LOG(DEBUG, "result tensor has not been produced by any op"); + return false; + } + if (tensor_to_producer_op_.at(tns) != op) { + LOG(DEBUG, "result tensor has been produced by another op"); + return false; + } + } + for (auto &tns : op->input_tensors()) { + if (tensor_to_producer_op_.find(tns) == + tensor_to_producer_op_.end()) { + LOG(DEBUG, "input tensor has not been produced by any op"); + return false; + } + } + } + for (auto &producer : node->producers) { + if (producer->consumers.find(node) == producer->consumers.end()) { + LOG(DEBUG, "producer does not have this node as consumer"); + return false; + } + } + for (auto &consumer : node->consumers) { + if 
(consumer->producers.find(node) == consumer->producers.end()) { + LOG(DEBUG, "consumer does not have this node as producer"); + return false; + } + } + } + return true; +} + +ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) { + for (auto &tns : op->input_tensors()) { + if (tensor_to_producer_op_.find(tns) == tensor_to_producer_op_.end()) { + // This tensor has not been produced by any op - assume it is a + // Tensor op. + ModelOpRef tensor_op = std::make_shared("Tensor", true); + tensor_op->result_tensors_ = {tns}; + this->add_op(tensor_op); + } + } + for (auto &tns : op->result_tensors()) { + if (tensor_to_producer_op_.find(tns) != tensor_to_producer_op_.end()) { + ERR(ModelError, "Tensor has already been produced by an op. ", + tns->serialize().dump(), "; ", + tensor_to_producer_op_.at(tns)->serialize().dump()); + } + tensor_to_producer_op_.emplace(tns, op); + } + + ModelNodeRef node = std::make_shared(); + node->ops.push_back(op); + op_to_node_[op] = node; + + for (auto &tns : op->input_tensors()) { + auto it = tensor_to_producer_op_.find(tns); + if (it == tensor_to_producer_op_.end()) { + ERR(ModelError, "Tensor has not been produced by any op. ", + tns->serialize().dump(), " ", tns.get()); + } + auto it2 = op_to_node_.find(it->second); + if (it2 == op_to_node_.end()) { + ERR(ModelError, "Op has not been added to the graph"); + } + auto producer = it2->second; + node->producers.push_back(producer); + producer->consumers.push_back(node); + } + + nodes_.push_back(node); + return node; +} + +void ModelGraph::Impl::remove_node(ModelNodeRef node) { + auto it = nodes_.find(node); + if (it == nodes_.end()) { + ERR(ModelError, "attempted to remove a node that is not in the graph"); + } + // Remove node from consumers and producers. + for (auto &consumer : node->consumers) { + consumer->producers.erase(node); + } + for (auto &producer : node->producers) { + producer->consumers.erase(node); + } + // Connect consumers and producers. 
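+    // Each former consumer inherits every producer of the removed node, so
+    // any ordering constraint that passed through this node is preserved.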
+ for (auto &consumer : node->consumers) { + for (auto &producer : node->producers) { + consumer->producers.push_back(producer); + producer->consumers.push_back(consumer); + } + } + for (auto &op : node->ops) { + auto it = op_to_node_.find(op); + if (it == op_to_node_.end()) { + ERR(ModelError, "unexpected error"); + } + if (it->second == node) { + op_to_node_.erase(it); + } + } + nodes_.erase(it); +} + +bool ModelGraph::Impl::depends_on(ModelNodeRef node1, + ModelNodeRef node2) const { + if (node1 == node2) { + return false; + } + std::set seen_nodes; + std::vector boundary_nodes; + boundary_nodes.emplace_back(node1); + while (boundary_nodes.size() > 0) { + std::vector new_boundary_nodes; + for (auto &boundary_node : boundary_nodes) { + if (boundary_node == node2) { + return true; + } + for (auto &producer : boundary_node->producers) { + if (seen_nodes.find(producer) != seen_nodes.end()) { + continue; + } + new_boundary_nodes.emplace_back(producer); + } + } + boundary_nodes = new_boundary_nodes; + } + return false; +} + +void ModelGraph::Impl::recursive_remove_virtual_nodes() { + std::vector leaf_nodes; + for (auto &node : nodes_) { + if (node->consumers.empty()) { + leaf_nodes.emplace_back(node); + } + } + UniqueList seen_nodes; + this->recursive_remove_virtual_nodes(seen_nodes, leaf_nodes); +} + +void ModelGraph::Impl::recursive_remove_virtual_nodes( + UniqueList &seen_nodes, + const std::vector &boundary_nodes) { + if (boundary_nodes.size() == 0) { + return; + } + MODEL_GRAPH_DEBUG("remove virtual nodes"); + std::vector new_boundary_nodes; + for (auto &boundary_node : boundary_nodes) { + if (boundary_node->ops.size() == 0) { + ERR(ModelError, "unexpected error: empty node"); + } else if (boundary_node->ops.size() > 1) { + ERR(ModelError, "unexpected error: multiple ops in node"); + } + MODEL_GRAPH_DEBUG(" boundary node"); + MODEL_GRAPH_DEBUG(" node: ", to_json(boundary_node).dump()); + for (auto &producer : boundary_node->producers) { + // Exception: if any consumer of the producer (rather than the + // current boundary_node) is unseen, we should not add the producer + // to the next boundary. + bool should_add = true; + for (auto &consumer : producer->consumers) { + if (consumer == boundary_node) { + continue; + } + if (!seen_nodes.contains(consumer)) { + should_add = false; + break; + } + } + if (!should_add) { + continue; + } + if (seen_nodes.contains(producer)) { + ERR(ModelError, + "circular dependency detected: ", to_json(producer).dump()); + } + MODEL_GRAPH_DEBUG(" added to next boundary: ", + to_json(producer).dump()); + new_boundary_nodes.emplace_back(producer); + } + if (boundary_node->ops[0]->is_virtual()) { + MODEL_GRAPH_DEBUG(" remove node: ", + to_json(boundary_node).dump()); + // Remove this node from the graph. 
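+            // A virtual op performs no computation, so dropping its node
+            // preserves the model's semantics; remove_node() reconnects the
+            // node's producers directly to its consumers.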
+ this->remove_node(boundary_node); + MODEL_GRAPH_DEBUG(" nodes.size() ", nodes_.size()); + } else { + seen_nodes.push_back(boundary_node); + } + } + this->recursive_remove_virtual_nodes(seen_nodes, new_boundary_nodes); +} + +void ModelGraph::Impl::recursive_merge_nodes() { + std::vector leaf_nodes; + for (auto &node : nodes_) { + if (node->consumers.empty()) { + leaf_nodes.emplace_back(node); + } + } + UniqueList seen_nodes; + this->recursive_merge_nodes(seen_nodes, leaf_nodes); +} + +void ModelGraph::Impl::recursive_merge_nodes( + UniqueList &seen_nodes, + const std::vector &boundary_nodes) { + if (boundary_nodes.size() == 0) { + return; + } + MODEL_GRAPH_DEBUG("merge ops"); + std::vector new_boundary_nodes; + for (auto &boundary_node : boundary_nodes) { + MODEL_GRAPH_DEBUG(" boundary node"); + MODEL_GRAPH_DEBUG(" node: ", to_json(boundary_node).dump()); + if (boundary_node->producers.size() == 0) { + // This node is a root. + seen_nodes.push_back(boundary_node); + MODEL_GRAPH_DEBUG(" root"); + continue; + } + // Add all producers of this node to the next boundary. + for (auto &producer : boundary_node->producers) { + // Exception: if any consumer of the producer (rather than the + // current boundary_node) is unseen, we should not add the producer + // to the next boundary. + bool should_add = true; + for (auto &consumer : producer->consumers) { + if (consumer == boundary_node) { + continue; + } + if (!seen_nodes.contains(consumer)) { + should_add = false; + break; + } + } + if (!should_add) { + continue; + } + if (seen_nodes.contains(producer)) { + ERR(ModelError, + "unexpected error: circular dependency detected"); + } + new_boundary_nodes.emplace_back(producer); + } + ModelNodeRef merge_candidate; + if (boundary_node->producers.size() > 1) { + // This node has multiple producers. We can merge only if one + // producer depends on all other producers. + for (auto &producer : boundary_node->producers) { + bool depends_on_all = true; + for (auto &other_producer : boundary_node->producers) { + if (other_producer == producer) { + continue; + } + if (!this->depends_on(producer, other_producer)) { + depends_on_all = false; + break; + } + } + if (depends_on_all) { + merge_candidate = producer; + break; + } + } + if (!merge_candidate) { + // At least one producer does not depend on others. + // Cannot merge. + seen_nodes.push_back(boundary_node); + MODEL_GRAPH_DEBUG(" multiple producers"); + continue; + } + } else { + // This node has only one producer. + merge_candidate = *(boundary_node->producers.begin()); + } + if (merge_candidate->consumers.size() == 0) { + ERR(ModelError, "unexpected error: graph is incomplete"); + } + if (merge_candidate->consumers.size() > 1) { + // The candidate has multiple consumers. We can merge only if all + // other consumers depend on the current boundary_node. + bool depends_on_one = true; + for (auto &consumer : merge_candidate->consumers) { + if (consumer == boundary_node) { + continue; + } + if (!this->depends_on(consumer, boundary_node)) { + depends_on_one = false; + break; + } + } + if (!depends_on_one) { + // At least one consumer does not depend on the boundary_node. + // Cannot merge. + seen_nodes.push_back(boundary_node); + MODEL_GRAPH_DEBUG(" multiple consumers"); + continue; + } + } + // We can merge the two nodes. + // Merge `boundary_node` into `merge_candidate`. 
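+        // The candidate's ops stay first in the merged execution order;
+        // boundary_node's ops are appended behind them before the edges
+        // are rewired below.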
+ MODEL_GRAPH_DEBUG(" merge: ", to_json(merge_candidate).dump(), " -> ", + to_json(boundary_node).dump()); + auto &ops = boundary_node->ops; + merge_candidate->ops.insert(merge_candidate->ops.end(), ops.begin(), + ops.end()); + for (auto &op : ops) { + op_to_node_[op] = merge_candidate; + } + for (auto &consumer : boundary_node->consumers) { + consumer->producers.erase(boundary_node); + consumer->producers.push_back(merge_candidate); + merge_candidate->consumers.push_back(consumer); + } + for (auto &producer : boundary_node->producers) { + if (producer == merge_candidate) { + continue; + } + producer->consumers.erase(boundary_node); + producer->consumers.push_back(merge_candidate); + merge_candidate->producers.push_back(producer); + } + merge_candidate->consumers.erase(boundary_node); + + // Remove `boundary_node` from `nodes_`. + auto it = nodes_.find(boundary_node); + if (it == nodes_.end()) { + ERR(ModelError, "unexpected error"); + } + nodes_.erase(it); + + // Since producer is already in the next boundary and boundary_node is + // merged into producer, we don't need to add anything to + // seen_nodes here. + } + this->recursive_merge_nodes(seen_nodes, new_boundary_nodes); +} + +nlohmann::ordered_json ModelGraph::Impl::to_json( + const ModelNodeRef &node) const { + nlohmann::ordered_json j; + j["Id"] = nodes_.index(node); + j["ProducerNodeIds"] = nlohmann::json::array(); + for (auto producer : node->producers) { + j["ProducerNodeIds"].emplace_back(nodes_.index(producer)); + } + j["ConsumerNodeIds"] = nlohmann::json::array(); + for (auto consumer : node->consumers) { + j["ConsumerNodeIds"].emplace_back(nodes_.index(consumer)); + } + j["Ops"] = nlohmann::json::array(); + for (auto op : node->ops) { + j["Ops"].emplace_back(op->serialize()); + } + return j; +} + +std::string ModelGraph::Impl::serialize(int indent) const { + nlohmann::ordered_json j; + j["Nodes"] = nlohmann::json::array(); + for (const auto &node : nodes_) { + j["Nodes"].emplace_back(this->to_json(node)); + } + j["Tensors"] = nlohmann::json::array(); + for (const auto &tensor_and_op : tensor_to_producer_op_) { + j["Tensors"].emplace_back(tensor_and_op.first->serialize()); + } + return j.dump(indent); +} + +std::vector ModelGraph::Impl::nodes() const { + std::vector vec; + vec.insert(vec.end(), nodes_.begin(), nodes_.end()); + return vec; +} + +ModelGraph::ModelGraph() : impl_(std::make_unique()) {} + +ModelGraph::ModelGraph(const ModelGraph &other) + : impl_(std::make_unique(*other.impl_)) {} + +ModelGraph::~ModelGraph() = default; + +ModelGraph &ModelGraph::operator=(const ModelGraph &other) { + *impl_ = *other.impl_; + return *this; +} + +ModelNodeRef ModelGraph::break_node(ModelNodeRef node, size_t op_idx) { + return impl_->break_node(node, op_idx); +} + +/// Get the list of @ref ModelNode in the graph. +std::vector ModelGraph::nodes() const { return impl_->nodes(); } + +std::string ModelGraph::serialize(int indent) const { + return impl_->serialize(indent); +} + +void ModelGraph::compress_nodes() { impl_->compress_nodes(); } + +bool ModelGraph::verify() const { return impl_->verify(); } + +} // namespace ark diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp new file mode 100644 index 000000000..0f557ce39 --- /dev/null +++ b/ark/model/model_graph_impl.hpp @@ -0,0 +1,92 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
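+//
+// Internal implementation of ModelGraph. A minimal usage sketch of the
+// public interface built on top of this class (illustrative only, mirroring
+// the cases in model_test.cpp):
+//
+//   ark::Model m;
+//   auto t0 = m.tensor({1}, ark::FP32);
+//   auto t1 = m.tensor({1}, ark::FP32);
+//   auto t2 = m.add(t0, t1);
+//   auto compressed = m.compress();  // drops virtual nodes, merges chains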
+ +#ifndef ARK_MODEL_GRAPH_IMPL_HPP_ +#define ARK_MODEL_GRAPH_IMPL_HPP_ + +#include +#include +#include +#include + +#include "ark/dims.hpp" +#include "ark/model_graph.hpp" +#include "model_op.hpp" +#include "nlohmann/json.hpp" +#include "unique_list.hpp" + +namespace ark { + +class ModelGraph::Impl { + public: + Impl(){}; + + Impl(const Impl &other); + + Impl &operator=(const Impl &other); + + template + ModelOpRef create_op(const std::string &name, Args &&...args) { + ModelOpRef op = std::make_shared(std::forward(args)...); + std::string name_copy; + if (name.empty()) { + name_copy = op->type()->type_name(); + } else { + name_copy = name; + } + size_t count = op_names_.count(name_copy); + if (count > 0) { + name_copy += "_" + std::to_string(count); + } + op_names_.insert(name_copy); + op->set_name(name_copy); + add_op(op); + return op; + } + + ModelNodeRef break_node(ModelNodeRef node, size_t op_idx); + + void compress_nodes(); + + bool verify() const; + + std::string serialize(int indent) const; + + std::vector nodes() const; + + private: + ModelNodeRef add_op(ModelOpRef op); + + void remove_node(ModelNodeRef node); + + bool depends_on(ModelNodeRef node1, ModelNodeRef node2) const; + + void recursive_remove_virtual_nodes(); + + void recursive_remove_virtual_nodes( + UniqueList &seen_nodes, + const std::vector &boundary_nodes); + + void recursive_merge_nodes(); + + void recursive_merge_nodes(UniqueList &seen_nodes, + const std::vector &boundary_nodes); + + nlohmann::ordered_json to_json(const ModelNodeRef &node) const; + + /// The list of @ref ModelNode in the graph. + UniqueList nodes_; + + /// The set of used names of @ref ModelOp. + std::multiset op_names_; + + /// The mapping from @ref ModelTensor to the @ref ModelOp that produces it. + std::map tensor_to_producer_op_; + + /// The mapping from @ref ModelOp to the @ref ModelNode that contains it. + std::map op_to_node_; +}; + +} // namespace ark + +#endif // ARK_MODEL_GRAPH_IMPL_HPP_ diff --git a/ark/model/model_node.cpp b/ark/model/model_node.cpp new file mode 100644 index 000000000..ef3a8158a --- /dev/null +++ b/ark/model/model_node.cpp @@ -0,0 +1,6 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "model_node.hpp" + +namespace ark {} // namespace ark diff --git a/ark/model/model_node.hpp b/ark/model/model_node.hpp new file mode 100644 index 000000000..7838ca120 --- /dev/null +++ b/ark/model/model_node.hpp @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_MODEL_NODE_HPP_ +#define ARK_MODEL_NODE_HPP_ + +#include +#include + +#include "ark/model_ref.hpp" +#include "unique_list.hpp" + +namespace ark { + +/// A node of @ref Model. +class ModelNode { + public: + ModelNode() = default; + + /// The list of @ref Op that this @ref ModelNode contains. Sorted in the + /// execution order. + std::vector ops; + + /// The list of @ref ModelNode that depends on this @ref ModelNode. + UniqueList consumers; + + /// The list of @ref ModelNode that this @ref ModelNode depends on. + UniqueList producers; +}; + +} // namespace ark + +#endif // ARK_MODEL_NODE_HPP_ diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp new file mode 100644 index 000000000..c34ddadd5 --- /dev/null +++ b/ark/model/model_op.cpp @@ -0,0 +1,187 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
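+//
+// Op-type registry: ModelOpT::from_name() lazily registers every concrete
+// op type with the global ModelOpFactory, so deserialize() can reconstruct
+// an op from the "Type" string stored in its serialized JSON.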
+ +#include "model_op.hpp" + +#include +#include + +#include "logging.h" +#include "model_tensor.hpp" +#include "ops/ops_arithmetic.hpp" +#include "ops/ops_math.hpp" +#include "ops/ops_matmul.hpp" +#include "ops/ops_refer.hpp" +#include "ops/ops_scale.hpp" +#include "ops/ops_sendrecv.hpp" +#include "ops/ops_tensor.hpp" + +namespace ark { + +std::shared_ptr model_op_factory() { + static auto factory = std::make_shared(); + return factory; +} + +#define MODEL_OP_TYPE_REGISTER(_name) \ + instances[#_name] = std::make_shared(#_name); \ + model_op_factory()->register_op(#_name); + +const ModelOpType ModelOpT::from_name(const std::string &type_name) { + static std::unordered_map instances; + if (instances.empty()) { + MODEL_OP_TYPE_REGISTER(Add); + MODEL_OP_TYPE_REGISTER(Div); + MODEL_OP_TYPE_REGISTER(Exp); + MODEL_OP_TYPE_REGISTER(Matmul); + MODEL_OP_TYPE_REGISTER(Mul); + MODEL_OP_TYPE_REGISTER(Recv); + MODEL_OP_TYPE_REGISTER(Relu); + MODEL_OP_TYPE_REGISTER(Scale); + MODEL_OP_TYPE_REGISTER(Send); + MODEL_OP_TYPE_REGISTER(SendDone); + MODEL_OP_TYPE_REGISTER(Sub); + MODEL_OP_TYPE_REGISTER(Tensor); + } + auto it = instances.find(type_name); + if (it == instances.end()) { + ERR(InvalidUsageError, "Unknown model op type: ", type_name); + } + return it->second; +} + +std::vector ModelOp::input_tensors() const { + // input_tensors = read_tensors || write_tensors + std::set input_tensors; + input_tensors.insert(read_tensors_.begin(), read_tensors_.end()); + input_tensors.insert(write_tensors_.begin(), write_tensors_.end()); + std::vector input_tensors_vec(input_tensors.begin(), + input_tensors.end()); + return input_tensors_vec; +} + +void ModelOp::verify() const { + std::set inputs; + inputs.insert(read_tensors_.begin(), read_tensors_.end()); + inputs.insert(write_tensors_.begin(), write_tensors_.end()); + + for (auto &input : inputs) { + if (input->buffer() == nullptr) { + ERR(InvalidUsageError, "input tensor buffer is null"); + } + } + + std::set outputs; + outputs.insert(result_tensors_.begin(), result_tensors_.end()); + + for (auto &output : outputs) { + if (output->buffer() == nullptr) { + ERR(InvalidUsageError, "output tensor buffer is null"); + } + } + + std::set intersect; + std::set_intersection(inputs.begin(), inputs.end(), outputs.begin(), + outputs.end(), + std::inserter(intersect, intersect.begin())); + if (!intersect.empty()) { + ERR(InvalidUsageError, "cyclic dependency detected"); + } +} + +std::string ModelOp::vec_string(const Dims &dims) { + if (dims.is_invalid()) { + ERR(InvalidUsageError, "invalid dims given"); + } + int ndims = dims.ndims(); + std::stringstream ss; + ss << "Vec<"; + if (ndims > 0) { + ss << dims[0]; + for (int i = 1; i < ndims; ++i) { + ss << ", " << dims[i]; + } + } + ss << '>'; + return ss.str(); +} + +std::string ModelOp::function_name_string( + const std::string &kernel_name, + const std::vector &template_args) { + std::stringstream ss; + ss << kernel_name; + if (!template_args.empty()) { + ss << "<" << template_args[0]; + for (size_t i = 1; i < template_args.size(); i++) { + ss << ", " << template_args[i]; + } + ss << ">"; + } + return ss.str(); +} + +nlohmann::ordered_json ModelOp::serialize() const { + nlohmann::ordered_json j; + j["Type"] = type_->type_name(); + j["Name"] = name_; + j["IsVirtual"] = is_virtual_; + j["ReadTensors"] = nlohmann::ordered_json::array(); + for (auto &t : read_tensors_) { + j["ReadTensors"].push_back(t->serialize()); + } + j["WriteTensors"] = nlohmann::ordered_json::array(); + for (auto &t : write_tensors_) { + 
j["WriteTensors"].push_back(t->serialize()); + } + j["ResultTensors"] = nlohmann::ordered_json::array(); + for (auto &t : result_tensors_) { + j["ResultTensors"].push_back(t->serialize()); + } + j["Args"] = nlohmann::ordered_json::object(); + for (auto &arg : args_) { + j["Args"][arg.first] = arg.second.serialize(); + } + return j; +} + +std::shared_ptr ModelOp::deserialize( + const nlohmann::json &serialized) { + if (!serialized.contains("Type")) { + ERR(InvalidUsageError, "ModelOp deserialization failed: missing Type"); + } else if (!serialized.contains("Name")) { + ERR(InvalidUsageError, "ModelOp deserialization failed: missing Name"); + } else if (!serialized.contains("IsVirtual")) { + ERR(InvalidUsageError, + "ModelOp deserialization failed: missing IsVirtual"); + } else if (!serialized.contains("ReadTensors")) { + ERR(InvalidUsageError, + "ModelOp deserialization failed: missing ReadTensors"); + } else if (!serialized.contains("WriteTensors")) { + ERR(InvalidUsageError, + "ModelOp deserialization failed: missing WriteTensors"); + } else if (!serialized.contains("ResultTensors")) { + ERR(InvalidUsageError, + "ModelOp deserialization failed: missing ResultTensors"); + } else if (!serialized.contains("Args")) { + ERR(InvalidUsageError, "ModelOp deserialization failed: missing Args"); + } + auto ret = model_op_factory()->construct(serialized["Type"]); + ret->name_ = serialized["Name"]; + ret->is_virtual_ = serialized["IsVirtual"]; + for (const auto &t : serialized["ReadTensors"]) { + ret->read_tensors_.push_back(ModelTensor::deserialize(t)); + } + for (const auto &t : serialized["WriteTensors"]) { + ret->write_tensors_.push_back(ModelTensor::deserialize(t)); + } + for (const auto &t : serialized["ResultTensors"]) { + ret->result_tensors_.push_back(ModelTensor::deserialize(t)); + } + for (const auto &arg : serialized["Args"].items()) { + ret->args_[arg.key()] = ModelOpArg::deserialize(arg.value()); + } + return ret; +} + +} // namespace ark diff --git a/ark/model/model_op.hpp b/ark/model/model_op.hpp new file mode 100644 index 000000000..e6237361a --- /dev/null +++ b/ark/model/model_op.hpp @@ -0,0 +1,140 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#ifndef ARK_MODEL_OP_HPP_ +#define ARK_MODEL_OP_HPP_ + +#include +#include +#include +#include +#include +#include + +#include "ark/model_ref.hpp" +#include "logging.h" +#include "model_op_arg.hpp" +#include "model_op_config.hpp" +#include "nlohmann/json.hpp" + +namespace ark { + +class ModelGraph; + +class ModelOpT; +using ModelOpType = std::shared_ptr; + +class ModelOp; + +class ModelOpT : public NamedT { + public: + ModelOpT(const std::string &type_name) : NamedT(type_name) {} + + ModelOpT(const ModelOpT &) = default; + + static const ModelOpType from_name(const std::string &type_name); +}; + +class ModelOp { + public: + ModelOp() = default; + + ModelOp(const std::string &type_name, bool is_virtual = false) + : type_(ModelOpT::from_name(type_name)), is_virtual_(is_virtual) {} + + ModelOp(const ModelOp &) = default; + + virtual std::string impl_name( + [[maybe_unused]] const nlohmann::json &config) const { + return ""; + } + + virtual std::vector impl_args( + [[maybe_unused]] const nlohmann::json &config) const { + return {}; + } + + void set_name(const std::string &name) { name_ = name; } + + ModelOpType type() const { return type_; } + + const std::string &name() const { return name_; } + + bool is_virtual() const { return is_virtual_; } + + const std::vector &read_tensors() const { + return read_tensors_; + } + + const std::vector &write_tensors() const { + return write_tensors_; + } + + const std::vector &result_tensors() const { + return result_tensors_; + } + + const std::map &args() const { return args_; } + + std::vector input_tensors() const; + + void verify() const; + + nlohmann::ordered_json serialize() const; + + static std::shared_ptr deserialize( + const nlohmann::json &serialized); + + protected: + friend class ModelGraph; + + static std::string vec_string(const Dims &dims); + + static std::string function_name_string( + const std::string &kernel_name, + const std::vector &template_args); + + ModelOpType type_; + std::string name_; + bool is_virtual_; + std::vector read_tensors_; + std::vector write_tensors_; + std::vector result_tensors_; + std::map args_; +}; + +class ModelOpFactory { + private: + std::unordered_map()>> + constructors_; + + public: + ModelOpFactory() = default; + + template + void register_op(const std::string &class_name) { + if (constructors_.find(class_name) != constructors_.end()) { + ERR(InvalidUsageError, "Class already registered: ", class_name); + } + constructors_[class_name] = []() { + return std::shared_ptr(new DerivedModelOp()); + }; + } + + std::shared_ptr construct(const std::string &class_name) const { + auto it = constructors_.find(class_name); + if (it == constructors_.end()) { + ERR(InvalidUsageError, + "Tried to construct an unknown class: ", class_name); + } + return it->second(); + } + + bool empty() const { return constructors_.empty(); } +}; + +std::shared_ptr model_op_factory(); + +} // namespace ark + +#endif // ARK_MODEL_OP_HPP_ diff --git a/ark/model/model_op_arg.cpp b/ark/model/model_op_arg.cpp new file mode 100644 index 000000000..29b76d07c --- /dev/null +++ b/ark/model/model_op_arg.cpp @@ -0,0 +1,60 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
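+//
+// Each argument serializes to a single-entry JSON object, { "<TYPE>": value },
+// where <TYPE> is the name registered via REGISTER_MODEL_OP_ARG_TYPE.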
+ +#include "model_op_arg.hpp" + +#include "logging.h" +#include "model_tensor.hpp" + +namespace ark { + +ModelOpArg::ModelOpArg() : NamedT("") {} + +nlohmann::ordered_json ModelOpArg::serialize() const { + const std::string &type_name = this->type_name(); + nlohmann::ordered_json j; + if (type_name == "TENSOR") { + j[type_name] = this->value()->serialize(); + } else if (type_name == "DIMS") { + j[type_name] = this->value().vector(); + } else if (type_name == "INT") { + j[type_name] = this->value(); + } else if (type_name == "INT64") { + j[type_name] = this->value(); + } else if (type_name == "UINT64") { + j[type_name] = this->value(); + } else if (type_name == "BOOL") { + j[type_name] = this->value(); + } else if (type_name == "FLOAT") { + j[type_name] = this->value(); + } else { + ERR(InvalidUsageError, + "Tried to serialize an unknown type of argument: ", type_name); + } + return j; +} + +ModelOpArg ModelOpArg::deserialize(const nlohmann::json &serialized) { + const std::string &type_name = serialized[0]; + auto &value = serialized[1]; + if (type_name == "TENSOR") { + return ModelOpArg(ModelTensor::deserialize(value)); + } else if (type_name == "DIMS") { + return ModelOpArg(Dims(value.get>())); + } else if (type_name == "INT") { + return ModelOpArg(value.get()); + } else if (type_name == "INT64") { + return ModelOpArg(value.get()); + } else if (type_name == "UINT64") { + return ModelOpArg(value.get()); + } else if (type_name == "BOOL") { + return ModelOpArg(value.get()); + } else if (type_name == "FLOAT") { + return ModelOpArg(value.get()); + } + ERR(InvalidUsageError, + "Tried to deserialize an unknown type of argument: ", type_name); + return ModelOpArg(); +} + +} // namespace ark diff --git a/ark/model/model_op_arg.hpp b/ark/model/model_op_arg.hpp new file mode 100644 index 000000000..faf628ceb --- /dev/null +++ b/ark/model/model_op_arg.hpp @@ -0,0 +1,66 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+
+#ifndef ARK_MODEL_OP_ARG_HPP_
+#define ARK_MODEL_OP_ARG_HPP_
+
+#include <any>
+#include <string>
+#include <vector>
+
+#include "ark/dims.hpp"
+#include "ark/model_ref.hpp"
+#include "named_type.hpp"
+#include "nlohmann/json.hpp"
+
+namespace ark {
+
+template <typename T>
+class ModelOpArgTName;
+
+#define REGISTER_MODEL_OP_ARG_TYPE(_name, _type)              \
+    template <>                                               \
+    class ModelOpArgTName<_type> {                            \
+       public:                                                \
+        ModelOpArgTName() : name(#_name), type_str(#_type){}; \
+        const std::string name;                               \
+        const std::string type_str;                           \
+    };
+
+class ModelOpArg : public NamedT {
+   public:
+    ModelOpArg();
+
+    template <typename T>
+    ModelOpArg(T val)
+        : NamedT(ModelOpArgTName<T>().name),
+          type_str_(ModelOpArgTName<T>().type_str),
+          val_(val) {}
+
+    template <typename T>
+    T value() const {
+        return std::any_cast<T>(val_);
+    }
+
+    const std::string &type_str() const { return type_str_; }
+
+    nlohmann::ordered_json serialize() const;
+
+    static ModelOpArg deserialize(const nlohmann::json &serialized);
+
+   private:
+    std::string type_str_;
+    std::any val_;
+};
+
+REGISTER_MODEL_OP_ARG_TYPE(INT, int)
+REGISTER_MODEL_OP_ARG_TYPE(INT64, int64_t)
+REGISTER_MODEL_OP_ARG_TYPE(UINT64, uint64_t)
+REGISTER_MODEL_OP_ARG_TYPE(BOOL, bool)
+REGISTER_MODEL_OP_ARG_TYPE(FLOAT, float)
+REGISTER_MODEL_OP_ARG_TYPE(DIMS, Dims)
+REGISTER_MODEL_OP_ARG_TYPE(TENSOR, ModelTensorRef)
+
+} // namespace ark
+
+#endif // ARK_MODEL_OP_ARG_HPP_
diff --git a/ark/model/model_op_config.cpp b/ark/model/model_op_config.cpp
new file mode 100644
index 000000000..56cd7395f
--- /dev/null
+++ b/ark/model/model_op_config.cpp
@@ -0,0 +1,69 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "model_op_config.hpp"
+
+namespace ark {
+
+ModelOpConfigArchT::ModelOpConfigArchT() : NamedT("ANY"){};
+
+ModelOpConfigArchT::ModelOpConfigArchT(const std::string &c0)
+    : NamedT(c0), category_({c0}) {}
+
+ModelOpConfigArchT::ModelOpConfigArchT(const std::string &c0,
+                                       const std::string &c1)
+    : NamedT(c0 + "_" + c1), category_({c0, c1}) {}
+
+ModelOpConfigArchT::ModelOpConfigArchT(const std::string &c0,
+                                       const std::string &c1,
+                                       const std::string &c2)
+    : NamedT(c0 + "_" + c1 + "_" + c2), category_({c0, c1, c2}) {}
+
+bool ModelOpConfigArchT::belongs_to(
+    const std::shared_ptr<ModelOpConfigArchT> arch) const {
+    if (category_.size() <= arch->category().size()) {
+        return false;
+    }
+    size_t idx = 0;
+    for (const auto &name : arch->category()) {
+        if (category_[idx++] != name) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool ModelOpConfigArchT::later_than(
+    const std::shared_ptr<ModelOpConfigArchT> arch) const {
+    if (category_.size() != arch->category().size()) {
+        return false;
+    }
+    size_t idx = 0;
+    for (const auto &name : arch->category()) {
+        if (category_[idx] != name) {
+            return category_[idx] > name;
+        }
+        // Advance past matching category levels so the first differing
+        // level decides the comparison.
+        idx++;
+    }
+    return true;
+}
+
+extern const ModelOpConfigArchType ARCH_ANY =
+    std::make_shared<ModelOpConfigArchT>();
+
+extern const ModelOpConfigArchType ARCH_CUDA =
+    std::make_shared<ModelOpConfigArchT>("CUDA");
+extern const ModelOpConfigArchType ARCH_CUDA_70 =
+    std::make_shared<ModelOpConfigArchT>("CUDA", "70");
+extern const ModelOpConfigArchType ARCH_CUDA_80 =
+    std::make_shared<ModelOpConfigArchT>("CUDA", "80");
+extern const ModelOpConfigArchType ARCH_CUDA_90 =
+    std::make_shared<ModelOpConfigArchT>("CUDA", "90");
+
+extern const ModelOpConfigArchType ARCH_ROCM =
+    std::make_shared<ModelOpConfigArchT>("ROCM");
+extern const ModelOpConfigArchType ARCH_ROCM_90A =
+    std::make_shared<ModelOpConfigArchT>("ROCM", "90A");
+extern const ModelOpConfigArchType ARCH_ROCM_942 =
+    std::make_shared<ModelOpConfigArchT>("ROCM", "942");
+
+} // namespace ark
diff --git a/ark/model/model_op_config.hpp b/ark/model/model_op_config.hpp new file mode 100644 index
000000000..11e1a17d7 --- /dev/null +++ b/ark/model/model_op_config.hpp @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_MODEL_OP_CONFIG_HPP_ +#define ARK_MODEL_OP_CONFIG_HPP_ + +#include +#include +#include + +#include "named_type.hpp" + +namespace ark { + +class ModelOpConfigArchT : public NamedT { + public: + ModelOpConfigArchT(); + + ModelOpConfigArchT(const std::string &c0); + + ModelOpConfigArchT(const std::string &c0, const std::string &c1); + + ModelOpConfigArchT(const std::string &c0, const std::string &c1, + const std::string &c2); + + ModelOpConfigArchT(const ModelOpConfigArchT &) = default; + + const std::vector &category() const { return category_; } + + bool belongs_to(const std::shared_ptr arch) const; + + bool later_than(const std::shared_ptr arch) const; + + private: + std::vector category_; +}; + +using ModelOpConfigArchType = std::shared_ptr; + +extern const ModelOpConfigArchType ARCH_ANY; + +extern const ModelOpConfigArchType ARCH_CUDA; +extern const ModelOpConfigArchType ARCH_CUDA_70; +extern const ModelOpConfigArchType ARCH_CUDA_80; +extern const ModelOpConfigArchType ARCH_CUDA_90; + +extern const ModelOpConfigArchType ARCH_ROCM; +extern const ModelOpConfigArchType ARCH_ROCM_90A; +extern const ModelOpConfigArchType ARCH_ROCM_942; + +class ModelOpConfig { + public: + ModelOpConfig(const ModelOpConfigArchType arch, const std::string &name, + const std::string &impl_name) + : arch_(arch), name_(name), impl_name_(impl_name) {} + + ModelOpConfig(const ModelOpConfig &) = default; + + const ModelOpConfigArchType arch() const { return arch_; } + + const std::string &name() const { return name_; } + + const std::string &impl_name() const { return impl_name_; } + + private: + ModelOpConfigArchType arch_; + std::string name_; + std::string impl_name_; +}; + +} // namespace ark + +#endif // ARK_MODEL_OP_CONFIG_HPP_ diff --git a/ark/model/model_tensor.cpp b/ark/model/model_tensor.cpp new file mode 100644 index 000000000..04e575ffd --- /dev/null +++ b/ark/model/model_tensor.cpp @@ -0,0 +1,176 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "model_tensor.hpp" + +#include "logging.h" + +namespace ark { + +ModelBuffer::ModelBuffer() { + static size_t id = 0; + id_ = id++; +} + +ModelBuffer::ModelBuffer(size_t id) { id_ = id; } + +ModelTensor::ModelTensor(ModelDataType data_type, ModelBufferRef buffer, + const Dims &shape, const Dims &strides, + const Dims &offsets, const Dims &pads, bool exported, + int imported_rank) + : data_type_(data_type), + buffer_(buffer), + exported_(exported), + imported_rank_(imported_rank) { + if (shape.size() == 0) { + ERR(InvalidUsageError, + "Tensor shape should consist of positive numbers. Given: ", shape); + } else if (shape.is_no_dim()) { + // Assume a single-element constant + shape_ = {1}; + } else { + shape_ = shape; + } + int ndims = shape_.ndims(); + if (strides.is_no_dim()) { + strides_ = shape_; + } else { + if (ndims != strides.ndims()) { + ERR(InvalidUsageError, + "Tensor shapes and strides should have the same number of " + "dimensions. Given: shape ", + shape_, " strides ", strides); + } + strides_ = strides; + } + if (offsets.is_no_dim()) { + std::vector dims_vec; + for (int i = 0; i < ndims; ++i) { + dims_vec.push_back(0); + } + offsets_ = Dims{dims_vec}; + } else { + if (ndims != offsets.ndims()) { + ERR(InvalidUsageError, + "Tensor shape and offs should have the same number of " + "dimensions. 
Given: shape ", + shape_, " offs ", offsets); + } + offsets_ = offsets; + } + if (pads.is_no_dim()) { + std::vector dims_vec; + for (int i = 0; i < ndims; ++i) { + dims_vec.push_back(1); + } + pads_ = Dims{dims_vec}; + } else { + if (ndims != pads.ndims()) { + ERR(InvalidUsageError, + "Tensor shape and pads should have the same number of " + "dimensions. Given: shape ", + shape_, " pads ", pads); + } + pads_ = pads; + } + for (int i = 0; i < ndims; ++i) { + if (strides_[i] % pads_[i] != 0) { + ERR(InvalidUsageError, + "Tensor strides should be a multiple of pads. strides ", + strides_, " pads ", pads_); + } + } + for (int i = 0; i < ndims; ++i) { + if (offsets_[i] + shape_[i] > strides_[i]) { + ERR(InvalidUsageError, "Tensor exceeds the memory boundary. offs ", + offsets_, " shape ", shape_, " strides ", strides_); + } + } + id_ = next_id(); +} + +ModelTensor::ModelTensor(const ModelTensor &other) { + id_ = next_id(); + data_type_ = other.data_type_; + buffer_ = other.buffer_; + shape_ = other.shape_; + strides_ = other.strides_; + offsets_ = other.offsets_; + pads_ = other.pads_; + exported_ = other.exported_; + imported_rank_ = other.imported_rank_; +} + +bool ModelTensor::is_sequential() const { + // Shape and strides should be the same except for the first dimension. + for (int i = 1; i < shape_.ndims(); ++i) { + if (shape_[i] != strides_[i]) { + return false; + } + } + return true; +} + +nlohmann::ordered_json ModelTensor::serialize() const { + nlohmann::ordered_json j; + j["Id"] = id_; + j["DataType"] = data_type_->type_name(); + j["BufferId"] = buffer_->id(); + j["Shape"] = shape_.vector(); + j["Strides"] = strides_.vector(); + j["Offsets"] = offsets_.vector(); + j["Pads"] = pads_.vector(); + j["Exported"] = exported_; + j["ImportedRank"] = imported_rank_; + return j; +} + +std::shared_ptr ModelTensor::deserialize( + const nlohmann::json &serialized) { + if (!serialized.contains("DataType")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing DataType"); + } else if (!serialized.contains("BufferId")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing BufferId"); + } else if (!serialized.contains("Shape")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing Shape"); + } else if (!serialized.contains("Strides")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing Strides"); + } else if (!serialized.contains("Offsets")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing Offsets"); + } else if (!serialized.contains("Pads")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing Pads"); + } else if (!serialized.contains("Exported")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing Exported"); + } else if (!serialized.contains("ImportedRank")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing ImportedRank"); + } else if (!serialized.contains("Id")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing Id"); + } + auto ret = std::make_shared( + ModelDataT::from_name(serialized["DataType"]), + std::make_shared(serialized["BufferId"]), + serialized["Shape"].get>(), + serialized["Strides"].get>(), + serialized["Offsets"].get>(), + serialized["Pads"].get>(), + serialized["Exported"].get(), + serialized["ImportedRank"].get()); + ret->id_ = serialized["Id"]; + return ret; +} + +size_t ModelTensor::next_id() { + static size_t id = 0; + return id++; +} + +} // namespace ark diff --git 
a/ark/model/model_tensor.hpp b/ark/model/model_tensor.hpp new file mode 100644 index 000000000..e0db15165 --- /dev/null +++ b/ark/model/model_tensor.hpp @@ -0,0 +1,92 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_MODEL_TENSOR_HPP_ +#define ARK_MODEL_TENSOR_HPP_ + +#include "ark/dims.hpp" +#include "ark/model_ref.hpp" +#include "model_data_type.hpp" +#include "nlohmann/json.hpp" + +namespace ark { + +class ModelBuffer { + public: + ModelBuffer(); + + ModelBuffer(size_t id); + + size_t id() const { return id_; } + + private: + size_t id_; +}; + +/// Tensor is a view of a TensorBuf. +/// +/// Illustration of a single axis of a tensor: +/// +/// 0 off stride +/// |------------|-------------shape-------------|---------------------------| +/// ^ <-----------------------------> ^ +/// | data range of this tensor | +/// +------------------------------------------+-----------+ +/// | +/// We call these "padding". +/// +class ModelTensor { + public: + ModelTensor(ModelDataType data_type, ModelBufferRef buffer, + const Dims &shape, const Dims &strides = {}, + const Dims &offsets = {}, const Dims &pads = {}, + bool exported = false, int imported_rank = -1); + + ModelTensor(const ModelTensor &other); + + size_t id() const { return id_; } + + ModelDataType data_type() const { return data_type_; } + + const ModelBufferRef buffer() const { return buffer_; } + + const Dims &shape() const { return shape_; } + + const Dims &strides() const { return strides_; } + + const Dims &offsets() const { return offsets_; } + + const Dims &pads() const { return pads_; } + + bool exported() const { return exported_; } + + int imported_rank() const { return imported_rank_; } + + bool is_sequential() const; + + void set_exported() { exported_ = true; } + + void set_imported_rank(int rank) { imported_rank_ = rank; } + + nlohmann::ordered_json serialize() const; + + static std::shared_ptr deserialize( + const nlohmann::json &serialized); + + private: + static size_t next_id(); + + size_t id_; + ModelDataType data_type_; + ModelBufferRef buffer_; + Dims shape_; + Dims strides_; + Dims offsets_; + Dims pads_; + bool exported_; + int imported_rank_; +}; + +} // namespace ark + +#endif // ARK_MODEL_TENSOR_HPP_ diff --git a/ark/model/model_test.cpp b/ark/model/model_test.cpp new file mode 100644 index 000000000..bebdd5cec --- /dev/null +++ b/ark/model/model_test.cpp @@ -0,0 +1,443 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/model.hpp" + +#include + +#include "logging.h" +#include "model_node.hpp" +#include "model_op.hpp" +#include "unittest/unittest_utils.h" + +ark::unittest::State test_model_basics() { + ark::Model model; + ark::Model compressed; + + // Basic Test. 
+ // Model graph: + // + // TensorOp --> t0 --+--> AddOp --> t2 + // | + // TensorOp --> t1 --+ + // | + // TensorOp --> tx --+ (tx is the output reference, hidden from the code) + // + + ark::ModelTensorRef t0 = model.tensor({1}, ark::FP32); + ark::ModelTensorRef t1 = model.tensor({1}, ark::FP32); + ark::ModelTensorRef t2 = model.add(t0, t1); + + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + UNITTEST_EQ(compressed.nodes().size(), 1); + + auto node = compressed.nodes().front(); + UNITTEST_EQ(node->ops.size(), 1); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[0]->read_tensors()[0], t0); + UNITTEST_EQ(node->ops[0]->read_tensors()[1], t1); + UNITTEST_EQ(node->consumers.size(), 0); + UNITTEST_EQ(node->producers.size(), 0); + + // Test a chain of Ops that share an input tensor. + // Model graph: + // + // TensorOp --> t0 --+--> AddOp --> t2 ------+--> AddOp --> t3 + // | | + // TensorOp --> t1 --+-----------------------+ + // | | + // TensorOp --> tx --+ TensorOp --> ty --+ + // + // (tx and ty are output references, hidden from the code) + // + + ark::ModelTensorRef t3 = model.add(t2, t1); + + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,AddOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + UNITTEST_EQ(compressed.nodes().size(), 1); + + node = compressed.nodes().front(); + + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[0]->read_tensors()[0], t0); + UNITTEST_EQ(node->ops[0]->read_tensors()[1], t1); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], t3); + UNITTEST_EQ(node->ops[1]->read_tensors()[0], t2); + UNITTEST_EQ(node->ops[1]->read_tensors()[1], t1); + UNITTEST_EQ(node->consumers.size(), 0); + UNITTEST_EQ(node->producers.size(), 0); + + // Test a chain of Ops without shared input tensors. + // Model graph (omit leftmost part): + // + // ... ----+--> AddOp --> t3 ----+-> ReluOp --> t4 + // ... | | + // ... ----+ TensorOp --> tz --+ + // ... | + // ... --+ (tz is the output reference, hidden from the code) + // + + ark::ModelTensorRef t4 = model.relu(t3); + + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,AddOp,ReluOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + UNITTEST_EQ(compressed.nodes().size(), 1); + + node = compressed.nodes().front(); + + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[0]->read_tensors()[0], t0); + UNITTEST_EQ(node->ops[0]->read_tensors()[1], t1); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], t3); + UNITTEST_EQ(node->ops[1]->read_tensors()[0], t2); + UNITTEST_EQ(node->ops[1]->read_tensors()[1], t1); + UNITTEST_EQ(node->ops[2]->result_tensors()[0], t4); + UNITTEST_EQ(node->ops[2]->read_tensors()[0], t3); + UNITTEST_EQ(node->consumers.size(), 0); + UNITTEST_EQ(node->producers.size(), 0); + + // Test a chain of Ops that use the output from the same previous Op. + // Model graph (omit leftmost part): + // + // ... +---- (this is t2) -------------------------+--> AddOp --> t5 + // ... | | + // ... --+-+--> AddOp --> t3 ----+-> ReluOp --> t4 --+ + // ... | | | + // ... ----+ TensorOp --> tz --+ | + // ... | TensorOp --> tw --+ + // ... 
--+ + // + // (tz and tw are output references, hidden from the code) + // + + ark::ModelTensorRef t5 = model.add(t2, t4); + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,AddOp,ReluOp,AddOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + + auto nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 1); + + auto nodes_iter = nodes.begin(); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add;add_1;relu;add_2;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], t3); + UNITTEST_EQ(node->ops[2]->result_tensors()[0], t4); + UNITTEST_EQ(node->ops[3]->result_tensors()[0], t5); + + // Test an Op that uses outputs from multiple previous Ops. + // Model graph (omit leftmost part): + // + // ... ----- (this is t2) --+--> AddOp --> t5 + // ... | | + // ... -+-> ReluOp --> t4 --+ | + // ... | | | + // ... -+ | | + // ... TensorOp --> tw --+ | + // ... | + // | + // TensorOp --> t6 --+--> AddOp --> t8 --+--> AddOp --> t9 + // | + // TensorOp --> t7 --+ + // | + // TensorOp --> tu --+ + // + // (tw and tu are output references, hidden from the code) + // + + ark::ModelTensorRef t6 = model.tensor({1}, ark::FP32); + ark::ModelTensorRef t7 = model.tensor({1}, ark::FP32); + ark::ModelTensorRef t8 = model.add(t6, t7); + ark::ModelTensorRef t9 = model.add(t5, t8); + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,AddOp,ReluOp,AddOp,) --+ + // | + // (AddOp,) --+--> (AddOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + + nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 3); + + nodes_iter = nodes.begin(); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add;add_1;relu;add_2;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], t3); + UNITTEST_EQ(node->ops[2]->result_tensors()[0], t4); + UNITTEST_EQ(node->ops[3]->result_tensors()[0], t5); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_3;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t8); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_4;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t9); + + // Test an Op that uses a single input tensor for multiple inputs. + // Model graph (omit leftmost part): + // + // ... ----- (this is t2) --+--> AddOp --> t5 + // ... | | + // ... -+-> ReluOp --> t4 --+ | + // ... | | | + // ... -+ | | + // ... TensorOp --> tw --+ | + // ... 
| + // | + // TensorOp --> t6 --+--> AddOp --> t8 --+--> AddOp --> t9 + // | + // TensorOp --> t7 --+ + // | + // TensorOp --> tu --+ + // + // TensorOp --> t10 --+--> AddOp --> t11 + // | ^ ^ + // | | | + // +----+ | + // | + // TensorOp --> tv -----------+ + // + // (tw, tu, and tv are output references, hidden from the code) + // + + ark::ModelTensorRef t10 = model.tensor({1}, ark::FP32); + ark::ModelTensorRef t11 = model.add(t10, t10); + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,AddOp,ReluOp,AddOp,) --+ + // | + // (AddOp,) --+--> (AddOp,) + // + // (AddOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + + nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 4); + + nodes_iter = nodes.begin(); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add;add_1;relu;add_2;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], t3); + UNITTEST_EQ(node->ops[2]->result_tensors()[0], t4); + UNITTEST_EQ(node->ops[3]->result_tensors()[0], t5); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_3;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t8); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_4;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t9); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_5;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t11); + + // Test using previous Ops' outputs from multiple different Ops. + // Model graph (omit leftmost part): + // + // ... ----- (this is t2) --+--> AddOp --> t5 + // ... | | + // ... -+-> ReluOp --> t4 --+ | + // ... | | | + // ... -+ | | + // ... TensorOp --> tw --+ | + // ... | + // | + // TensorOp --> t6 --+--> AddOp --> t8 --+--> AddOp --> t9 + // | | + // TensorOp --> t7 --+ +--> AddOp --> t12 + // | + // TensorOp --> tu --+ + // + // TensorOp --> t10 --+--> AddOp --> t11 + // | ^ ^ + // | | | + // +----+ | + // | + // TensorOp --> tv -----------+ + // + // (tw, tu, and tv are output references, hidden from the code) + // + + ark::ModelTensorRef t12 = model.add(t5, t8); + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,AddOp,ReluOp,AddOp,) --+--> (AddOp,) + // | + // (AddOp,) --+--> (AddOp,) + // + // (AddOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + + nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 5); + + nodes_iter = nodes.begin(); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add;add_1;relu;add_2;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], t3); + UNITTEST_EQ(node->ops[2]->result_tensors()[0], t4); + UNITTEST_EQ(node->ops[3]->result_tensors()[0], t5); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_3;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t8); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_4;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t9); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_5;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t11); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_6;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t12); + + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_model_dependent_inputs() { + ark::Model m; + + ark::ModelTensorRef ones = m.tensor({256, 256}, ark::FP16); + ark::ModelTensorRef x0 = 
m.scale(m.scale(ones, 2), 2); + ark::ModelTensorRef x1 = m.scale(m.scale(x0, 2), 2); + + ark::ModelTensorRef x2 = m.mul(ones, x1); + ark::ModelTensorRef x3 = m.mul(ones, x1); + ark::ModelTensorRef x4 = m.mul(x2, x3); + ark::ModelTensorRef y = m.add(x0, x4); + + auto compressed = m.compress(); + auto nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 4); + auto nodes_iter = nodes.begin(); + auto node = (nodes_iter++)->get(); + UNITTEST_EQ(node->ops.size(), 4); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], x0); + UNITTEST_EQ(node->ops[3]->result_tensors()[0], x1); + UNITTEST_EQ(node->consumers.size(), 3); + UNITTEST_EQ(node->producers.size(), 0); + node = (nodes_iter++)->get(); + UNITTEST_EQ(node->ops.size(), 1); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], x2); + UNITTEST_EQ(node->ops[0]->read_tensors()[0], ones); + UNITTEST_EQ(node->ops[0]->read_tensors()[1], x1); + UNITTEST_EQ(node->consumers.size(), 1); + UNITTEST_EQ(node->producers.size(), 1); + node = (nodes_iter++)->get(); + UNITTEST_EQ(node->ops.size(), 1); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], x3); + UNITTEST_EQ(node->ops[0]->read_tensors()[0], ones); + UNITTEST_EQ(node->ops[0]->read_tensors()[1], x1); + UNITTEST_EQ(node->consumers.size(), 1); + UNITTEST_EQ(node->producers.size(), 1); + node = (nodes_iter++)->get(); + UNITTEST_EQ(node->ops.size(), 2); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], x4); + UNITTEST_EQ(node->ops[0]->read_tensors()[0], x2); + UNITTEST_EQ(node->ops[0]->read_tensors()[1], x3); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], y); + UNITTEST_EQ(node->ops[1]->read_tensors()[0], x0); + UNITTEST_EQ(node->ops[1]->read_tensors()[1], x4); + UNITTEST_EQ(node->consumers.size(), 0); + UNITTEST_EQ(node->producers.size(), 3); + + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_model_noop() { + ark::Model model; + model.tensor({1}, ark::FP32); + model.tensor({1}, ark::FP32); + model.tensor({1}, ark::FP32); + + UNITTEST_TRUE(model.verify()); + + auto compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + UNITTEST_EQ(compressed.nodes().size(), 0); + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_model_cumulate() { + // OpNode graph (parentheses indicate a OpNode): + // + // (Relu,) --+ (Relu,) --+ + // | | + // (Relu,Add,) --+--> (Add,) --+--> (Add,) + // + + ark::Model model; + ark::ModelTensorRef cumulate = model.tensor({1}, ark::FP32); + + for (int i = 0; i < 3; ++i) { + ark::ModelTensorRef t = model.tensor({1}, ark::FP32); + ark::ModelTensorRef r = model.relu(t); + cumulate = model.add(cumulate, r); + } + + UNITTEST_TRUE(model.verify()); + + auto compressed = model.compress(); + auto nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 5); + + auto last_node = nodes.back().get(); + UNITTEST_EQ(last_node->ops[0]->result_tensors()[0], cumulate); + UNITTEST_EQ(last_node->producers.size(), 2); + UNITTEST_EQ(last_node->consumers.size(), 0); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_model_basics); + UNITTEST(test_model_dependent_inputs); + UNITTEST(test_model_noop); + UNITTEST(test_model_cumulate); + return 0; +} diff --git a/ark/model/named_type.cpp b/ark/model/named_type.cpp new file mode 100644 index 000000000..2f6bdb31a --- /dev/null +++ b/ark/model/named_type.cpp @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include "named_type.hpp" + +namespace ark { + +bool operator==(const NamedT &lhs, const NamedT &rhs) { + return lhs.type_name() == rhs.type_name(); +} + +} // namespace ark diff --git a/ark/model/named_type.hpp b/ark/model/named_type.hpp new file mode 100644 index 000000000..344cc6980 --- /dev/null +++ b/ark/model/named_type.hpp @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_NAMED_TYPE_HPP_ +#define ARK_NAMED_TYPE_HPP_ + +#include + +namespace ark { + +class NamedT { + public: + NamedT(const std::string &type_name) : type_name_(type_name) {} + NamedT &operator=(const NamedT &) = default; + + const std::string &type_name() const { return type_name_; } + + private: + std::string type_name_; +}; + +bool operator==(const NamedT &lhs, const NamedT &rhs); + +} // namespace ark + +#endif // ARK_NAMED_TYPE_HPP_ diff --git a/ark/ops/ops_all_reduce.cpp b/ark/ops/ops_all_reduce.cpp new file mode 100644 index 000000000..647c50d1e --- /dev/null +++ b/ark/ops/ops_all_reduce.cpp @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "math_utils.h" +#include "ops_common.hpp" + +namespace ark { + +ModelTensorRef Model::all_reduce(ModelTensorRef input, int gpu_id, int gpu_num, + ModelTensorRef output, const std::string &) { + if (!input->is_sequential()) { + LOG(WARN, + "all_reduce may not work correctly if the input tensor is " + "not contiguous"); + } + ModelTensorRef prev_recv; + ModelTensorRef cumulate = input; + for (int i = 1; i < gpu_num; i++) { + int gpu_dst = (gpu_id + i) % gpu_num; + int gpu_src = (gpu_id + gpu_num - i) % gpu_num; + ModelTensorRef send_data; + if (prev_recv) { + send_data = this->identity(input, {prev_recv}); + } else { + send_data = input; + } + send_data = this->send(send_data, gpu_id, gpu_dst); + ModelTensorRef send_done_tensor = + this->send_done(send_data, gpu_id, gpu_dst); + ModelTensorRef recv_buf = + this->tensor(input->shape(), input->data_type()); + recv_buf = this->identity(recv_buf, {send_done_tensor}); + ModelTensorRef recv = this->recv(gpu_src, gpu_src, 0, recv_buf); + prev_recv = recv; + cumulate = this->add(cumulate, recv); + } + return cumulate; +} + +} // namespace ark diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp new file mode 100644 index 000000000..26420d5ee --- /dev/null +++ b/ark/ops/ops_all_reduce_test.cpp @@ -0,0 +1,70 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include "ark/model.hpp" +#include "logging.h" +#include "model/model_node.hpp" +#include "model/model_op.hpp" +#include "unittest/unittest_utils.h" + +ark::unittest::State test_model_op_all_reduce() { + // OpNode graph (parentheses indicate a OpNode): + // + // +--> (S,SD,R,) --+--> (S,SD,R,) --+ + // | | | + // (S,SD,R,) --+--> (Add,) +--> (Add,) +--> (Add,) + // | ^ | ^ + // | | | | + // +---------------+ +--------------+ + + ark::Model model; + ark::ModelTensorRef input = model.tensor({1}, ark::FP32); + ark::ModelTensorRef output = model.all_reduce(input, 0, 4); + + UNITTEST_TRUE(model.verify()); + + auto compressed = model.compress(); + auto nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 6); + + auto nodes_iter = nodes.begin(); + auto node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "send;send_done;recv;"); + UNITTEST_EQ(node->producers.size(), 0); + UNITTEST_EQ(node->consumers.size(), 2); + + // UNITTEST_EQ(node->consumers[0]->get_name(), "add;"); + UNITTEST_EQ(node->consumers[0]->consumers.size(), 1); + // UNITTEST_EQ((*(node->consumers[0]->consumers.begin()))->get_name(), + // "add_1;"); + + // UNITTEST_EQ(node->consumers[1]->get_name(), + // "send_1;send_done_1;recv_1;"); + UNITTEST_EQ(node->consumers[1]->producers.size(), 1); + UNITTEST_EQ(node->consumers[1]->consumers.size(), 2); + + node = node->consumers[1]; + + // UNITTEST_EQ(node->consumers[0]->get_name(), "add_1;"); + UNITTEST_EQ(node->consumers[0]->producers.size(), 2); + UNITTEST_EQ(node->consumers[0]->consumers.size(), 1); + // UNITTEST_EQ((*(node->consumers[0]->consumers.begin()))->get_name(), + // "add_2;"); + + // UNITTEST_EQ(node->consumers[1]->get_name(), + // "send_2;send_done_2;recv_2;"); + UNITTEST_EQ(node->consumers[1]->producers.size(), 1); + UNITTEST_EQ(node->consumers[1]->consumers.size(), 1); + // UNITTEST_EQ((*(node->consumers[1]->consumers.begin()))->get_name(), + // "add_2;"); + UNITTEST_EQ( + (*(node->consumers[1]->consumers.begin()))->ops[0]->result_tensors()[0], + output); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_model_op_all_reduce); + return 0; +} diff --git a/ark/ops/ops_arithmetic.cpp b/ark/ops/ops_arithmetic.cpp new file mode 100644 index 000000000..992b60128 --- /dev/null +++ b/ark/ops/ops_arithmetic.cpp @@ -0,0 +1,74 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include "ops_arithmetic.hpp" + +#include "ops_common.hpp" + +namespace ark { + +ModelOpArithmetic::ModelOpArithmetic(const std::string &type_name, + ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output) + : ModelOp(type_name) { + check_match_data_type(input, other); + if (output) { + check_match_data_type(input, output); + } + Dims output_shape = broadcast_shape(input->shape(), other->shape()); + if (output) { + check_shape(output, output_shape); + } else { + output = std::make_shared( + input->data_type(), std::make_shared(), output_shape); + } + ModelTensorRef result = std::make_shared(*output); + + read_tensors_ = {input, other}; + write_tensors_ = {output}; + result_tensors_ = {result}; + + verify(); +} + +ModelOpAdd::ModelOpAdd(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output) + : ModelOpArithmetic("Add", input, other, output) {} + +ModelTensorRef Model::add(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output, const std::string &name) { + return impl_->create_op(name, input, other, output) + ->result_tensors()[0]; +} + +ModelOpMul::ModelOpMul(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output) + : ModelOpArithmetic("Mul", input, other, output) {} + +ModelTensorRef Model::mul(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output, const std::string &name) { + return impl_->create_op(name, input, other, output) + ->result_tensors()[0]; +} + +ModelOpSub::ModelOpSub(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output) + : ModelOpArithmetic("Sub", input, other, output) {} + +ModelTensorRef Model::sub(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output, const std::string &name) { + return impl_->create_op(name, input, other, output) + ->result_tensors()[0]; +} + +ModelOpDiv::ModelOpDiv(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output) + : ModelOpArithmetic("Div", input, other, output) {} + +ModelTensorRef Model::div(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output, const std::string &name) { + return impl_->create_op(name, input, other, output) + ->result_tensors()[0]; +} + +} // namespace ark diff --git a/ark/ops/ops_arithmetic.hpp b/ark/ops/ops_arithmetic.hpp new file mode 100644 index 000000000..ea5886381 --- /dev/null +++ b/ark/ops/ops_arithmetic.hpp @@ -0,0 +1,50 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#ifndef ARK_OPS_ARITHMETIC_HPP_ +#define ARK_OPS_ARITHMETIC_HPP_ + +#include "ark/dims.hpp" +#include "ark/model.hpp" +#include "model/model_op.hpp" + +namespace ark { + +class ModelOpArithmetic : public ModelOp { + public: + ModelOpArithmetic() = default; + ModelOpArithmetic(const std::string &type_name, ModelTensorRef input, + ModelTensorRef other, ModelTensorRef output); +}; + +class ModelOpAdd : public ModelOpArithmetic { + public: + ModelOpAdd() = default; + ModelOpAdd(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output); +}; + +class ModelOpMul : public ModelOpArithmetic { + public: + ModelOpMul() = default; + ModelOpMul(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output); +}; + +class ModelOpSub : public ModelOpArithmetic { + public: + ModelOpSub() = default; + ModelOpSub(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output); +}; + +class ModelOpDiv : public ModelOpArithmetic { + public: + ModelOpDiv() = default; + ModelOpDiv(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output); +}; + +} // namespace ark + +#endif // ARK_OPS_ARITHMETIC_HPP_ diff --git a/ark/ops/ops_common.cpp b/ark/ops/ops_common.cpp new file mode 100644 index 000000000..6cc269aae --- /dev/null +++ b/ark/ops/ops_common.cpp @@ -0,0 +1,57 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ops_common.hpp" + +#include +#include +#include + +#include "logging.h" + +namespace ark { + +void check_match_data_type(ModelTensorRef a, ModelTensorRef b) { + if (a->data_type() != b->data_type()) { + ERR(InvalidUsageError, + "data types mismatch: ", a->data_type()->type_name(), ", ", + b->data_type()->type_name()); + } +} + +void check_match_shape(ModelTensorRef a, ModelTensorRef b) { + if (a->shape() != b->shape()) { + ERR(InvalidUsageError, "shapes mismatch: ", a->shape(), ", ", + b->shape()); + } +} + +void check_shape(ModelTensorRef tensor, const Dims &shape) { + if (tensor->shape() != shape) { + ERR(InvalidUsageError, "shape mismatch: ", tensor->shape(), " and ", + shape); + } +} + +Dims broadcast_shape(const Dims &dims1, const Dims &dims2) { + std::vector output_dims_reversed; + int ndims = std::max(dims1.ndims(), dims2.ndims()); + for (int i = 1; i < ndims + 1; ++i) { + int d1 = (i - 1 < dims1.ndims()) ? dims1[-i] : 1; + int d2 = (i - 1 < dims2.ndims()) ? dims2[-i] : 1; + if (d1 == d2) { + output_dims_reversed.push_back(d1); + } else if (d1 == 1) { + output_dims_reversed.push_back(d2); + } else if (d2 == 1) { + output_dims_reversed.push_back(d1); + } else { + ERR(InvalidUsageError, + "input and other cannot be broadcasted: ", dims1, ", ", dims2); + } + } + std::reverse(output_dims_reversed.begin(), output_dims_reversed.end()); + return Dims{output_dims_reversed}; +} + +} // namespace ark diff --git a/ark/ops/ops_common.hpp b/ark/ops/ops_common.hpp new file mode 100644 index 000000000..b6b9c6966 --- /dev/null +++ b/ark/ops/ops_common.hpp @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
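+//
+// Broadcasting examples for broadcast_shape() (illustrative):
+//   broadcast_shape({256, 1}, {64}) -> {256, 64}
+//   broadcast_shape({256, 2}, {3})  -> error: 2 vs 3 cannot broadcast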
+
+#ifndef ARK_OPS_COMMON_HPP_
+#define ARK_OPS_COMMON_HPP_
+
+#include
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "logging.h"
+#include "model/model_graph_impl.hpp"
+#include "model/model_op.hpp"
+#include "model/model_tensor.hpp"
+
+namespace ark {
+
+void check_match_data_type(ModelTensorRef a, ModelTensorRef b);
+
+void check_match_shape(ModelTensorRef a, ModelTensorRef b);
+
+void check_shape(ModelTensorRef tensor, const Dims &shape);
+
+/// Return the output shape of broadcasting between two shapes.
+/// Follows NumPy rules.
+/// https://numpy.org/doc/stable/user/basics.broadcasting.html
+/// @param dims1 The first shape.
+/// @param dims2 The second shape.
+Dims broadcast_shape(const Dims &dims1, const Dims &dims2);
+
+} // namespace ark
+
+#endif // ARK_OPS_COMMON_HPP_
diff --git a/ark/ops/ops_identity.cpp b/ark/ops/ops_identity.cpp
new file mode 100644
index 000000000..8871316b3
--- /dev/null
+++ b/ark/ops/ops_identity.cpp
@@ -0,0 +1,36 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ops_identity.hpp"
+
+#include <set>
+
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpIdentity::ModelOpIdentity(ModelTensorRef input,
+                                 const std::vector<ModelTensorRef> &deps)
+    : ModelOpTensor(input->buffer(), input->shape(), input->data_type(),
+                    input->strides(), input->offsets(), input->pads(),
+                    input->exported(), input->imported_rank()) {
+    std::set<ModelTensorRef> dep_set;
+    dep_set.emplace(input);
+    read_tensors_.emplace_back(input);
+    for (auto &dep : deps) {
+        if (dep_set.emplace(dep).second) {
+            read_tensors_.emplace_back(dep);
+        }
+    }
+
+    verify();
+}
+
+ModelTensorRef Model::identity(ModelTensorRef input,
+                               const std::vector<ModelTensorRef> &deps,
+                               const std::string &name) {
+    return impl_->create_op<ModelOpIdentity>(name, input, deps)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_identity.hpp b/ark/ops/ops_identity.hpp
new file mode 100644
index 000000000..3bef04623
--- /dev/null
+++ b/ark/ops/ops_identity.hpp
@@ -0,0 +1,23 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_OPS_IDENTITY_HPP_
+#define ARK_OPS_IDENTITY_HPP_
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+#include "ops_tensor.hpp"
+
+namespace ark {
+
+class ModelOpIdentity : public ModelOpTensor {
+   public:
+    ModelOpIdentity() = default;
+    ModelOpIdentity(ModelTensorRef input,
+                    const std::vector<ModelTensorRef> &deps);
+};
+
+} // namespace ark
+
+#endif // ARK_OPS_IDENTITY_HPP_
diff --git a/ark/ops/ops_identity_test.cpp b/ark/ops/ops_identity_test.cpp
new file mode 100644
index 000000000..033092db8
--- /dev/null
+++ b/ark/ops/ops_identity_test.cpp
@@ -0,0 +1,57 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
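+
+// What this test exercises, in short (a sketch; tensor names are local to
+// the test below): identity() aliases its input's buffer and records extra
+// read dependencies, so the final Relu node gains two producer edges even
+// though it only reads t2's data:
+//
+//   auto t3 = model.identity(t2, {r0, r1});  // t3 aliases t2's buffer
+//   auto t4 = model.relu(t3);                // ordered after r0 and r1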
+
+#include "ark/model.hpp"
+#include "logging.h"
+#include "model/model_node.hpp"
+#include "model/model_op.hpp"
+#include "model/model_tensor.hpp"
+#include "unittest/unittest_utils.h"
+
+ark::unittest::State test_model_op_identity() {
+    // OpNode graph (parentheses indicate an OpNode):
+    //
+    //   (Relu,) --+
+    //             |
+    //   (Relu,) --+--> (Relu,)
+    //
+
+    ark::Model model;
+    ark::ModelTensorRef t0 = model.tensor({1}, ark::FP32);
+    ark::ModelTensorRef t1 = model.tensor({1}, ark::FP32);
+    ark::ModelTensorRef t2 = model.tensor({1}, ark::FP32);
+
+    ark::ModelTensorRef r0 = model.relu(t0);
+    ark::ModelTensorRef r1 = model.relu(t1);
+    ark::ModelTensorRef t3 = model.identity(t2, {r0, r1});
+
+    ark::ModelTensorRef t4 = model.relu(t3);
+    UNITTEST_TRUE(model.verify());
+
+    auto compressed = model.compress();
+    auto nodes = compressed.nodes();
+    UNITTEST_EQ(nodes.size(), 3);
+
+    auto nodes_iter = nodes.begin();
+    auto node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], r0);
+    UNITTEST_EQ(node->producers.size(), 0);
+    UNITTEST_EQ(node->consumers.size(), 1);
+
+    node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], r1);
+    UNITTEST_EQ(node->producers.size(), 0);
+    UNITTEST_EQ(node->consumers.size(), 1);
+
+    node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], t4);
+    UNITTEST_EQ(node->producers.size(), 2);
+    UNITTEST_EQ(node->consumers.size(), 0);
+
+    return ark::unittest::SUCCESS;
+}
+
+int main() {
+    UNITTEST(test_model_op_identity);
+    return 0;
+}
diff --git a/ark/ops/ops_math.cpp b/ark/ops/ops_math.cpp
new file mode 100644
index 000000000..7340a8308
--- /dev/null
+++ b/ark/ops/ops_math.cpp
@@ -0,0 +1,48 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ops_math.hpp"
+
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpMath::ModelOpMath(const std::string &type_name, ModelTensorRef input,
+                         ModelTensorRef output)
+    : ModelOp(type_name) {
+    if (output) {
+        check_match_data_type(input, output);
+        check_match_shape(input, output);
+    } else {
+        output = std::make_shared<ModelTensor>(input->data_type(),
+                                               std::make_shared<ModelBuffer>(),
+                                               input->shape());
+    }
+    ModelTensorRef result = std::make_shared<ModelTensor>(*output);
+
+    read_tensors_ = {input};
+    write_tensors_ = {output};
+    result_tensors_ = {result};
+
+    verify();
+}
+
+ModelOpExp::ModelOpExp(ModelTensorRef input, ModelTensorRef output)
+    : ModelOpMath("Exp", input, output) {}
+
+ModelTensorRef Model::exp(ModelTensorRef input, ModelTensorRef output,
+                          const std::string &name) {
+    return impl_->create_op<ModelOpExp>(name, input, output)
+        ->result_tensors()[0];
+}
+
+ModelOpRelu::ModelOpRelu(ModelTensorRef input, ModelTensorRef output)
+    : ModelOpMath("Relu", input, output) {}
+
+ModelTensorRef Model::relu(ModelTensorRef input, ModelTensorRef output,
+                           const std::string &name) {
+    return impl_->create_op<ModelOpRelu>(name, input, output)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_math.hpp b/ark/ops/ops_math.hpp
new file mode 100644
index 000000000..7f80b6e53
--- /dev/null
+++ b/ark/ops/ops_math.hpp
@@ -0,0 +1,34 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
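+
+// Usage sketch (illustrative shapes): unlike the arithmetic ops, the unary
+// math ops do not broadcast; a caller-supplied output must match the input
+// in both data type and shape:
+//
+//   auto t = m.tensor({64, 64}, ark::FP32);
+//   auto u = m.exp(t);       // allocates a new {64, 64} FP32 output
+//   auto o = m.tensor({64, 64}, ark::FP32);
+//   auto v = m.relu(t, o);   // writes into o; shape/dtype are checked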
+ +#ifndef ARK_OPS_MATH_HPP_ +#define ARK_OPS_MATH_HPP_ + +#include "ark/dims.hpp" +#include "ark/model.hpp" +#include "model/model_op.hpp" + +namespace ark { + +class ModelOpMath : public ModelOp { + public: + ModelOpMath() = default; + ModelOpMath(const std::string &type_name, ModelTensorRef input, + ModelTensorRef output); +}; + +class ModelOpExp : public ModelOpMath { + public: + ModelOpExp() = default; + ModelOpExp(ModelTensorRef input, ModelTensorRef output); +}; + +class ModelOpRelu : public ModelOpMath { + public: + ModelOpRelu() = default; + ModelOpRelu(ModelTensorRef input, ModelTensorRef output); +}; + +} // namespace ark + +#endif // ARK_OPS_MATH_HPP_ diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp new file mode 100644 index 000000000..baeed3fac --- /dev/null +++ b/ark/ops/ops_matmul.cpp @@ -0,0 +1,143 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ops_matmul.hpp" + +#include "ops_common.hpp" + +namespace ark { + +ModelOpMatmul::ModelOpMatmul(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output, bool trans_input, + bool trans_other) + : ModelOp("Matmul") { + // Shape verification. + const Dims &shp_a = input->shape(); + const Dims &shp_b = other->shape(); + int ndims_a = shp_a.ndims(); + int ndims_b = shp_b.ndims(); + + if (ndims_a < 1) { + ERR(InvalidUsageError, "input has an empty shape: ", shp_a); + } + if (ndims_b < 1) { + ERR(InvalidUsageError, "other has an empty shape: ", shp_b); + } + + // m: the number of rows of output matrix (row-major) + // n: the number of columns of output matrix (row-major) + // k: the inner dimension of matrix multiplication + DimType m; + DimType n; + DimType k; + DimType k2; + + m = (ndims_a == 1) ? 1 : shp_a[ndims_a - 2]; + k = shp_a[ndims_a - 1]; + if (trans_input) { + DimType tmp = m; + m = k; + k = tmp; + } + n = (ndims_b == 1) ? 1 : shp_b[ndims_b - 1]; + k2 = (ndims_b == 1) ? shp_b[0] : shp_b[ndims_b - 2]; + if (trans_other) { + DimType tmp = n; + n = k2; + k2 = tmp; + } + if (k != k2) { + ERR(InvalidUsageError, "inner dimensions mismatch: ", k, " and ", k2); + } + + check_match_data_type(input, other); + if (output) { + check_match_data_type(input, output); + } + + // N and C dimensions of matrix A + Dims nca{1, 1}; + if (ndims_a == 4) { + nca[0] = shp_a[0]; + nca[1] = shp_a[1]; + } else if (ndims_a == 3) { + nca[1] = shp_a[0]; + } + + // N and C dimensions of matrix B + Dims ncb{1, 1}; + if (ndims_b == 4) { + ncb[0] = shp_b[0]; + ncb[1] = shp_b[1]; + } else if (ndims_b == 3) { + ncb[1] = shp_b[0]; + } + + // Verify broadcasting + if (nca[0] != ncb[0] && nca[0] != 1 && ncb[0] != 1) { + ERR(InvalidUsageError, "N dimension mismatch: ", nca[0], " and ", + ncb[0]); + } + if (nca[1] != ncb[1] && nca[1] != 1 && ncb[1] != 1) { + ERR(InvalidUsageError, "C dimension mismatch: ", nca[1], " and ", + ncb[1]); + } + + // N and C dimension of output matrix + Dims ncc{std::max(nca[0], ncb[0]), std::max(nca[1], ncb[1])}; + + Dims output_shape; + if (std::max(ndims_a, ndims_b) == 4) { + output_shape = Dims{ncc[0], ncc[1], m, n}; + } else if (std::max(ndims_a, ndims_b) == 3) { + output_shape = Dims{ncc[1], m, n}; + } else { + output_shape = Dims{m, n}; + } + + // Create an output Tensor. 
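+    // Worked example (values are illustrative): input {2, 8, 64, 32} times
+    // other {32, 16} without transposes gives m = 64, n = 16, k = 32,
+    // nca = {2, 8}, ncb = {1, 1}, hence ncc = {2, 8} and an output shape
+    // of {2, 8, 64, 16}.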
+    if (output) {
+        check_shape(output, output_shape);
+    } else {
+        output = std::make_shared<ModelTensor>(
+            input->data_type(), std::make_shared<ModelBuffer>(), output_shape);
+    }
+    ModelTensorRef result = std::make_shared<ModelTensor>(*output);
+
+    const Dims &strides_a = input->strides();
+    const Dims &strides_b = other->strides();
+    const Dims &strides_y = output->strides();
+    // NOTE: `strides_mnk` here is just an expected value. We can
+    // calculate the exact value only after a specific implementation is
+    // determined.
+    Dims strides_mnk{
+        trans_input ? strides_a[ndims_a - 2] : strides_a[ndims_a - 1],
+        strides_y[strides_y.ndims() - 1], strides_y[strides_y.ndims() - 1],
+        trans_other ? strides_b[ndims_b - 2] : strides_b[ndims_b - 1]};
+
+    // a.k.a. problem size
+    Dims shapes_mnk{m, n, k};
+
+    read_tensors_ = {input, other};
+    write_tensors_ = {output};
+    result_tensors_ = {result};
+    args_["InputDimNC"] = nca;
+    args_["OtherDimNC"] = ncb;
+    args_["ShapesMNK"] = shapes_mnk;
+    args_["StridesMNK"] = strides_mnk;
+    args_["IsInputColumnMajor"] = trans_input;
+    args_["IsOtherColumnMajor"] = trans_other;
+
+    verify();
+}
+
+ModelTensorRef Model::matmul(ModelTensorRef input, ModelTensorRef other,
+                             ModelTensorRef output, bool trans_input,
+                             bool trans_other, const std::string &name) {
+    return impl_
+        ->create_op<ModelOpMatmul>(name, input, other, output, trans_input,
+                                   trans_other)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_matmul.hpp b/ark/ops/ops_matmul.hpp
new file mode 100644
index 000000000..f8b98ae56
--- /dev/null
+++ b/ark/ops/ops_matmul.hpp
@@ -0,0 +1,22 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_OPS_MATMUL_HPP_
+#define ARK_OPS_MATMUL_HPP_
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+
+namespace ark {
+
+class ModelOpMatmul : public ModelOp {
+   public:
+    ModelOpMatmul() = default;
+    ModelOpMatmul(ModelTensorRef input, ModelTensorRef other,
+                  ModelTensorRef output, bool trans_input, bool trans_other);
+};
+
+} // namespace ark
+
+#endif // ARK_OPS_MATMUL_HPP_
diff --git a/ark/ops/ops_matmul_test.cpp b/ark/ops/ops_matmul_test.cpp
new file mode 100644
index 000000000..c78e1ed63
--- /dev/null
+++ b/ark/ops/ops_matmul_test.cpp
@@ -0,0 +1,76 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include
+
+#include "ark/model.hpp"
+#include "logging.h"
+#include "model/model_node.hpp"
+#include "model/model_op.hpp"
+#include "unittest/unittest_utils.h"
+
+ark::unittest::State test_model_op_matmul() {
+    // Hidden dimension of the dense layer.
+    unsigned int units = 1024;
+    // Input dimension of the dense layer.
+    unsigned int in_dim = 1024;
+    // Extra dimension of the input. CHANNEL=1 for 2D inputs.
+    unsigned int channel = 128;
+    // Batch size of the input.
+    unsigned int batch_size = 1;
+
+    ark::Model m;
+    ark::ModelTensorRef input =
+        m.tensor({batch_size, channel, in_dim}, ark::FP16);
+    ark::ModelTensorRef weight = m.tensor({in_dim, units}, ark::FP16);
+    m.matmul(input, weight);
+
+    UNITTEST_TRUE(m.verify());
+    auto compressed = m.compress();
+    UNITTEST_TRUE(compressed.verify());
+
+    return ark::unittest::SUCCESS;
+}
+
+// ark::unittest::State test_model_op_split_matmul() {
+//     // OpNode graph (parentheses indicate an OpNode):
+//     //
+//     //   (Matmul,) --+
+//     //               |
+//     //   (Matmul,) --+--> (Reduce,)
+//     //
+
+//     ark::Model model;
+//     ark::ModelTensorRef t0 = model.tensor({64, 128}, ark::FP16);
+//     ark::ModelTensorRef t1 = model.tensor({128, 64}, ark::FP16);
+//     model.matmul(t0, t1, nullptr, 2, false, false, "matmul", 3);
+//     UNITTEST_TRUE(model.verify());
+
+//     auto compressed = model.compress();
+//     auto nodes = compressed.nodes();
+//     UNITTEST_EQ(nodes.size(), 3);
+
+//     auto nodes_iter = nodes.begin();
+//     auto node = (nodes_iter++)->get();
+//     // UNITTEST_EQ(node->ops[0]->name, "matmul/matmul_shard_0");
+//     UNITTEST_EQ(node->producers.size(), 0);
+//     UNITTEST_EQ(node->consumers.size(), 1);
+
+//     node = (nodes_iter++)->get();
+//     // UNITTEST_EQ(node->ops[0]->name, "matmul/matmul_shard_1");
+//     UNITTEST_EQ(node->producers.size(), 0);
+//     UNITTEST_EQ(node->consumers.size(), 1);
+
+//     node = (nodes_iter++)->get();
+//     // UNITTEST_EQ(node->ops[0]->name, "matmul/reduce_sum");
+//     UNITTEST_EQ(node->producers.size(), 2);
+//     UNITTEST_EQ(node->consumers.size(), 0);
+
+//     return ark::unittest::SUCCESS;
+// }
+
+int main() {
+    UNITTEST(test_model_op_matmul);
+    // UNITTEST(test_model_op_split_matmul);
+    return 0;
+}
diff --git a/ark/ops/ops_refer.cpp b/ark/ops/ops_refer.cpp
new file mode 100644
index 000000000..fca1f8566
--- /dev/null
+++ b/ark/ops/ops_refer.cpp
@@ -0,0 +1,29 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ops_refer.hpp"
+
+#include
+
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpRefer::ModelOpRefer(ModelTensorRef input, const Dims &shape,
+                           const Dims &strides, const Dims &offsets,
+                           const Dims &pads)
+    : ModelOpTensor(input->buffer(), shape, input->data_type(), strides,
+                    offsets, pads, input->exported(), input->imported_rank()) {
+    read_tensors_ = {input};
+    verify();
+}
+
+ModelTensorRef Model::refer(ModelTensorRef input, const Dims &shape,
+                            const Dims &strides, const Dims &offsets,
+                            const Dims &pads, const std::string &name) {
+    return impl_
+        ->create_op<ModelOpRefer>(name, input, shape, strides, offsets, pads)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_refer.hpp b/ark/ops/ops_refer.hpp
new file mode 100644
index 000000000..84d6ae362
--- /dev/null
+++ b/ark/ops/ops_refer.hpp
@@ -0,0 +1,23 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_OPS_REFER_HPP_
+#define ARK_OPS_REFER_HPP_
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+#include "ops_tensor.hpp"
+
+namespace ark {
+
+class ModelOpRefer : public ModelOpTensor {
+   public:
+    ModelOpRefer() = default;
+    ModelOpRefer(ModelTensorRef input, const Dims &shape, const Dims &strides,
+                 const Dims &offsets, const Dims &pads);
+};
+
+} // namespace ark
+
+#endif // ARK_OPS_REFER_HPP_
diff --git a/ark/ops/ops_scale.cpp b/ark/ops/ops_scale.cpp
new file mode 100644
index 000000000..cf4d29566
--- /dev/null
+++ b/ark/ops/ops_scale.cpp
@@ -0,0 +1,69 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
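+
+// Sketch of the generated kernel name (config values are hypothetical):
+// with config = {"NumWarps": 4, "Tile": [64, 64]}, impl_name() below emits
+// roughly
+//   scale<InStrides4D, InShape4D, OutStrides4D, OutShape4D, {1,1,64,64}, 4, 0>
+// where the Dims arguments are the 4-D forms of the tensors' strides and
+// shapes, and the trailing 0 is assumed to be the static SMEM byte count.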
+
+#include "ops_scale.hpp"
+
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpScale::ModelOpScale(ModelTensorRef input, float val,
+                           ModelTensorRef output)
+    : ModelOp("Scale") {
+    if (output) {
+        check_match_data_type(input, output);
+        check_match_shape(input, output);
+    } else {
+        output = std::make_shared<ModelTensor>(input->data_type(),
+                                               std::make_shared<ModelBuffer>(),
+                                               input->shape());
+    }
+    ModelTensorRef result = std::make_shared<ModelTensor>(*output);
+
+    read_tensors_ = {input};
+    write_tensors_ = {output};
+    result_tensors_ = {result};
+    args_ = {{"Factor", val}};
+
+    verify();
+}
+
+std::string ModelOpScale::impl_name(const nlohmann::json &config) const {
+    if (!config.contains("NumWarps")) {
+        ERR(InvalidUsageError, "NumWarps is required for Scale");
+    } else if (!config.contains("Tile")) {
+        ERR(InvalidUsageError, "Tile is required for Scale");
+    }
+    int num_warps = config["NumWarps"];
+    auto &tile_shape = config["Tile"];
+    Dims unit_out_dims{tile_shape[0], tile_shape[1]};
+
+    std::vector<std::string> template_args;
+    template_args.emplace_back(vec_string(read_tensors_[0]->strides().dims4()));
+    template_args.emplace_back(vec_string(read_tensors_[0]->shape().dims4()));
+    template_args.emplace_back(
+        vec_string(write_tensors_[0]->strides().dims4()));
+    template_args.emplace_back(vec_string(write_tensors_[0]->shape().dims4()));
+    template_args.emplace_back(vec_string(unit_out_dims.dims4()));
+    template_args.emplace_back(std::to_string(num_warps));
+    template_args.emplace_back(std::to_string(0));
+    return function_name_string("scale", template_args);
+}
+
+std::vector<ModelOpArg> ModelOpScale::impl_args(
+    [[maybe_unused]] const nlohmann::json &config) const {
+    float factor = args_.at("Factor").value<float>();
+    std::vector<ModelOpArg> args;
+    args.emplace_back(result_tensors_[0]);
+    args.emplace_back(read_tensors_[0]);
+    args.emplace_back(factor);
+    return args;
+}
+
+ModelTensorRef Model::scale(ModelTensorRef input, float val,
+                            ModelTensorRef output, const std::string &name) {
+    return impl_->create_op<ModelOpScale>(name, input, val, output)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_scale.hpp b/ark/ops/ops_scale.hpp
new file mode 100644
index 000000000..937028b14
--- /dev/null
+++ b/ark/ops/ops_scale.hpp
@@ -0,0 +1,26 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_OPS_SCALE_HPP_
+#define ARK_OPS_SCALE_HPP_
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+
+namespace ark {
+
+class ModelOpScale : public ModelOp {
+   public:
+    ModelOpScale() = default;
+    ModelOpScale(ModelTensorRef input, float val, ModelTensorRef output);
+
+    std::string impl_name(const nlohmann::json &config) const override;
+
+    std::vector<ModelOpArg> impl_args(
+        [[maybe_unused]] const nlohmann::json &config) const override;
+};
+
+} // namespace ark
+
+#endif // ARK_OPS_SCALE_HPP_
diff --git a/ark/ops/ops_sendrecv.cpp b/ark/ops/ops_sendrecv.cpp
new file mode 100644
index 000000000..4cff2d818
--- /dev/null
+++ b/ark/ops/ops_sendrecv.cpp
@@ -0,0 +1,100 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
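+
+// Usage sketch (ranks, sid, and sizes are illustrative; the defaulted
+// `bytes`/`name` arguments are assumptions): a one-directional transfer
+// pairs send()/send_done() on the sender with a recv() of the same sid on
+// the receiver; send() marks its input exported, and recv() with no output
+// tensor allocates a BYTE buffer of `bytes`:
+//
+//   // on rank 0:
+//   auto t = m.tensor({1024}, ark::FP32);
+//   m.send(t, /*sid=*/0, /*dst_rank=*/1);
+//   m.send_done(t, /*sid=*/0, /*dst_rank=*/1);
+//   // on rank 1:
+//   auto r = m.recv(/*sid=*/0, /*src_rank=*/0, /*bytes=*/4096);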
+
+#include "ops_sendrecv.hpp"
+
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpSend::ModelOpSend(ModelTensorRef input, int sid, int rank, int dst_rank,
+                         DimType bytes)
+    : ModelOp("Send") {
+    DimType max_bytes = input->strides().size() * input->data_type()->bytes();
+    if (max_bytes < bytes) {
+        LOG(ERROR, "invalid bytes: ", bytes, ", max: ", max_bytes);
+    }
+    if (bytes == 0) {
+        bytes = max_bytes;
+    }
+    input->set_exported();
+
+    ModelTensorRef recvbuf = std::make_shared<ModelTensor>(
+        input->data_type(), std::make_shared<ModelBuffer>(), input->shape());
+    recvbuf->set_imported_rank(dst_rank);
+
+    ModelTensorRef result = std::make_shared<ModelTensor>(*recvbuf);
+
+    read_tensors_ = {input};
+    write_tensors_ = {recvbuf};
+    result_tensors_ = {result};
+    args_["Rank"] = rank;
+    args_["DstRank"] = dst_rank;
+    args_["Bytes"] = bytes;
+    args_["Sid"] = sid;
+
+    verify();
+}
+
+ModelTensorRef Model::send(ModelTensorRef input, int sid, int dst_rank,
+                           DimType bytes, const std::string &name) {
+    return impl_
+        ->create_op<ModelOpSend>(name, input, sid, rank_, dst_rank, bytes)
+        ->result_tensors()[0];
+}
+
+ModelOpSendDone::ModelOpSendDone(ModelTensorRef input, int rank, int dst_rank)
+    : ModelOp("SendDone") {
+    ModelTensorRef result = std::make_shared<ModelTensor>(*input);
+    read_tensors_ = {};
+    write_tensors_ = {input};
+    result_tensors_ = {result};
+    args_["Rank"] = rank;
+    args_["DstRank"] = dst_rank;
+
+    verify();
+}
+
+ModelTensorRef Model::send_done(ModelTensorRef input, int, int dst_rank,
+                                const std::string &name) {
+    return impl_->create_op<ModelOpSendDone>(name, input, rank_, dst_rank)
+        ->result_tensors()[0];
+}
+
+ModelOpRecv::ModelOpRecv(ModelTensorRef output, int, int rank, int src_rank,
+                         DimType bytes)
+    : ModelOp("Recv") {
+    if (output == nullptr) {
+        if (bytes == 0) {
+            LOG(ERROR, "receive bytes cannot be 0");
+        }
+        output = std::make_shared<ModelTensor>(
+            BYTE, std::make_shared<ModelBuffer>(), Dims{bytes});
+    }
+    output->set_exported();
+    DimType max_bytes = output->shape().size() * output->data_type()->bytes();
+    if (max_bytes < bytes) {
+        LOG(ERROR, "invalid bytes: ", bytes, ", max: ", max_bytes);
+    }
+    if (bytes == 0) {
+        bytes = max_bytes;
+    }
+    ModelTensorRef result = std::make_shared<ModelTensor>(*output);
+
+    read_tensors_ = {};
+    write_tensors_ = {output};
+    result_tensors_ = {result};
+    args_["Rank"] = rank;
+    args_["SrcRank"] = src_rank;
+
+    verify();
+}
+
+ModelTensorRef Model::recv(int sid, int src_rank, DimType bytes,
+                           ModelTensorRef output, const std::string &name) {
+    return impl_
+        ->create_op<ModelOpRecv>(name, output, sid, rank_, src_rank, bytes)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_sendrecv.hpp b/ark/ops/ops_sendrecv.hpp
new file mode 100644
index 000000000..5874f99fc
--- /dev/null
+++ b/ark/ops/ops_sendrecv.hpp
@@ -0,0 +1,35 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_OPS_SENDRECV_HPP_
+#define ARK_OPS_SENDRECV_HPP_
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+
+namespace ark {
+
+class ModelOpSend : public ModelOp {
+   public:
+    ModelOpSend() = default;
+    ModelOpSend(ModelTensorRef input, int sid, int rank, int dst_rank,
+                DimType bytes);
+};
+
+class ModelOpSendDone : public ModelOp {
+   public:
+    ModelOpSendDone() = default;
+    ModelOpSendDone(ModelTensorRef input, int rank, int dst_rank);
+};
+
+class ModelOpRecv : public ModelOp {
+   public:
+    ModelOpRecv() = default;
+    ModelOpRecv(ModelTensorRef output, int, int rank, int src_rank,
+                DimType bytes);
+};
+
+} // namespace ark
+
+#endif // ARK_OPS_SENDRECV_HPP_
diff --git a/ark/ops/ops_sharding.cpp b/ark/ops/ops_sharding.cpp
new file mode 100644
index 000000000..83f184555
--- /dev/null
+++ b/ark/ops/ops_sharding.cpp
@@ -0,0 +1,55 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "math_utils.h"
+#include "ops_common.hpp"
+
+namespace ark {
+
+// Shard `input` along `axis` into `dim_per_shard`-dimensional shards.
+std::vector<ModelTensorRef> Model::sharding(ModelTensorRef input, DimType axis,
+                                            DimType dim_per_shard,
+                                            const std::string &name) {
+    if (axis >= DIMS_LEN) {
+        ERR(InvalidUsageError, "invalid axis value: ", axis);
+    }
+    if ((input->shape()[axis] % dim_per_shard) != 0) {
+        // If the total dimension is not divisible by the per-shard size,
+        // we need to check whether we can put a padding here.
+        // If the padded dimension of the input tensor is smaller than
+        // the leading dimension size, it means that the input tensor refers to
+        // a part of a buffer -- in this case, we cannot put a padding because
+        // the tensor has adjacent data.
+        DimType pdim = math::pad(input->shape()[axis], input->pads()[axis]);
+        if (pdim < input->strides()[axis]) {
+            ERR(InvalidUsageError, "the dimension of axis ", axis, " (",
+                input->shape()[axis],
+                ") is not divisible by the dimension per shard (",
+                dim_per_shard, ") and this tensor cannot be padded.");
+        }
+    }
+    std::vector<ModelTensorRef> shards;
+    DimType num_shard = math::div_up(input->shape()[axis], dim_per_shard);
+    Dims shard_shape = input->shape();
+    Dims shard_offs = input->offsets();
+    Dims shard_pads = input->pads();
+    for (DimType i = 0; i < num_shard; ++i) {
+        DimType dim;
+        if (i == (num_shard - 1)) {
+            dim = input->shape()[axis] - (i * dim_per_shard);
+            shard_pads[axis] = input->pads()[axis];
+        } else {
+            dim = dim_per_shard;
+            shard_pads[axis] = 1;
+        }
+        shard_shape[axis] = dim;
+        ModelTensorRef shard =
+            this->refer(input, shard_shape, input->strides(), shard_offs,
+                        shard_pads, name + "/shard_" + std::to_string(i));
+        shards.emplace_back(shard);
+        shard_offs[axis] += dim;
+    }
+    return shards;
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_sharding_test.cpp b/ark/ops/ops_sharding_test.cpp
new file mode 100644
index 000000000..12ea6eca8
--- /dev/null
+++ b/ark/ops/ops_sharding_test.cpp
@@ -0,0 +1,70 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
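+
+// Worked example for sharding() (sizes are illustrative): a {10} tensor
+// split along axis 0 with dim_per_shard = 4 yields three views over the
+// same buffer, with the last shard taking the remainder:
+//
+//   shard 0: shape {4}, offset {0}
+//   shard 1: shape {4}, offset {4}
+//   shard 2: shape {2}, offset {8}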
+
+#include "ark/model.hpp"
+#include "logging.h"
+#include "model/model_node.hpp"
+#include "model/model_op.hpp"
+#include "unittest/unittest_utils.h"
+
+ark::unittest::State test_model_op_sharding() {
+    // OpNode graph (parentheses indicate an OpNode):
+    //
+    //   (Relu,) --+
+    //             |
+    //   (Relu,) --+
+    //             |
+    //   (Relu,) --+--> (Relu,)
+    //
+
+    ark::Model model;
+    ark::ModelTensorRef t0 = model.tensor({3}, ark::FP32);
+
+    std::vector<ark::ModelTensorRef> vec = model.sharding(t0, 0, 1);
+    UNITTEST_EQ(vec.size(), 3);
+
+    ark::ModelTensorRef t1 = vec[0];
+    ark::ModelTensorRef t2 = vec[1];
+    ark::ModelTensorRef t3 = vec[2];
+
+    ark::ModelTensorRef r0 = model.relu(t1);
+    ark::ModelTensorRef r1 = model.relu(t2);
+    ark::ModelTensorRef r2 = model.relu(t3);
+
+    ark::ModelTensorRef t4 = model.identity(t0, {r0, r1, r2});
+
+    ark::ModelTensorRef t5 = model.relu(t4);
+    UNITTEST_TRUE(model.verify());
+
+    auto compressed = model.compress();
+    auto nodes = compressed.nodes();
+    UNITTEST_EQ(nodes.size(), 4);
+
+    auto nodes_iter = nodes.begin();
+    auto node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], r0);
+    UNITTEST_EQ(node->producers.size(), 0);
+    UNITTEST_EQ(node->consumers.size(), 1);
+
+    node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], r1);
+    UNITTEST_EQ(node->producers.size(), 0);
+    UNITTEST_EQ(node->consumers.size(), 1);
+
+    node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], r2);
+    UNITTEST_EQ(node->producers.size(), 0);
+    UNITTEST_EQ(node->consumers.size(), 1);
+
+    node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], t5);
+    UNITTEST_EQ(node->producers.size(), 3);
+    UNITTEST_EQ(node->consumers.size(), 0);
+
+    return ark::unittest::SUCCESS;
+}
+
+int main() {
+    UNITTEST(test_model_op_sharding);
+    return 0;
+}
diff --git a/ark/ops/ops_tensor.cpp b/ark/ops/ops_tensor.cpp
new file mode 100644
index 000000000..6d9abce82
--- /dev/null
+++ b/ark/ops/ops_tensor.cpp
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ops_tensor.hpp"
+
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpTensor::ModelOpTensor(ModelBufferRef buffer, const Dims &shape,
+                             ModelDataType data_type, const Dims &strides,
+                             const Dims &offsets, const Dims &pads,
+                             bool exported, int imported_rank)
+    : ModelOp("Tensor", true) {
+    if (!buffer) {
+        buffer = std::make_shared<ModelBuffer>();
+    }
+
+    ModelTensorRef tensor =
+        std::make_shared<ModelTensor>(data_type, buffer, shape, strides,
+                                      offsets, pads, exported, imported_rank);
+
+    result_tensors_.emplace_back(tensor);
+
+    verify();
+}
+
+ModelTensorRef Model::tensor(const Dims &shape, ModelDataType data_type,
+                             const Dims &strides, const Dims &offsets,
+                             const Dims &pads, bool exported, int imported_rank,
+                             const std::string &name) {
+    return impl_
+        ->create_op<ModelOpTensor>(name, nullptr, shape, data_type, strides,
+                                   offsets, pads, exported, imported_rank)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_tensor.hpp b/ark/ops/ops_tensor.hpp
new file mode 100644
index 000000000..d575430f6
--- /dev/null
+++ b/ark/ops/ops_tensor.hpp
@@ -0,0 +1,24 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
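+
+// Usage sketch (shapes/strides are illustrative; the defaulted trailing
+// arguments are assumptions): ModelOpTensor backs both fresh allocations and
+// views. Model::tensor() with the defaults allocates a new buffer, while
+// Model::refer() reuses an existing tensor's buffer to expose a strided view:
+//
+//   auto whole = m.tensor({4, 8}, ark::FP32);            // new buffer
+//   auto view = m.refer(whole, {4, 4}, {4, 8}, {0, 4});  // right half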
+
+#ifndef ARK_OPS_TENSOR_HPP_
+#define ARK_OPS_TENSOR_HPP_
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+
+namespace ark {
+
+class ModelOpTensor : public ModelOp {
+   public:
+    ModelOpTensor() = default;
+    ModelOpTensor(ModelBufferRef buffer, const Dims &shape,
+                  ModelDataType data_type, const Dims &strides,
+                  const Dims &offsets, const Dims &pads, bool exported,
+                  int imported_rank);
+};
+
+} // namespace ark
+
+#endif // ARK_OPS_TENSOR_HPP_
diff --git a/ark/ops/ops_all_gather.cc b/ark/ops_old/ops_all_gather.cc
similarity index 89%
rename from ark/ops/ops_all_gather.cc
rename to ark/ops_old/ops_all_gather.cc
index dc5202258..ee49123e6 100644
--- a/ark/ops/ops_all_gather.cc
+++ b/ark/ops_old/ops_all_gather.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT license.
 
 #include
+#include
 
 #include "logging.h"
 #include "math_utils.h"
@@ -70,8 +71,13 @@ GatherFromPeersOp::GatherFromPeersOp(const std::string &prec_type,
                                      int rank, int npeers, size_t stride,
                                      const std::string &name)
     : Op(OP_GATHER_FROM_PEERS, prec_type, remote_bufs,
-         {trans_region_local, local_buf}, {{rank, npeers, sid, stride}}, name,
-         &GatherFromPeersConfigMap, -1, true) {}
+         {trans_region_local, local_buf},
+         std::map<std::string, ModelOpArg>(
+             {{"rank", ModelOpArg(rank)},
+              {"npeers", ModelOpArg(npeers)},
+              {"sid", ModelOpArg(sid)},
+              {"stride", ModelOpArg(stride)}}),
+         name, &GatherFromPeersConfigMap, -1, true) {}
 
 std::string GatherFromPeersOp::function_name(const OpConfig &cfg) const {
     Tensor *dst_buff = this->outputs[0];
@@ -80,7 +86,7 @@ std::string GatherFromPeersOp::function_name(const OpConfig &cfg) const {
     int rank;
     int npeers;
     size_t stride;
-    this->args.get(&rank, 0);
+    this->args.find("rank")->second.get_value(&rank);
     this->args.get(&npeers, 1);
     this->args.get(&stride, 3);
 
@@ -100,7 +106,8 @@ std::string GatherFromPeersOp::function_name(const OpConfig &cfg) const {
                               npeers, rank, stride}});
 }
 
-OpArgs GatherFromPeersOp::function_call_args(const OpConfig &) const {
+std::vector<ModelOpArg> GatherFromPeersOp::function_call_args(
+    const OpConfig &) const {
     int rank;
     int npeers;
     this->args.get(&rank, 0);
@@ -112,20 +119,20 @@ OpArgs GatherFromPeersOp::function_call_args(const OpConfig &) const {
 
     CHECK(local_buff->buf != nullptr);
 
-    OpArgs opargs;
+    std::vector<ModelOpArg> opargs;
     // gather_from_peers(dst_offset, src_offset...)
-    opargs.put((size_t)(local_buff->buf->get_buf_offset() +
-                        local_buff->offset_bytes()));
+    opargs.emplace_back((size_t)(local_buff->buf->get_buf_offset() +
+                                 local_buff->offset_bytes()));
     for (int i = 0; i < MAX_PEER_NUM; i++) {
         if (i < npeers) {
             CHECK(remote_bufs[i]->buf != nullptr);
-            opargs.put((size_t)(remote_bufs[i]->buf->get_buf_offset() +
-                                remote_bufs[i]->offset_bytes()));
+            opargs.emplace_back((size_t)(remote_bufs[i]->buf->get_buf_offset() +
+                                         remote_bufs[i]->offset_bytes()));
         } else {
-            opargs.put((size_t)0);
+            opargs.emplace_back((size_t)0);
         }
     }
-    opargs.put(local_buff);
+    opargs.emplace_back(local_buff);
 
     return opargs;
 }
diff --git a/ark/ops/ops_all_gather_test.cc b/ark/ops_old/ops_all_gather_test.cc
similarity index 100%
rename from ark/ops/ops_all_gather_test.cc
rename to ark/ops_old/ops_all_gather_test.cc
diff --git a/ark/ops/ops_all_reduce.cc b/ark/ops_old/ops_all_reduce.cc
similarity index 100%
rename from ark/ops/ops_all_reduce.cc
rename to ark/ops_old/ops_all_reduce.cc
diff --git a/ark/ops/ops_all_reduce_test.cc b/ark/ops_old/ops_all_reduce_test.cc
similarity index 100%
rename from ark/ops/ops_all_reduce_test.cc
rename to ark/ops_old/ops_all_reduce_test.cc
diff --git a/ark/ops/ops_arithmetic.cc b/ark/ops_old/ops_arithmetic.cc
similarity index 100%
rename from ark/ops/ops_arithmetic.cc
rename to ark/ops_old/ops_arithmetic.cc
diff --git a/ark/ops/ops_arithmetic_test.cc b/ark/ops_old/ops_arithmetic_test.cc
similarity index 100%
rename from ark/ops/ops_arithmetic_test.cc
rename to ark/ops_old/ops_arithmetic_test.cc
diff --git a/ark/ops/ops_cast.cc b/ark/ops_old/ops_cast.cc
similarity index 100%
rename from ark/ops/ops_cast.cc
rename to ark/ops_old/ops_cast.cc
diff --git a/ark/ops/ops_cast_test.cc b/ark/ops_old/ops_cast_test.cc
similarity index 100%
rename from ark/ops/ops_cast_test.cc
rename to ark/ops_old/ops_cast_test.cc
diff --git a/ark/ops/ops_common.cc b/ark/ops_old/ops_common.cc
similarity index 100%
rename from ark/ops/ops_common.cc
rename to ark/ops_old/ops_common.cc
diff --git a/ark/ops/ops_common.h b/ark/ops_old/ops_common.h
similarity index 100%
rename from ark/ops/ops_common.h
rename to ark/ops_old/ops_common.h
diff --git a/ark/ops/ops_common_test.cc b/ark/ops_old/ops_common_test.cc
similarity index 100%
rename from ark/ops/ops_common_test.cc
rename to ark/ops_old/ops_common_test.cc
diff --git a/ark/ops/ops_copy.cc b/ark/ops_old/ops_copy.cc
similarity index 100%
rename from ark/ops/ops_copy.cc
rename to ark/ops_old/ops_copy.cc
diff --git a/ark/ops/ops_copy_test.cc b/ark/ops_old/ops_copy_test.cc
similarity index 100%
rename from ark/ops/ops_copy_test.cc
rename to ark/ops_old/ops_copy_test.cc
diff --git a/ark/ops/ops_device_sync.cc b/ark/ops_old/ops_device_sync.cc
similarity index 71%
rename from ark/ops/ops_device_sync.cc
rename to ark/ops_old/ops_device_sync.cc
index 71d808593..2c693199f 100644
--- a/ark/ops/ops_device_sync.cc
+++ b/ark/ops_old/ops_device_sync.cc
@@ -11,16 +11,27 @@ extern const OpConfigMap DeviceSyncConfigMap;
 
 DeviceSyncOp::DeviceSyncOp(const std::string &prec_type, Tensor *input,
                            Tensor *output, int nranks, const std::string &name)
-    : Op{OP_DEVICE_SYNC, prec_type, {input}, {output}, {{nranks}}, name,
-         &DeviceSyncConfigMap, -1, true} {}
+    : Op{OP_DEVICE_SYNC,
+         prec_type,
+         {input},
+         {output},
+         {{"nranks", ModelOpArg(nranks)}},
+         name,
+         &DeviceSyncConfigMap,
+         -1,
+         true} {}
 
 std::string DeviceSyncOp::function_name(const OpConfig &) const {
     int nranks;
-    this->args.get(&nranks, 0);
+    this->args.find("nranks")->second.get_value(&nranks);
     return Op::function_name("ark::comm::device_sync", {{nranks}});
 }
 
-OpArgs DeviceSyncOp::function_call_args(const OpConfig &) const { return {}; }
+std::vector<ModelOpArg> DeviceSyncOp::function_call_args(
+    const OpConfig &) const {
+    return {};
+}
 
 Tensor *Model::device_sync(Tensor *input, int nranks,
                            const std::string &name) {
     DeviceSyncOp op{"none", input, input, nranks, name};
diff --git a/ark/ops/ops_embedding.cc b/ark/ops_old/ops_embedding.cc
similarity index 100%
rename from ark/ops/ops_embedding.cc
rename to ark/ops_old/ops_embedding.cc
diff --git a/ark/ops/ops_embedding_test.cc b/ark/ops_old/ops_embedding_test.cc
similarity index 100%
rename from ark/ops/ops_embedding_test.cc
rename to ark/ops_old/ops_embedding_test.cc
diff --git a/ark/ops/ops_identity.cc b/ark/ops_old/ops_identity.cc
similarity index 100%
rename from ark/ops/ops_identity.cc
rename to ark/ops_old/ops_identity.cc
diff --git a/ark/ops/ops_identity_test.cc b/ark/ops_old/ops_identity_test.cc
similarity index 100%
rename from ark/ops/ops_identity_test.cc
rename to ark/ops_old/ops_identity_test.cc
diff --git a/ark/ops/ops_im2col.cc b/ark/ops_old/ops_im2col.cc
similarity index 100%
rename from ark/ops/ops_im2col.cc
rename to ark/ops_old/ops_im2col.cc
diff --git a/ark/ops/ops_im2col_test.cc b/ark/ops_old/ops_im2col_test.cc
similarity index 100%
rename from ark/ops/ops_im2col_test.cc
rename to ark/ops_old/ops_im2col_test.cc
diff --git a/ark/ops/ops_layernorm.cc b/ark/ops_old/ops_layernorm.cc
similarity index 100%
rename from ark/ops/ops_layernorm.cc
rename to ark/ops_old/ops_layernorm.cc
diff --git a/ark/ops/ops_layernorm_test.cc b/ark/ops_old/ops_layernorm_test.cc
similarity index 100%
rename from ark/ops/ops_layernorm_test.cc
rename to ark/ops_old/ops_layernorm_test.cc
diff --git a/ark/ops/ops_math.cc b/ark/ops_old/ops_math.cc
similarity index 100%
rename from ark/ops/ops_math.cc
rename to ark/ops_old/ops_math.cc
diff --git a/ark/ops/ops_math_test.cc b/ark/ops_old/ops_math_test.cc
similarity index 100%
rename from ark/ops/ops_math_test.cc
rename to ark/ops_old/ops_math_test.cc
diff --git a/ark/ops/ops_matmul.cc b/ark/ops_old/ops_matmul.cc
similarity index 100%
rename from ark/ops/ops_matmul.cc
rename to ark/ops_old/ops_matmul.cc
diff --git a/ark/ops/ops_matmul_test.cu b/ark/ops_old/ops_matmul_test.cu
similarity index 100%
rename from ark/ops/ops_matmul_test.cu
rename to ark/ops_old/ops_matmul_test.cu
diff --git a/ark/ops/ops_max_pool.cc b/ark/ops_old/ops_max_pool.cc
similarity index 100%
rename from ark/ops/ops_max_pool.cc
rename to ark/ops_old/ops_max_pool.cc
diff --git a/ark/ops/ops_packet.cc b/ark/ops_old/ops_packet.cc
similarity index 100%
rename from ark/ops/ops_packet.cc
rename to ark/ops_old/ops_packet.cc
diff --git a/ark/ops/ops_reduce.cc b/ark/ops_old/ops_reduce.cc
similarity index 84%
rename from ark/ops/ops_reduce.cc
rename to ark/ops_old/ops_reduce.cc
index f75cf29e3..5be2e2ce3 100644
--- a/ark/ops/ops_reduce.cc
+++ b/ark/ops_old/ops_reduce.cc
@@ -57,18 +57,18 @@ std::string ReduceOp::function_name(const OpConfig &cfg,
         outshape.insert(axis, 1);
     }
 
-    Dims unit_out_dims{1, 1, tile_out.x, tile_out.y};
-    return Op::function_name("ark::reduce_" + type,
-                             {{
-                                 input->ldims.dims4(),  // InDims
-                                 input->shape.dims4(),  // InShape
-                                 outdims.dims4(),       // OutDims
-                                 outshape.dims4(),      // OutShape
-                                 unit_out_dims,         // UnitOutDims
-                                 cfg.num_warps,         // NumWarps
-                                 cfg.smem_bytes,        // SmemBytes
-                                 axis,                  // Axis
-                             }});
+    return Op::function_name(
+        "ark::reduce_" + type,
+        {{
+            {"InDims", input->ldims.dims4()},
+            {"InShape", input->shape.dims4()},
+ {"OutDims", outdims.dims4()}, + {"OutShape", outshape.dims4()}, + {"UnitOutDims", {1, 1, tile_out.x, tile_out.y}}, + {"NumWarps", cfg.num_warps}, + {"SmemBytes", cfg.smem_bytes}, + {"Axis", axis}, + }}); } extern const OpConfigMap ReduceWConfigMap; @@ -77,8 +77,14 @@ extern const OpConfigMap Broadcast1ConfigMap; ReduceWSumOp::ReduceWSumOp(const std::string &prec_type, Tensor *input, Tensor *output, int axis, bool keepdims, const std::string &name) - : ReduceOp{OP_REDUCE_W_SUM, prec_type, {input}, {output}, - {{axis, keepdims}}, name, &ReduceWConfigMap, -1} {} + : ReduceOp{OP_REDUCE_W_SUM, + prec_type, + {input}, + {output}, + {{{"axis", axis}, {"keepdims", keepdims}}}, + name, + &ReduceWConfigMap, + -1} {} std::string ReduceWSumOp::function_name(const OpConfig &cfg) const { return ReduceOp::function_name(cfg, "w_sum"); @@ -91,7 +97,7 @@ ReduceESumOp::ReduceESumOp(const std::string &prec_type, Tensor *input, prec_type, {input}, {output}, - {{axis, keepdims}}, + {{{"axis", axis}, {"keepdims", keepdims}}}, name, &Broadcast1ConfigMap, -1} {} @@ -103,8 +109,14 @@ std::string ReduceESumOp::function_name(const OpConfig &cfg) const { ReduceWMaxOp::ReduceWMaxOp(const std::string &prec_type, Tensor *input, Tensor *output, int axis, bool keepdims, const std::string &name) - : ReduceOp{OP_REDUCE_W_MAX, prec_type, {input}, {output}, - {{axis, keepdims}}, name, &ReduceWConfigMap, -1} {} + : ReduceOp{OP_REDUCE_W_MAX, + prec_type, + {input}, + {output}, + {{{"axis", axis}, {"keepdims", keepdims}}}, + name, + &ReduceWConfigMap, + -1} {} std::string ReduceWMaxOp::function_name(const OpConfig &cfg) const { return ReduceOp::function_name(cfg, "w_max"); @@ -117,7 +129,7 @@ ReduceEMaxOp::ReduceEMaxOp(const std::string &prec_type, Tensor *input, prec_type, {input}, {output}, - {{axis, keepdims}}, + {{{"axis", axis}, {"keepdims", keepdims}}}, name, &Broadcast1ConfigMap, -1} {} @@ -129,8 +141,14 @@ std::string ReduceEMaxOp::function_name(const OpConfig &cfg) const { ReduceWMeanOp::ReduceWMeanOp(const std::string &prec_type, Tensor *input, Tensor *output, int axis, bool keepdims, const std::string &name) - : ReduceOp{OP_REDUCE_W_MEAN, prec_type, {input}, {output}, - {{axis, keepdims}}, name, &ReduceWConfigMap, -1} {} + : ReduceOp{OP_REDUCE_W_MEAN, + prec_type, + {input}, + {output}, + {{{"axis", axis}, {"keepdims", keepdims}}}, + name, + &ReduceWConfigMap, + -1} {} std::string ReduceWMeanOp::function_name(const OpConfig &cfg) const { return ReduceOp::function_name(cfg, "w_mean"); @@ -143,7 +161,7 @@ ReduceEMeanOp::ReduceEMeanOp(const std::string &prec_type, Tensor *input, prec_type, {input}, {output}, - {{axis, keepdims}}, + {{{"axis", axis}, {"keepdims", keepdims}}}, name, &Broadcast1ConfigMap, -1} {} diff --git a/ark/ops/ops_reduce_scatter.cc b/ark/ops_old/ops_reduce_scatter.cc similarity index 90% rename from ark/ops/ops_reduce_scatter.cc rename to ark/ops_old/ops_reduce_scatter.cc index 59fa10145..a4c32d88d 100644 --- a/ark/ops/ops_reduce_scatter.cc +++ b/ark/ops_old/ops_reduce_scatter.cc @@ -18,7 +18,12 @@ ReadAndReduceOp::ReadAndReduceOp(const std::string &prec_type, int rank, int npeers, size_t offset, size_t bytes, const std::string &name) : Op(OP_READ_AND_REDUCE, prec_type, {local_buf}, - {cal_region_local, local_buf}, {{rank, npeers, sid, offset, bytes}}, + {cal_region_local, local_buf}, + {{{"rank", rank}, + {"npeers", npeers}, + {"sid", sid}, + {"offset", offset}, + {"bytes", bytes}}}, name, &ReadAndReduceConfigMap, -1, true) { this->inputs.insert(this->inputs.end(), remote_bufs.begin(), 
                         remote_bufs.end());
 }
 
@@ -32,10 +37,10 @@ std::string ReadAndReduceOp::function_name(const OpConfig &cfg) const {
     int peer_rank;
     size_t offset;
     size_t bytes;
-    this->args.get(&rank, 0);
-    this->args.get(&peer_rank, 1);
-    this->args.get(&offset, 3);
-    this->args.get(&bytes, 4);
+    this->args.get("rank", &rank);
+    this->args.get("npeers", &peer_rank);
+    this->args.get("offset", &offset);
+    this->args.get("bytes", &bytes);
 
     const OpTile &tile_out = cfg.output_tiles[0];
     size_t neles_per_tile = tile_out.x * tile_out.y > dst_buff->shape.size() ?
@@ -69,15 +74,16 @@ OpArgs ReadAndReduceOp::function_call_args(const OpConfig &) const {
 
     OpArgs opargs;
     // read_and_reduce(src_offset...)
     for (int i = 0; i < get_env().num_ranks_per_host - 1; i++) {
+        auto name = "src_offset_" + std::to_string(i);
         if (i < npeers) {
             CHECK(remote_bufs[i]->buf != nullptr);
-            opargs.put((size_t)(remote_bufs[i]->buf->get_buf_offset() +
-                                remote_bufs[i]->offset_bytes()));
+            opargs.put(name, size_t{remote_bufs[i]->buf->get_buf_offset() +
+                                    remote_bufs[i]->offset_bytes()});
         } else {
-            opargs.put((size_t)0);
+            opargs.put(name, size_t{0});
         }
     }
-    opargs.put(local_buff);
+    opargs.put("src", local_buff);
 
     return opargs;
 }
diff --git a/ark/ops/ops_reduce_scatter_test.cc b/ark/ops_old/ops_reduce_scatter_test.cc
similarity index 100%
rename from ark/ops/ops_reduce_scatter_test.cc
rename to ark/ops_old/ops_reduce_scatter_test.cc
diff --git a/ark/ops/ops_reduce_test.cc b/ark/ops_old/ops_reduce_test.cc
similarity index 100%
rename from ark/ops/ops_reduce_test.cc
rename to ark/ops_old/ops_reduce_test.cc
diff --git a/ark/ops/ops_reshape.cc b/ark/ops_old/ops_reshape.cc
similarity index 100%
rename from ark/ops/ops_reshape.cc
rename to ark/ops_old/ops_reshape.cc
diff --git a/ark/ops/ops_reshape_test.cc b/ark/ops_old/ops_reshape_test.cc
similarity index 100%
rename from ark/ops/ops_reshape_test.cc
rename to ark/ops_old/ops_reshape_test.cc
diff --git a/ark/ops/ops_rope.cc b/ark/ops_old/ops_rope.cc
similarity index 100%
rename from ark/ops/ops_rope.cc
rename to ark/ops_old/ops_rope.cc
diff --git a/ark/ops/ops_rope_test.cc b/ark/ops_old/ops_rope_test.cc
similarity index 100%
rename from ark/ops/ops_rope_test.cc
rename to ark/ops_old/ops_rope_test.cc
diff --git a/ark/ops/ops_scale.cc b/ark/ops_old/ops_scale.cc
similarity index 98%
rename from ark/ops/ops_scale.cc
rename to ark/ops_old/ops_scale.cc
index 4c3a3a956..6d2693ee7 100644
--- a/ark/ops/ops_scale.cc
+++ b/ark/ops_old/ops_scale.cc
@@ -16,7 +16,7 @@ ScaleOp::ScaleOp(const std::string &prec_type, Tensor *input, Tensor *output,
           prec_type,
           {input},
           {output},
-          {{val}},
+          {{{"factor", val}}},
           name,
           &Broadcast1ConfigMap,
           -1,
diff --git a/ark/ops/ops_scale_test.cc b/ark/ops_old/ops_scale_test.cc
similarity index 100%
rename from ark/ops/ops_scale_test.cc
rename to ark/ops_old/ops_scale_test.cc
diff --git a/ark/ops/ops_sendrecv.cc b/ark/ops_old/ops_sendrecv.cc
similarity index 100%
rename from ark/ops/ops_sendrecv.cc
rename to ark/ops_old/ops_sendrecv.cc
diff --git a/ark/ops/ops_sendrecv_test.cc b/ark/ops_old/ops_sendrecv_test.cc
similarity index 100%
rename from ark/ops/ops_sendrecv_test.cc
rename to ark/ops_old/ops_sendrecv_test.cc
diff --git a/ark/ops/ops_sharding.cc b/ark/ops_old/ops_sharding.cc
similarity index 100%
rename from ark/ops/ops_sharding.cc
rename to ark/ops_old/ops_sharding.cc
diff --git a/ark/ops/ops_tensor.cc b/ark/ops_old/ops_tensor.cc
similarity index 100%
rename from ark/ops/ops_tensor.cc
rename to ark/ops_old/ops_tensor.cc
diff --git a/ark/ops/ops_tensor_test.cc
b/ark/ops_old/ops_tensor_test.cc
similarity index 100%
rename from ark/ops/ops_tensor_test.cc
rename to ark/ops_old/ops_tensor_test.cc
diff --git a/ark/ops/ops_test_common.cc b/ark/ops_old/ops_test_common.cc
similarity index 100%
rename from ark/ops/ops_test_common.cc
rename to ark/ops_old/ops_test_common.cc
diff --git a/ark/ops/ops_test_common.h b/ark/ops_old/ops_test_common.h
similarity index 100%
rename from ark/ops/ops_test_common.h
rename to ark/ops_old/ops_test_common.h
diff --git a/ark/ops/ops_transpose.cc b/ark/ops_old/ops_transpose.cc
similarity index 100%
rename from ark/ops/ops_transpose.cc
rename to ark/ops_old/ops_transpose.cc
diff --git a/ark/ops/ops_transpose_test.cc b/ark/ops_old/ops_transpose_test.cc
similarity index 100%
rename from ark/ops/ops_transpose_test.cc
rename to ark/ops_old/ops_transpose_test.cc
diff --git a/ark/random.cc b/ark/random.cpp
similarity index 97%
rename from ark/random.cc
rename to ark/random.cpp
index ffa9d2f4a..c4282377f 100644
--- a/ark/random.cc
+++ b/ark/random.cpp
@@ -9,7 +9,7 @@
 #include
 #define gettid() syscall(SYS_gettid)
 
-#include "include/ark.h"
+#include "ark/random.hpp"
 
 namespace ark {
 
diff --git a/ark/random.h b/ark/random.h
index 6f237d57c..bfa7525d3 100644
--- a/ark/random.h
+++ b/ark/random.h
@@ -6,13 +6,6 @@
 
 namespace ark {
 
-/// Generate a random value.
-template <typename T>
-T rand(float min_val, float max_val) {
-    int mid = RAND_MAX / 2;
-    return T((ark::rand() - mid) / (float)mid * (max_val - min_val) + min_val);
-}
-
 /// Generate a random alpha-numeric string.
 /// @param len Length of the string
 /// @return A random alpha-numeric string
diff --git a/ark/range.h b/ark/range.hpp
similarity index 80%
rename from ark/range.h
rename to ark/range.hpp
index 8788d720a..95f085e49 100644
--- a/ark/range.h
+++ b/ark/range.hpp
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_RANGE_H_
-#define ARK_RANGE_H_
+#ifndef ARK_RANGE_HPP_
+#define ARK_RANGE_HPP_
 
 namespace ark {
 
@@ -51,12 +51,24 @@ class Range {
         T step_;
     };
 
+    bool operator==(const Range &other) const {
+        return begin_ == other.begin_ && end_ == other.end_ &&
+               step_ == other.step_;
+    }
+
+    // Lexicographic comparison; the component-wise `||` form used previously
+    // is not a strict weak ordering and would misbehave in ordered containers.
+    bool operator<(const Range &other) const {
+        if (begin_ != other.begin_) return begin_ < other.begin_;
+        if (end_ != other.end_) return end_ < other.end_;
+        return step_ < other.step_;
+    }
+
     Iterator begin() const { return Iterator(begin_, begin_, end_, step_); }
 
     Iterator end() const { return Iterator(end_, begin_, end_, step_); }
 
     T step() const { return step_; }
 
+    T size() const { return (end_ - begin_) / step_; }
+
   private:
     T begin_;
     T end_;
@@ -80,4 +92,4 @@ Range range(T begin, T end, T step)
 
 } // namespace ark
 
-#endif // ARK_RANGE_H_
+#endif // ARK_RANGE_HPP_
diff --git a/ark/range_test.cc b/ark/range_test.cpp
similarity index 96%
rename from ark/range_test.cc
rename to ark/range_test.cpp
index a170cc699..32dcb73db 100644
--- a/ark/range_test.cc
+++ b/ark/range_test.cpp
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
-#include "range.h" +#include "range.hpp" #include "unittest/unittest_utils.h" @@ -50,7 +50,6 @@ ark::unittest::State test_range() { } int main() { - ark::init(); UNITTEST(test_range); return 0; } diff --git a/ark/sched/sched.h b/ark/sched/sched.h index eeab05ae3..f4048a8bf 100644 --- a/ark/sched/sched.h +++ b/ark/sched/sched.h @@ -4,10 +4,9 @@ #ifndef ARK_SCHED_H_ #define ARK_SCHED_H_ -#include "include/ark.h" +#include "ark/schedule.hpp" #include "sched/sched_codegen.h" #include "sched/sched_stream.h" -#include "schedule/schedule.h" namespace ark { diff --git a/ark/sched/sched_codegen.cc b/ark/sched/sched_codegen.cc deleted file mode 100644 index 734ab8991..000000000 --- a/ark/sched/sched_codegen.cc +++ /dev/null @@ -1,423 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include "sched/sched_codegen.h" - -#include - -#include -#include -#include -#include - -#include "env.h" -#include "logging.h" -#include "math_utils.h" - -#define OP_PREFIX "op" -#define UNIT_OP_PREFIX "uop" -#define ALLOW_FOR_LOOP 1 - -namespace ark { - -CodeGenerator::CodeGenerator(const GpuManager::Info &gpu_info_, - int num_warps_per_sm_) - : gpu_info{gpu_info_}, - sm_num{gpu_info_.num_sm}, - num_warps_per_sm{num_warps_per_sm_}, - num_indent{0} {} - -size_t CodeGenerator::get_tensor_offset(const Tensor *tensor) const { - size_t off = tensor->buf->get_buf_offset(); - assert(off % 8 == 0); - return off + tensor->offset_bytes(); -} - -std::ostream &CodeGenerator::def_remote_buf(std::ostream &os, - int remote_rank) const { - os << "__device__ char *" ARK_BUF_NAME << remote_rank << ";\n"; - return os; -} - -std::ostream &CodeGenerator::sync_gpu(std::ostream &os) const { - os << "ark::sync_gpu<" << this->sm_num << ">(" ARK_LSS_NAME ");\n"; - return os; -} - -std::ostream &CodeGenerator::def_sync_stream(std::ostream &os, - int stream_id) const { - os << "__device__ ark::sync::State " ARK_LSS_NAME "_" << stream_id << ";\n"; - return os; -} - -std::ostream &CodeGenerator::sync_stream(std::ostream &os, int stream_id, - int sm_id_begin, int sm_id_end) const { - if (sm_id_begin >= sm_id_end) { - ERR(SchedulerError, "invalid SM range"); - } - if (sm_id_begin == 0) { - os << "if (blockIdx.x < " << sm_id_end << ") {"; - } else if (sm_id_begin + 1 == sm_id_end) { - os << "if (blockIdx.x == " << sm_id_begin << ") {"; - } else { - os << "if (blockIdx.x >= " << sm_id_begin << " && blockIdx.x < " - << sm_id_end << ") {"; - } - os << " ark::sync_gpu<" << sm_id_end - sm_id_begin << ">(" ARK_LSS_NAME "_" - << stream_id << "); }\n"; - return os; -} - -std::ostream &CodeGenerator::tensor(std::ostream &os, - const Tensor *tensor) const { - size_t off = this->get_tensor_offset(tensor); - os << "(" << tensor->type.type_str() << " *)"; - std::string buf_name; - if (tensor->imported_rank >= 0) { - buf_name = ARK_BUF_NAME + std::to_string(tensor->imported_rank); - } else { - buf_name = "_buf"; - } - os << "&" << buf_name << "[" << off << "]"; - return os; -} - -std::ostream &CodeGenerator::def_oparg(std::ostream &os, const OpArg &arg, - const std::string &name) const { - if (arg.type == OP_ARG_TENSOR) { - Tensor *tns; - arg.get(&tns); - os << tns->type.type_str() << " *" << name; - } else if (arg.type == OP_ARG_FLOAT) { - os << "float " << name; - } else if (arg.type == OP_ARG_INT) { - os << "int " << name; - } else if (arg.type == OP_ARG_BOOL) { - os << "bool " << name; - } else if (arg.type == OP_ARG_INT64) { - os << "long long int " << name; - } else if (arg.type == OP_ARG_UINT64) { - os << "uint64_t " 
<< name; - } else { - ERR(SchedulerError, "Not implemented"); - } - return os; -} - -std::ostream &CodeGenerator::oparg(std::ostream &os, const OpArg &arg) const { - if (arg.type == OP_ARG_TENSOR) { - Tensor *tns; - arg.get(&tns); - this->tensor(os, tns); - } else if (arg.type == OP_ARG_FLOAT) { - float val; - arg.get(&val); - os << val; - } else if (arg.type == OP_ARG_INT) { - int val; - arg.get(&val); - os << val; - } else if (arg.type == OP_ARG_BOOL) { - bool val; - arg.get(&val); - os << val; - } else if (arg.type == OP_ARG_INT64) { - long long int val; - arg.get(&val); - os << val; - } else if (arg.type == OP_ARG_UINT64) { - uint64_t val; - arg.get(&val); - os << val; - } else { - ERR(SchedulerError, "Not implemented"); - } - return os; -} - -std::ostream &CodeGenerator::branch(std::ostream &os, const Branch &br, - int prev_sm_id_end) const { - if (br.warp_branches.empty()) { - return os; - } - if (prev_sm_id_end < 0) { - prev_sm_id_end = this->sm_num; - } - if (br.sm_id_begin == 0) { - if (br.sm_id_end == this->sm_num) { - os << "\n { // for all SMs"; - } else { - os << "\n if (blockIdx.x < " << br.sm_id_end << ") {"; - } - } else if (br.sm_id_begin == prev_sm_id_end) { - if (br.sm_id_end == this->sm_num) { - os << " else {"; - } else { - os << " else if (blockIdx.x < " << br.sm_id_end << ") {"; - } - } else if (br.sm_id_begin < prev_sm_id_end) { - if (br.sm_id_begin == br.sm_id_end) { - os << "\n if (blockIdx.x == " << br.sm_id_begin << ") {"; - } else { - os << "\n if (blockIdx.x >= " << br.sm_id_begin - << " && blockIdx.x < " << br.sm_id_end << ") {"; - } - } else { - if (br.sm_id_begin == br.sm_id_end) { - os << " else if (blockIdx.x == " << br.sm_id_begin << ") {"; - } else { - os << " else if (blockIdx.x >= " << br.sm_id_begin - << " && blockIdx.x < " << br.sm_id_end << ") {"; - } - } - - int tpw = this->gpu_info.threads_per_warp; - - for (auto &warp_branch : br.warp_branches) { - if (warp_branch.branch_ops.empty()) continue; - int thread_begin = warp_branch.warp_id_begin * tpw; - int thread_end = warp_branch.warp_id_end * tpw; - if (warp_branch.warp_id_begin == 0) { - if (warp_branch.warp_id_end == this->num_warps_per_sm) { - os << "\n { // for all threads\n"; - } else { - os << "\n if (threadIdx.x < " << thread_end << ") {\n"; - } - } else { - os << "\n if (threadIdx.x >= " << thread_begin - << " && threadIdx.x < " << thread_end << ") {\n"; - } - - int num_warps = warp_branch.warp_id_end - warp_branch.warp_id_begin; - - auto get_indexing = [&](int num_warps_per_uop) -> std::string { - int num_uops = num_warps / num_warps_per_uop; - int num_threads_per_uop = num_warps_per_uop * tpw; - std::stringstream thread_indexing; - if (thread_end - thread_begin > num_threads_per_uop) { - if (thread_begin > 0) { - thread_indexing << "((threadIdx.x - " << thread_begin - << ")"; - } else { - thread_indexing << "(threadIdx.x"; - } - if (math::is_pow2(num_threads_per_uop)) { - thread_indexing << " >> " - << math::ilog2(num_threads_per_uop) << ")"; - } else { - thread_indexing << " / " << num_threads_per_uop << ")"; - } - } - auto thread_indexing_str = thread_indexing.str(); - - std::stringstream sm_indexing; - if (br.sm_id_end - br.sm_id_begin > 1) { - if (br.sm_id_begin > 0) { - sm_indexing << "((blockIdx.x - " << br.sm_id_begin << ")"; - } else { - sm_indexing << "(blockIdx.x"; - } - if (num_uops > 1) { - sm_indexing << " * " << num_uops; - } - sm_indexing << ")"; - } - auto sm_indexing_str = sm_indexing.str(); - - std::string indexing; - if (thread_indexing_str.empty()) { - indexing = 
sm_indexing_str; - } else if (sm_indexing_str.empty()) { - indexing = thread_indexing_str; - } else { - indexing = - "(" + sm_indexing_str + " + " + thread_indexing_str + ")"; - } - return indexing; - }; - - auto uop_code = [&](int opseq_id, int uop_id_diff, - int num_warps_per_uop, - const std::string &uop_id_begin) -> std::string { - // num_uops = (warp_id_end - warp_id_begin) / num_warps_per_uop; - // warp_idx = warp_id - warp_id_begin; - // sm_idx = sm_id - sm_id_begin; - // uop = uop_id_diff * (warp_idx / num_warps_per_uop + - // num_uops * sm_idx) + uop_id_begin; - std::stringstream ss; - ss << OP_PREFIX << opseq_id << "(_buf, "; - if (uop_id_diff != 0) { - auto indexing = get_indexing(num_warps_per_uop); - if (!indexing.empty()) { - if (uop_id_diff != 1) { - ss << uop_id_diff << " * "; - } - ss << indexing << " + "; - } - } - ss << uop_id_begin << ", " << br.smem_bytes_per_warp << ");"; - return ss.str(); - }; - - if (ALLOW_FOR_LOOP == 0 || warp_branch.branch_ops.size() < 3) { - for (auto &branch_op : warp_branch.branch_ops) { - os << " " - << uop_code(branch_op.opseq_id, branch_op.uop_id_diff, - branch_op.num_warps_per_uop, - std::to_string(branch_op.uop_id_begin)) - << "\n"; - } - } else { - size_t idx = 0; - while (idx < warp_branch.branch_ops.size() - 1) { - int opseq_id = warp_branch.branch_ops[idx].opseq_id; - int num_warps_per_uop = - warp_branch.branch_ops[idx].num_warps_per_uop; - int uop_id_diff = warp_branch.branch_ops[idx].uop_id_diff; - int uop_id_begin = warp_branch.branch_ops[idx].uop_id_begin; - int uop_id_begin_diff = - warp_branch.branch_ops[idx + 1].uop_id_begin - - warp_branch.branch_ops[idx].uop_id_begin; - size_t idx2 = idx + 1; - for (; idx2 < warp_branch.branch_ops.size(); ++idx2) { - auto &branch_op = warp_branch.branch_ops[idx2]; - if (branch_op.opseq_id != opseq_id || - branch_op.num_warps_per_uop != num_warps_per_uop || - branch_op.uop_id_diff != uop_id_diff || - branch_op.uop_id_begin != - (int)(uop_id_begin + - uop_id_begin_diff * (idx2 - idx))) { - break; - } - } - if (idx2 - idx > 2) { - os << " for (int _i = " << uop_id_begin << "; _i < " - << uop_id_begin + (idx2 - idx) * uop_id_begin_diff - << "; _i += " << uop_id_begin_diff << ") { " - << uop_code(opseq_id, uop_id_diff, num_warps_per_uop, - "_i") - << " }\n"; - idx = idx2; - } else { - os << " " - << uop_code(opseq_id, uop_id_diff, num_warps_per_uop, - std::to_string(uop_id_begin)) - << "\n"; - ++idx; - } - } - if (idx < warp_branch.branch_ops.size()) { - auto &branch_op = warp_branch.branch_ops[idx]; - os << " " - << uop_code(branch_op.opseq_id, branch_op.uop_id_diff, - branch_op.num_warps_per_uop, - std::to_string(branch_op.uop_id_begin)) - << "\n"; - } - } - os << " }\n"; - } - os << " }\n"; - return os; -} - -std::ostream &CodeGenerator::def_uop(std::ostream &os, const SchedOp &sop, - int uop_id) const { - std::string uop_name = UNIT_OP_PREFIX + std::to_string(uop_id); - std::string func_name = sop.function_name(); - assert(!func_name.empty()); - - const Op *op = sop.get_op(); - if (op->force_inline) { - os << "DEVICE "; - } else { - os << "__noinline__ __device__ "; - } - os << "void " << uop_name << "("; - - OpArgs call_args = op->function_call_args(*sop.get_cfg()); - int cnt_param = 0; - for (const OpArg &arg : call_args.get_args()) { - this->def_oparg(os, arg, "_" + std::to_string(cnt_param)) << ", "; - ++cnt_param; - } - - os << "int _uop_idx, int _smem_per_warp) {\n"; - os << " " << func_name << "("; - - for (int i = 0; i < cnt_param; ++i) { - os << '_' << i << ", "; - } - os << 
"_uop_idx, _smem_per_warp);\n}\n"; - return os; -} - -std::ostream &CodeGenerator::uop(std::ostream &os, int uop_id) const { - os << UNIT_OP_PREFIX << uop_id; - return os; -} - -// -std::ostream &CodeGenerator::opseq(std::ostream &os, const std::string &name, - const SchedOpSeq &opseq, - std::map &uop_map) const { - auto &sched_ops = opseq.get_sched_ops(); - unsigned int idx = sched_ops.size(); - for (auto &sop : sched_ops) { - if (sop.is_virtual()) { - continue; - } - if (idx == sched_ops.size()) { - os << "// tile dims: (" << opseq.get_tdims()[0] << ", " - << opseq.get_tdims()[1] << ", " << opseq.get_tdims()[2] << ")\n" - << "__noinline__ __device__ void " << name - << "(char *_buf, int _uop_idx, int _smem_per_warp) {\n"; - } - --idx; - os << " "; - auto uop_map_it = uop_map.find(sop.serialize()); - if (uop_map_it != uop_map.end()) { - this->uop(os, uop_map_it->second); - } else { - os << sop.function_name(); - } - os << '('; - - OpArgs call_args = sop.get_op()->function_call_args(*sop.get_cfg()); - for (const OpArg &arg : call_args.get_args()) { - this->oparg(os, arg) << ", "; - } - - os << "_uop_idx, _smem_per_warp);\n"; - } - if (idx != sched_ops.size()) { - os << "}\n"; - } - return os; -} - -std::ostream &CodeGenerator::def_proxy_channels(std::ostream &os, - size_t num_channels) const { - if (num_channels == 0) { - return os; - } - os << "#include \n" - "__constant__ mscclpp::SimpleProxyChannelDeviceHandle " - "_ARK_PROXY_CHANS[" - << num_channels << "];\n"; - return os; -} - -std::ostream &CodeGenerator::def_sm_channels(std::ostream &os, - size_t num_channels) const { - if (num_channels == 0) { - return os; - } - os << "#include \n" - "__constant__ mscclpp::SmChannelDeviceHandle " - "_ARK_SM_CHANS[" - << num_channels << "];\n"; - return os; -} - -} // namespace ark diff --git a/ark/sched/sched_codegen.h b/ark/sched/sched_codegen.h deleted file mode 100644 index 91577e672..000000000 --- a/ark/sched/sched_codegen.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#ifndef ARK_SCHED_CODEGEN_H_ -#define ARK_SCHED_CODEGEN_H_ - -#include <map> - -#include "gpu/gpu_loop_kernel.h" -#include "sched/sched_op.h" -#include "sched/sched_opseq.h" -#include "sched_branch.h" - -namespace ark { - -class CodeGenerator { - public: - CodeGenerator(const GpuManager::Info &gpu_info_, int num_warps_per_sm_); - - std::ostream &def_remote_buf(std::ostream &os, int remote_rank) const; - - std::ostream &sync_gpu(std::ostream &os) const; - - std::ostream &def_sync_stream(std::ostream &os, int stream_id) const; - std::ostream &sync_stream(std::ostream &os, int stream_id, int sm_id_begin, - int sm_id_end) const; - - std::ostream &tensor(std::ostream &os, const Tensor *tensor) const; - - std::ostream &def_oparg(std::ostream &os, const OpArg &arg, - const std::string &name) const; - std::ostream &oparg(std::ostream &os, const OpArg &arg) const; - - std::ostream &branch(std::ostream &os, const Branch &branch, - int prev_sm_id_end = -1) const; - - std::ostream &def_uop(std::ostream &os, const SchedOp &sop, - int uop_id) const; - - std::ostream &uop(std::ostream &os, int uop_id) const; - - std::ostream &opseq(std::ostream &os, const std::string &name, - const SchedOpSeq &opseq, - std::map<std::string, int> &uop_map) const; - - std::ostream &def_proxy_channels(std::ostream &os, - size_t num_channels) const; - - std::ostream &def_sm_channels(std::ostream &os, size_t num_channels) const; - - protected: - size_t get_tensor_offset(const Tensor *tensor) const; - - const GpuManager::Info &gpu_info; - int sm_num; - int num_warps_per_sm; - int world_size; - int num_indent; -}; - -} // namespace ark - -#endif // ARK_SCHED_CODEGEN_H_ diff --git a/ark/sched/sched_op.cc b/ark/sched/sched_op.cc index a6c4603e9..151961c2d 100644 --- a/ark/sched/sched_op.cc +++ b/ark/sched/sched_op.cc @@ -38,7 +38,8 @@ const string SchedOp::serialize() const { ss << this->function_name() << ","; OpArgs call_args = this->get_op()->function_call_args(*(this->get_cfg())); - for (const OpArg &arg : call_args.get_args()) { + for (const auto &p : call_args.get_args()) { + const OpArg &arg = p.second; if (arg.type == OP_ARG_TENSOR) { Tensor *tns; arg.get(&tns); diff --git a/ark/sched/sched_op.h b/ark/sched/sched_op.h index 196df0fc2..19eb5dff7 100644 --- a/ark/sched/sched_op.h +++ b/ark/sched/sched_op.h @@ -4,9 +4,6 @@ #ifndef ARK_SCHED_OP_H_ #define ARK_SCHED_OP_H_ -#include "include/ark.h" -#include "ops/ops_common.h" - namespace ark { class SchedOp { diff --git a/ark/schedule/schedule.cc b/ark/schedule/schedule.cpp similarity index 98% rename from ark/schedule/schedule.cc rename to ark/schedule/schedule.cpp index 1f5febc02..9ae7cd5aa 100644 --- a/ark/schedule/schedule.cc +++ b/ark/schedule/schedule.cpp @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "schedule.h" +#include "ark/schedule.hpp" #include "nlohmann/json.hpp" diff --git a/ark/unique_list.hpp b/ark/unique_list.hpp new file mode 100644 index 000000000..6a08822a5 --- /dev/null +++ b/ark/unique_list.hpp @@ -0,0 +1,114 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license.
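Aside (not part of the diff): the sched_op.cc hunk above implies that OpArgs::get_args() now yields (name, OpArg) pairs rather than a flat list of OpArg; the exact container type is not visible in this diff. A minimal self-contained sketch of that iteration shape, using a hypothetical stub in place of the real OpArg:

#include <iostream>
#include <map>
#include <string>

// Stand-in for OpArg, only to show the iteration shape; the real type
// lives in ops/ops_common.h and is not reproduced here.
struct OpArgStub { int type = 0; };

int main() {
    // Assumed: what get_args() returns after this change.
    std::map<std::string, OpArgStub> args = {{"input", {}}, {"output", {}}};
    for (const auto &p : args) {
        const OpArgStub &arg = p.second;  // mirrors `const OpArg &arg = p.second;` in the hunk
        std::cout << p.first << " type=" << arg.type << "\n";
    }
    return 0;
}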
+ +#ifndef ARK_UNIQUE_LIST_HPP_ +#define ARK_UNIQUE_LIST_HPP_ + +#include <iterator> +#include <list> +#include <map> +#include <vector> + +namespace ark { + +template <typename T> +class UniqueList { + private: + std::list<T> list_; + std::map<T, typename std::list<T>::iterator> index_; + + public: + UniqueList() = default; + + explicit UniqueList(const std::vector<T> &vec) { + for (const auto &value : vec) { + push_back(value); + } + } + + UniqueList(const UniqueList &other) = default; + + UniqueList(UniqueList &&other) = default; + + UniqueList &operator=(const UniqueList &other) = default; + + UniqueList &operator=(UniqueList &&other) = default; + + const T &front() const { return list_.front(); } + + const T &back() const { return list_.back(); } + + const T &operator[](size_t idx) const { + auto it = list_.begin(); + std::advance(it, idx); + return *it; + } + + void push_back(const T &value) { + auto it = index_.find(value); + if (it == index_.end()) { + list_.push_back(value); + index_[value] = --list_.end(); + } + } + + void erase(const T &value) { + auto it = index_.find(value); + if (it != index_.end()) { + list_.erase(it->second); + index_.erase(it); + } + } + + void erase(typename std::list<T>::iterator it) { + index_.erase(*it); + list_.erase(it); + } + + void clear() { + list_.clear(); + index_.clear(); + } + + size_t index(const T &value) const { + auto it = index_.find(value); + return (it == index_.end()) + ? -1 + : std::distance( + list_.begin(), + static_cast<typename std::list<T>::const_iterator>( + it->second)); + } + + typename std::list<T>::iterator begin() { return list_.begin(); } + + typename std::list<T>::const_iterator begin() const { + return list_.begin(); + } + + typename std::list<T>::iterator end() { return list_.end(); } + + typename std::list<T>::const_iterator end() const { return list_.end(); } + + typename std::list<T>::iterator find(const T &value) { + auto it = index_.find(value); + return (it == index_.end()) ? end() : it->second; + } + + typename std::list<T>::const_iterator find(const T &value) const { + auto it = index_.find(value); + return (it == index_.end()) ? end() : it->second; + } + + bool empty() const { return list_.empty(); } + + bool contains(const T &value) const { + return index_.find(value) != index_.end(); + } + + size_t size() const { return index_.size(); } +}; + +} // namespace ark + +#endif // ARK_UNIQUE_LIST_HPP_ diff --git a/ark/unique_list_test.cpp b/ark/unique_list_test.cpp new file mode 100644 index 000000000..6df6d478e --- /dev/null +++ b/ark/unique_list_test.cpp @@ -0,0 +1,80 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license.
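Aside (not part of the diff): a short usage sketch of the new UniqueList, complementing the unit test that follows. push_back() drops duplicates while preserving insertion order, index() reports a value's position in that order (or size_t(-1) when absent, since it returns size_t), and erase() keeps the remaining order intact. All values here are illustrative.

#include <cassert>

#include "unique_list.hpp"

int main() {
    ark::UniqueList<int> ids;
    ids.push_back(7);
    ids.push_back(3);
    ids.push_back(7);           // duplicate: ignored, order unchanged
    assert(ids.size() == 2);
    assert(ids.contains(3));
    assert(ids.index(3) == 1);  // position in insertion order
    ids.erase(7);
    assert(ids[0] == 3);        // order of survivors is preserved
    return 0;
}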
+ +#include "unique_list.hpp" + +#include "unittest/unittest_utils.h" + +ark::unittest::State test_unique_list() { + ark::UniqueList list; + list.push_back(1); + list.push_back(2); + list.push_back(3); + list.push_back(1); + list.push_back(2); + list.push_back(3); + UNITTEST_EQ(list.size(), 3); + UNITTEST_EQ(list[0], 1); + UNITTEST_EQ(list[1], 2); + UNITTEST_EQ(list[2], 3); + + list.clear(); + UNITTEST_EQ(list.size(), 0); + + list.push_back(1); + list.push_back(2); + list.push_back(3); + list.push_back(1); + list.push_back(2); + list.push_back(3); + list.push_back(4); + UNITTEST_EQ(list.size(), 4); + UNITTEST_EQ(list[0], 1); + UNITTEST_EQ(list[1], 2); + UNITTEST_EQ(list[2], 3); + UNITTEST_EQ(list[3], 4); + + list.clear(); + UNITTEST_EQ(list.size(), 0); + + list.push_back(1); + list.push_back(2); + list.push_back(3); + + list.erase(1); + UNITTEST_EQ(list.size(), 2); + UNITTEST_EQ(list[0], 2); + UNITTEST_EQ(list[1], 3); + + list.clear(); + UNITTEST_EQ(list.size(), 0); + + list.push_back(1); + list.push_back(2); + list.push_back(3); + + list.erase(0); + UNITTEST_EQ(list.size(), 3); + UNITTEST_EQ(list[0], 1); + UNITTEST_EQ(list[1], 2); + UNITTEST_EQ(list[2], 3); + + list.clear(); + UNITTEST_EQ(list.size(), 0); + + list.push_back(1); + list.push_back(2); + list.push_back(3); + + list.erase(2); + UNITTEST_EQ(list.size(), 2); + UNITTEST_EQ(list[0], 1); + UNITTEST_EQ(list[1], 3); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_unique_list); + return 0; +} diff --git a/ark/unittest/unittest_utils.cc b/ark/unittest/unittest_utils.cpp similarity index 76% rename from ark/unittest/unittest_utils.cc rename to ark/unittest/unittest_utils.cpp index 56313647e..c3aa45056 100644 --- a/ark/unittest/unittest_utils.cc +++ b/ark/unittest/unittest_utils.cpp @@ -13,8 +13,6 @@ #include "file_io.h" #include "logging.h" -using namespace std; - // Grep SIGALRM and exit. static void sigalrm_timeout_handler(int) { signal(SIGALRM, SIG_IGN); @@ -26,8 +24,8 @@ namespace unittest { // Temporal unittest states. struct TempStates { - vector pids; - vector threads; + std::vector pids; + std::vector threads; }; TempStates GLOBAL_TEMP_STATES_; @@ -45,15 +43,15 @@ Timeout::~Timeout() { } // Spawn a thread that runs the given function. -thread *spawn_thread(function func) { - thread *t = new thread(func); +std::thread *spawn_thread(std::function func) { + std::thread *t = new std::thread(func); GLOBAL_TEMP_STATES_.threads.emplace_back(t); return t; } // Wait for all threads to finish. void wait_all_threads() { - for (thread *t : GLOBAL_TEMP_STATES_.threads) { + for (std::thread *t : GLOBAL_TEMP_STATES_.threads) { if (t->joinable()) { t->join(); } @@ -63,7 +61,7 @@ void wait_all_threads() { } // Spawn a process that runs the given function. -int spawn_process(function func) { +int spawn_process(std::function func) { pid_t pid = fork(); if (pid < 0) { UNITTEST_UEXIT("fork() failed"); @@ -89,19 +87,19 @@ void wait_all_processes() { } while (!WIFEXITED(status)); status = WEXITSTATUS(status); if (status != State::SUCCESS) { - UNITTEST_EXIT((State)status, "process " + to_string(pid)); + UNITTEST_EXIT((State)status, "process " + std::to_string(pid)); } } GLOBAL_TEMP_STATES_.pids.clear(); } // Run the given test function. 
-State test(function<State()> test_func) { return test_func(); } +State test(std::function<State()> test_func) { return test_func(); } // -string get_kernel_code(const string &name) { - return ark::read_file(ark::get_dir(string{__FILE__}) + "/../ops/kernels/" + - name + ".h"); +std::string get_kernel_code(const std::string &name) { + return ark::read_file(ark::get_dir(std::string{__FILE__}) + + "/../ops/kernels/" + name + ".h"); } } // namespace unittest diff --git a/ark/unittest/unittest_utils.h b/ark/unittest/unittest_utils.h index 44680f532..2b433f7fe 100644 --- a/ark/unittest/unittest_utils.h +++ b/ark/unittest/unittest_utils.h @@ -11,8 +11,8 @@ #include #include +#include "ark/init.hpp" #include "cpu_timer.h" -#include "include/ark.h" #include "logging.h" namespace ark { @@ -48,6 +48,7 @@ std::string get_kernel_code(const std::string &name); // Run the given test function. #define UNITTEST(test_func) \ do { \ + ark::init(); \ LOG(ark::INFO, "unittest start: " #test_func); \ double _s = ark::cpu_timer(); \ ark::unittest::State _ret; \ diff --git a/ark/version.cpp b/ark/version.cpp new file mode 100644 index 000000000..f7556d8c4 --- /dev/null +++ b/ark/version.cpp @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/version.hpp" + +#include <sstream> +#include <string> + +namespace ark { + +std::string version() { + std::stringstream ss; + ss << ARK_MAJOR << "." << ARK_MINOR << "." << ARK_PATCH; + return ss.str(); +} + +} // namespace ark diff --git a/ark/version_test.cpp b/ark/version_test.cpp new file mode 100644 index 000000000..b4a5cb825 --- /dev/null +++ b/ark/version_test.cpp @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/version.hpp" + +#include "unittest/unittest_utils.h" + +ark::unittest::State test_version() { + auto version = ark::version(); + + // Check if the version string is in the correct format.
+ auto dot1 = version.find('.'); + auto dot2 = version.find('.', dot1 + 1); + UNITTEST_NE(dot1, std::string::npos); + UNITTEST_NE(dot2, std::string::npos); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_version); + return 0; +} diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index 22232b1b0..3ce14e8ec 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -428,23 +428,37 @@ def test_transformer_block( low=-1, high=1, size=(batch_size, seq_len, args.dim) ).astype(dtype) - test_module( - module_class_ark=model_ark.TransformerBlock, - module_args_ark=[ - 0, - args, - ark.DataType.from_numpy(dtype), - rank, - world_size, - ], - inputs_ark=[feature, 0, freqs_cis_ark, None], - module_class_pt=model_pt.TransformerBlock, - module_args_pt=[0, args], - inputs_pt=[feature.astype(dtype), 0, freqs_cis, None], - module_name_prefix="layers.0", - rank=rank, - world_size=world_size, - ) + module = model_ark.Attention(args, ark.DataType.from_numpy(dtype), rank, world_size) + # module_inputs = [ + # ark.tensor(list(i.shape), ark.DataType.from_numpy(i.dtype)) + # if isinstance(i, np.ndarray) + # else i + # for i in inputs + # ] + feature_tensor = ark.tensor(list(feature.shape), ark.DataType.from_numpy(feature.dtype)) + freqs_cis_ark_tensor = ark.tensor(list(freqs_cis_ark.shape), ark.DataType.from_numpy(freqs_cis_ark.dtype)) + output = module(feature_tensor, 0, freqs_cis_ark_tensor, None) + + ark.Model.get_model().create_nodes() + print(ark.Model.get_model().serialize()) + + # test_module( + # module_class_ark=model_ark.TransformerBlock, + # module_args_ark=[ + # 0, + # args, + # ark.DataType.from_numpy(dtype), + # rank, + # world_size, + # ], + # inputs_ark=[feature, 0, freqs_cis_ark, None], + # module_class_pt=model_pt.TransformerBlock, + # module_args_pt=[0, args], + # inputs_pt=[feature.astype(dtype), 0, freqs_cis, None], + # module_name_prefix="layers.0", + # rank=rank, + # world_size=world_size, + # ) def test_transformer( @@ -514,8 +528,8 @@ def test(args, batch_size, seq_len, dtype, rank, world_size): # test_row_parallel_linear(args, batch_size, seq_len, dtype, rank, world_size) # test_column_parallel_linear(args, batch_size, seq_len, dtype, rank, world_size) # test_attention(args, batch_size, seq_len, dtype, rank, world_size) - # test_transformer_block(args, batch_size, seq_len, dtype, rank, world_size) - test_transformer(args, batch_size, seq_len, dtype, rank, world_size) + test_transformer_block(args, batch_size, seq_len, dtype, rank, world_size) + # test_transformer(args, batch_size, seq_len, dtype, rank, world_size) def worker( diff --git a/examples/tutorial/quickstart_tutorial.py b/examples/tutorial/quickstart_tutorial.py index da1894702..981435780 100644 --- a/examples/tutorial/quickstart_tutorial.py +++ b/examples/tutorial/quickstart_tutorial.py @@ -11,9 +11,9 @@ def quickstart_tutorial(): M, N = 64, 64 # Create an input tensor - input_tensor = ark.tensor([M, N], ark.fp32) + input_tensor = ark.tensor([M, N], ark.fp16) # Create another tensor - other_tensor = ark.tensor([M, N], ark.fp32) + other_tensor = ark.tensor([M, N], ark.fp16) # Add the two tensors output_tensor = ark.add(input_tensor, other_tensor) @@ -25,9 +25,9 @@ def quickstart_tutorial(): runtime.launch() # Initialize the input and other tensor with random values - input_tensor_host = np.random.rand(M, N).astype(np.float32) + input_tensor_host = np.random.rand(M, N).astype(np.float16) input_tensor.from_numpy(input_tensor_host) - other_tensor_host = 
np.random.rand(M, N).astype(np.float32) + other_tensor_host = np.random.rand(M, N).astype(np.float16) other_tensor.from_numpy(other_tensor_host) # Run the ARK program diff --git a/examples/tutorial/sched.json b/examples/tutorial/sched.json new file mode 100644 index 000000000..add17313d --- /dev/null +++ b/examples/tutorial/sched.json @@ -0,0 +1,60 @@ +{ + "NumProcessors": 108, + "NumWarpsPerProcessor": 16, + "ProcessorGroups": [ + { + "ProcessorRange": { + "Begin": 0, + "End": 107 + }, + "ResourceGroups": [ + { + "ProcessorRange": { + "Begin": 0, + "End": 64 + }, + "SramRange": { + "Begin": 0, + "End": 0 + }, + "TaskGroups": [ + { + "TaskId": 0, + "TaskRange": { + "Begin": 0, + "End": 107 + }, + "TaskStride": 1 + } + ], + "WarpRange": { + "Begin": 0, + "End": 1 + } + } + ] + }, + { + "ProcessorRange": { + "Begin": 107, + "End": 108 + }, + "ResourceGroups": null + }, + { + "ProcessorRange": { + "Begin": 0, + "End": 108 + }, + "ResourceGroups": null + } + ], + "TaskInfos": [ + { + "Detail": "ark::add<ark::Vec<1, 1, 64, 64>, ark::Vec<1, 1, 64, 64>, ark::Vec<1, 1, 64, 64>, ark::Vec<1, 1, 64, 64>, ark::Vec<1, 1, 64, 64>, ark::Vec<1, 1, 64, 64>, ark::Vec<1, 1, 1, 64>, 1, 0>,fp16 *,fp16 *,fp16 *,;", + "Id": 0, + "NumWarps": 1, + "SramBytes": 0 + } + ] +} \ No newline at end of file diff --git a/python/model_py.cpp b/python/model_py.cpp index c0686e254..9052b72b0 100644 --- a/python/model_py.cpp +++ b/python/model_py.cpp @@ -12,6 +12,10 @@ namespace py = pybind11; void register_model(py::module &m) { py::class_<ark::Model>(m, "_Model") .def(py::init<int>(), py::arg("rank") = 0) + .def("create_nodes", &ark::Model::create_nodes) + .def("serialize", &ark::Model::serialize, + py::return_value_policy::reference_internal, + py::arg("indent") = -1) .def("tensor", &ark::Model::tensor, "construct a tensor with given shape and data type.", py::return_value_policy::reference_internal, py::arg("shape"),