diff --git a/.vscode/launch.json b/.vscode/launch.json index d7bf3daf1..92a096002 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -4,7 +4,7 @@ "name": "ops_cast_test", "type": "cppdbg", "request": "launch", - "program": "${workspaceFolder}/build/ark/ops_cast_test", + "program": "${workspaceFolder}/build/ark/executor_test", "args": [], "stopAtEntry": false, "cwd": "${fileDirname}", diff --git a/.vscode/settings.json b/.vscode/settings.json index 640196a66..1a376c337 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -10,4 +10,105 @@ "cmake.ctestArgs": [ "--verbose" ], + "files.associations": { + "ostream": "cpp", + "stdexcept": "cpp", + "string": "cpp", + "iosfwd": "cpp", + "memory": "cpp", + "stack": "cpp", + "any": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "cfenv": "cpp", + "chrono": "cpp", + "cinttypes": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "codecvt": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "condition_variable": "cpp", + "csignal": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "future": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ranges": "cpp", + "semaphore": "cpp", + "span": "cpp", + "sstream": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "typeinfo": "cpp", + "valarray": "cpp", + "variant": "cpp", + "__nullptr": "cpp", + "__hash_table": "cpp", + "__split_buffer": "cpp", + "__tree": "cpp", + "queue": "cpp", + "__locale": "cpp", + "*.ipp": "cpp", + "strstream": "cpp", + "typeindex": "cpp", + "locale": "cpp", + "__node_handle": "cpp", + "__threading_support": "cpp", + "__functional_03": "cpp", + "filesystem": "cpp", + "__bit_reference": "cpp", + "__config": "cpp", + "__debug": "cpp", + "version": "cpp", + "__functional_base": "cpp", + "__memory": "cpp", + "*.ci": "c", + "ark_kernels.h": "c" + }, + "cmake.configureOnOpen": true, } diff --git a/ark/CMakeLists.txt b/ark/CMakeLists.txt index 7c360ee37..f0c52352c 100644 --- a/ark/CMakeLists.txt +++ b/ark/CMakeLists.txt @@ -1,9 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc) -file(GLOB_RECURSE UT_SOURCES CONFIGURE_DEPENDS *_test.cc *_test.cu) -file(GLOB_RECURSE UT_COMMON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unittest/*.cc) +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp) +file(GLOB_RECURSE UT_SOURCES CONFIGURE_DEPENDS *_test.cpp) +file(GLOB_RECURSE UT_COMMON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unittest/*.cpp) list(REMOVE_ITEM SOURCES ${UT_SOURCES} ${UT_COMMON_SOURCES}) if(USE_ROCM) diff --git a/ark/bfloat16.cc b/ark/bfloat16.cpp similarity index 100% rename from ark/bfloat16.cc rename to ark/bfloat16.cpp diff --git a/ark/bfloat16.h b/ark/bfloat16.h index da57348be..83a1bcb7d 100644 --- a/ark/bfloat16.h +++ b/ark/bfloat16.h @@ -144,6 +144,8 @@ struct alignas(2) bfloat16_t { int mantissa() const { return int(raw() & 0x7f); } }; +using bf16 = bfloat16_t; + /// Assignment from half_t template <> bfloat16_t& bfloat16_t::operator=(bfloat16_t const& x); diff --git a/ark/bfloat16_test.cc b/ark/bfloat16_test.cpp similarity index 99% rename from ark/bfloat16_test.cc rename to ark/bfloat16_test.cpp index 9c53deaf3..f64a41d8d 100644 --- a/ark/bfloat16_test.cc +++ b/ark/bfloat16_test.cpp @@ -3,7 +3,6 @@ #include "bfloat16.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_bfloat16() { @@ -246,7 +245,6 @@ ark::unittest::State test_bfloat16_error() { } int main() { - ark::init(); UNITTEST(test_bfloat16); UNITTEST(test_bfloat16_error); return 0; diff --git a/ark/codegen/codegen.cpp b/ark/codegen/codegen.cpp new file mode 100644 index 000000000..81f60a080 --- /dev/null +++ b/ark/codegen/codegen.cpp @@ -0,0 +1,432 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "codegen.hpp" + +#include <sstream> + +#include "env.h" +#include "file_io.h" +#include "logging.h" +#include "model/model_data_type.hpp" +#include "model/model_op.hpp" +#include "model/model_tensor.hpp" +#include "nlohmann/json.hpp" +#include "range.hpp" + +static std::string replace( + const std::string &template_str, + const std::map<std::string, std::string> &replacements) { + std::string result = template_str; + for (const auto &kv : replacements) { + size_t pos = 0; + while ((pos = result.find(kv.first, pos)) != std::string::npos) { + result.replace(pos, kv.first.length(), kv.second); + pos += kv.second.length(); + } + } + return result; +} + +namespace ark { + +class BufferInfo { + public: + BufferInfo(size_t id) : id(id), bytes(0), is_input(true), is_output(true) {} + + // ID of this buffer + const size_t id; + + // Total bytes of this buffer + size_t bytes; + + // True if none of tensors in this buffer is a result tensor or a write + // tensor of a non-virtual Op, i.e., this buffer is an input buffer + bool is_input; + + // True if none of tensors in this buffer is a read tensor of a non-virtual + // Op, i.e., this buffer is an output buffer + bool is_output; + + // IDs of tensors in this buffer + std::set<size_t> tensor_ids; + + // IDs of tasks that read/write from/to this buffer + std::set<size_t> task_ids; +}; + +class SyncStateInfo { + public: + SyncStateInfo() { + static size_t next_id = 0; + id = next_id++; + } + + size_t id; +}; + +class CodeGenerator::Impl { + public: + Impl(const std::string &plan, const std::string &name); + ~Impl() = default; + + private: + void plan_memory(const nlohmann::json &plan); + + std::string def_op(const nlohmann::json &op_json, size_t task_id, + size_t op_idx); + + std::string def_task(const nlohmann::json &task_json); + + std::string task_seq(size_t
proc_b, size_t proc_e, size_t proc_s, + size_t proc_cur, size_t task_b, size_t task_e, + size_t task_s, size_t task_gran, size_t num_slots, + size_t slot_num_warps, size_t slot_sram_bytes, + size_t task_id); + + std::string resource_group(const nlohmann::json &rg_json, + const nlohmann::json &task_infos, + const Range<size_t> &proc_range); + + protected: + friend class CodeGenerator; + + std::string name_; + size_t num_procs_; + size_t num_warps_per_proc_; + std::map<size_t, std::shared_ptr<BufferInfo>> buffer_id_to_info_; + std::map<size_t, size_t> buffer_id_to_offset_; + std::map<size_t, size_t> tensor_id_to_offset_; + size_t total_bytes_; + std::string code_; +}; + +CodeGenerator::Impl::Impl(const std::string &plan, const std::string &name) + : name_(name) { + auto j = nlohmann::json::parse(plan); + this->plan_memory(j); + + num_procs_ = j["NumProcessors"]; + num_warps_per_proc_ = j["NumWarpsPerProcessor"]; + + std::stringstream definitions_ss; + for (auto &task_json : j["TaskInfos"]) { + definitions_ss << this->def_task(task_json); + } + + std::map<Range<size_t>, SyncStateInfo> sync_state_info; + + std::stringstream body_ss; + size_t pg_idx = 0; + size_t num_pgs = j["ProcessorGroups"].size(); + for (auto &pg : j["ProcessorGroups"]) { + Range<size_t> proc_range(pg["ProcessorRange"][0], + pg["ProcessorRange"][1]); + + for (auto &rg : pg["ResourceGroups"]) { + body_ss << this->resource_group(rg, j["TaskInfos"], proc_range); + } + + if (pg_idx + 1 < num_pgs) { + // sync pg + size_t begin = *proc_range.begin(); + size_t end = *proc_range.end(); + if (begin == 0) { + body_ss << " if (blockIdx.x < " << end << ") {"; + } else if (begin + 1 == end) { + body_ss << " if (blockIdx.x == " << begin << ") {"; + } else { + body_ss << " if (blockIdx.x >= " << begin + << " && blockIdx.x < " << end << ") {"; + } + size_t state_id = sync_state_info[proc_range].id; + body_ss << " sync_gpu<" << end - begin << ">(ARK_LOOP_SYNC_STATE_" + << state_id << "); }\n"; + } + } + + for (auto &kv : sync_state_info) { + definitions_ss << "__device__ sync::State ARK_LOOP_SYNC_STATE_" + << kv.second.id << ";\n"; + } + + const std::string &ark_root = get_env().path_root_dir; + const std::string &template_path = + ark_root + "/include/kernels/kernel_template.in"; + std::string template_code = read_file(template_path); + std::map<std::string, std::string> replacements = { + {"@NUM_BLOCKS@", std::to_string(num_procs_)}, + {"@NUM_WARPS_PER_BLOCK@", std::to_string(num_warps_per_proc_)}, + {"@DEFINITIONS@", definitions_ss.str()}, + {"@BODY@", body_ss.str()}, + {"@NAME@", name_}, + }; + code_ = replace(template_code, replacements); +} + +void CodeGenerator::Impl::plan_memory(const nlohmann::json &plan) { + auto get_or_create_buffer_info = [&](size_t buffer_id) { + if (buffer_id_to_info_.find(buffer_id) == buffer_id_to_info_.end()) { + auto buf_info = std::make_shared<BufferInfo>(buffer_id); + buffer_id_to_info_[buffer_id] = buf_info; + return buf_info; + } + return buffer_id_to_info_[buffer_id]; + }; + + auto tensor_stride_bytes = [](const nlohmann::json &tns) { + Dims strides(tns["Strides"].get<std::vector<DimType>>()); + size_t nelems = strides.size(); + return nelems * ModelDataT::from_name(tns["DataType"])->bytes(); + }; + + for (auto &task_info : plan["TaskInfos"]) { + for (auto &op : task_info["Ops"]) { + for (auto &tns : op["ReadTensors"]) { + auto buf_info = get_or_create_buffer_info(tns["BufferId"]); + buf_info->bytes = + std::max(buf_info->bytes, tensor_stride_bytes(tns)); + buf_info->is_output = false; + buf_info->tensor_ids.insert(tns["Id"].get<size_t>()); + buf_info->task_ids.insert(task_info["Id"].get<size_t>()); + } + for (auto &tns : op["WriteTensors"]) { + auto buf_info =
get_or_create_buffer_info(tns["BufferId"]); + buf_info->bytes = + std::max(buf_info->bytes, tensor_stride_bytes(tns)); + buf_info->is_input = false; + buf_info->tensor_ids.insert(tns["Id"].get<size_t>()); + buf_info->task_ids.insert(task_info["Id"].get<size_t>()); + } + for (auto &tns : op["ResultTensors"]) { + auto buf_info = get_or_create_buffer_info(tns["BufferId"]); + buf_info->bytes = + std::max(buf_info->bytes, tensor_stride_bytes(tns)); + buf_info->is_input = false; + buf_info->tensor_ids.insert(tns["Id"].get<size_t>()); + buf_info->task_ids.insert(task_info["Id"].get<size_t>()); + } + } + } + + // TODO: improve memory planning + size_t offset = 0; + for (auto &kv : buffer_id_to_info_) { + buffer_id_to_offset_[kv.first] = offset; + for (auto &tns_id : kv.second->tensor_ids) { + tensor_id_to_offset_[tns_id] = offset; + } + offset += kv.second->bytes; + } + total_bytes_ = offset; +} + +std::string CodeGenerator::Impl::def_op(const nlohmann::json &op_json, + size_t task_id, size_t op_idx) { + auto op = ModelOp::deserialize(op_json); + auto impl_name = op->impl_name(op_json["Config"]); + auto impl_args = op->impl_args(op_json["Config"]); + std::stringstream ss; + ss << "__forceinline__ __device__ void t" << task_id << "_o" << op_idx + << "("; + size_t arg_idx = 0; + for (auto &arg : impl_args) { + if (arg.type_name() == "TENSOR") { + auto tns = arg.value<ModelTensorRef>(); + ss << tns->data_type()->type_str() << "*"; + } else { + ss << arg.type_str(); + } + ss << " _" << arg_idx++ << ", "; + } + ss << "int _idx, int _spw) {\n  " << impl_name << "("; + for (size_t i = 0; i < impl_args.size(); ++i) { + ss << "_" << i << ", "; + } + ss << "_idx, _spw);\n}\n"; + return ss.str(); +} + +std::string CodeGenerator::Impl::def_task(const nlohmann::json &task_json) { + std::stringstream ss; + size_t op_idx = 0; + for (auto &op_json : task_json["Ops"]) { + ss << this->def_op(op_json, task_json["Id"], op_idx++); + } + ss << "__noinline__ __device__ void t" << task_json["Id"] + << "(char* _buf, int _idx, int _spw) {\n"; + op_idx = 0; + for (auto &op_json : task_json["Ops"]) { + auto op = ModelOp::deserialize(op_json); + auto impl_args = op->impl_args(op_json["Config"]); + ss << "  t" << task_json["Id"] << "_o" << op_idx++ << "("; + for (size_t i = 0; i < impl_args.size(); ++i) { + auto &arg = impl_args[i]; + if (arg.type_name() == "TENSOR") { + auto tns = arg.value<ModelTensorRef>(); + ss << "(" << tns->data_type()->type_str() << "*)&_buf[" + << tensor_id_to_offset_[tns->id()] << "]"; + } else { + ss << arg.serialize()[arg.type_name()]; + } + ss << ", "; + } + ss << "_idx, _spw);\n"; + } + ss << "}\n"; + return ss.str(); +} + +std::string CodeGenerator::Impl::task_seq( + size_t proc_b, size_t proc_e, size_t proc_s, size_t proc_cur, size_t task_b, + size_t task_e, size_t task_s, size_t task_gran, size_t num_slots, + size_t slot_num_warps, size_t slot_sram_bytes, size_t task_id) { + std::stringstream ss; + ss << "task_seq<" << proc_b << ", " << proc_e << ", " << proc_s << ", " + << proc_cur << ", " << task_b << ", " << task_e << ", " << task_s << ", " + << task_gran << ", " << num_slots << ", " << slot_num_warps << ", " + << slot_sram_bytes << ", t" << task_id << ">(_buf);\n"; + return ss.str(); +} + +std::string CodeGenerator::Impl::resource_group( + const nlohmann::json &rg_json, const nlohmann::json &task_infos, + const Range<size_t> &proc_range) { + Range<size_t> rg_proc_range(rg_json["ProcessorRange"][0], + rg_json["ProcessorRange"][1]); + if (*rg_proc_range.begin() < *proc_range.begin() || + *rg_proc_range.end() > *proc_range.end()) { + ERR(SchedulerError, "invalid processor
range of resource group"); + } + Range rg_warp_range(rg_json["WarpRange"][0], + rg_json["WarpRange"][1]); + Range rg_sram_range(rg_json["SramRange"][0], + rg_json["SramRange"][1]); + auto warp_iter = rg_warp_range.begin(); + auto sram_iter = rg_sram_range.begin(); + size_t total_warps = rg_warp_range.size(); + size_t total_sram = rg_sram_range.size(); + size_t proc_cur = *rg_proc_range.begin(); + size_t proc_b = *rg_proc_range.begin(); + size_t proc_e = *rg_proc_range.end(); + size_t proc_s = rg_proc_range.step(); + size_t n_procs = rg_proc_range.size(); + std::stringstream ss; + for (auto &tg : rg_json["TaskGroups"]) { + size_t task_id = tg["TaskId"]; + auto &task_info = task_infos[task_id]; + Range task_range(tg["TaskRange"][0], tg["TaskRange"][1]); + size_t task_gran = tg["Granularity"]; + size_t num_warps_per_task = task_info["NumWarps"]; + size_t sram_bytes_per_task = task_info["SramBytes"]; + // number of concurrent tasks per processor + size_t n_slots; + if (sram_bytes_per_task > 0) { + n_slots = std::min(total_warps / num_warps_per_task, + total_sram / sram_bytes_per_task); + } else { + n_slots = total_warps / num_warps_per_task; + } + if (n_slots == 0) { + ERR(SchedulerError, "not enough resources for task group"); + } + + size_t task_b = *task_range.begin(); + size_t task_e = *task_range.end(); + size_t task_s = task_range.step(); + size_t n_tasks = task_range.size(); + + size_t slot_n_warps = num_warps_per_task; + size_t slot_n_sram = total_sram / n_slots; + size_t sram_per_warp = slot_n_sram / slot_n_warps; + + // + // Distribute tasks to processors. + // + // A sequence [b, e, s] means the range starts from `b`, ends at + // `e - 1`, and the step size is `s`. + // + // Processor ID sequence: [proc_b, proc_e, proc_s], total `n_procs` + // Task ID sequence: [task_b, task_e, task_s], total `n_tasks` + // + // The distribution starts from the processor ID `proc_cur` and wraps + // around (`proc_cur - proc_b` is always a multiple of `proc_s`). + // If `task_gran` is 1, the distribution is round-robin; otherwise, + // the distribution assigns `task_gran` consequent tasks to each + // processor, as long as there are enough tasks. + // We distribute tasks from smaller task IDs to larger task IDs. + // Therefore, the `t`-th assigned task ID of the processor ID + // `(proc_cur + proc_s*p)%n_procs` is (p in range [0, n_procs-1]): + // + // ``` + // task_b + task_s*( + // p*task_gran + + // t/task_gran*task_gran*n_procs + + // t%task_gran + // ) + // ``` + // + // where the division is integer division. + // + // Within a single processor, `n_slots` consequent tasks are + // distributed to warps and SRAMs. Specifically, say that + // "k-th slot" refers to the set of warps `k * slot_n_warps` ~ + // `(k+1) * slot_n_warps - 1` and SRAMs `k * slot_n_sram` ~ + // `(k+1) * slot_n_sram - 1`, then the `t`-th task is assigned to + // the `t%n_slots`-th slot. 
+ // + // Therefore, the `i`-th assigned task ID of the processor ID + // `(proc_cur + p)%n_procs` and the `k`-th slot is (p in range + // [0, n_procs-1], k in range [0, n_slots-1]) the same as the above + // formula with `t` replaced by `k + i*n_slots`: + // + // ``` + // task_b + task_s*( + // p*task_gran + + // (k + i*n_slots)/task_gran*task_gran*n_procs + + // (k + i*n_slots)%task_gran + // ) + // ``` + // + // The corresponding CUDA code is generated as follows, where + // `blockIdx.x` is the processor ID: + // + // ``` + // if ((blockIdx.x >= proc_b) && + // (blockIdx.x < proc_e) && + // ((blockIdx.x - proc_b) % proc_s == 0)) { + // size_t p = ((blockIdx.x + gridDim.x - proc_cur) % gridDim.x) / + // proc_s; size_t k = threadIdx.x / warp_size / slot_n_warps; size_t + // task_id_base = task_b + task_s*p*task_gran; for (size_t t = k; ; t + // += n_slots) { + // size_t task_id = task_id_base + task_s*( + // t/task_gran*task_gran*n_procs + t%task_gran + // ); + // if (task_id >= task_e) break; + // task_func(_buf, task_id, sram_per_warp); + // } + // __syncthreads(); + // } + // ``` + ss << "  "; + ss << this->task_seq(proc_b, proc_e, proc_s, proc_cur, task_b, task_e, + task_s, task_gran, n_slots, slot_n_warps, + slot_n_sram, task_id); + } + return ss.str(); +} + +CodeGenerator::CodeGenerator(const std::string &plan, const std::string &name) + : impl_(std::make_shared<Impl>(plan, name)) {} + +std::string CodeGenerator::code() const { return impl_->code_; } + +size_t CodeGenerator::num_procs() const { return impl_->num_procs_; } + +size_t CodeGenerator::num_warps_per_proc() const { + return impl_->num_warps_per_proc_; +} + +size_t CodeGenerator::total_memory_bytes() const { return impl_->total_bytes_; } + +} // namespace ark diff --git a/ark/codegen/codegen.hpp b/ark/codegen/codegen.hpp new file mode 100644 index 000000000..387afd645 --- /dev/null +++ b/ark/codegen/codegen.hpp @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_CODEGEN_HPP_ +#define ARK_CODEGEN_HPP_ + +#include <memory> +#include <string> + +namespace ark { + +class CodeGenerator { + public: + CodeGenerator(const std::string &plan, + const std::string &name = "ark_kernel"); + + ~CodeGenerator() = default; + + std::string code() const; + + size_t num_procs() const; + + size_t num_warps_per_proc() const; + + size_t total_memory_bytes() const; + + private: + class Impl; + std::shared_ptr<Impl> impl_; +}; + +} // namespace ark + +#endif // ARK_CODEGEN_HPP_ diff --git a/ark/cpu_timer.cc b/ark/cpu_timer.cpp similarity index 100% rename from ark/cpu_timer.cc rename to ark/cpu_timer.cpp diff --git a/ark/dims.cc b/ark/dims.cpp similarity index 73% rename from ark/dims.cc rename to ark/dims.cpp index 385f58b28..bb57ea27d 100644 --- a/ark/dims.cc +++ b/ark/dims.cpp @@ -1,19 +1,22 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#include "ark/dims.hpp" + #include -#include "include/ark.h" +#include "error.hpp" #include "logging.h" +#include "nlohmann/json.hpp" namespace ark { // Construct with given four dimensions.
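// e.g., Dims(1, 64, 128, 256) represents the shape <1, 64, 128, 256>.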
Dims::Dims(DimType d0, DimType d1, DimType d2, DimType d3) { - this->data[0] = d0; - this->data[1] = d1; - this->data[2] = d2; - this->data[3] = d3; + data_[0] = d0; + data_[1] = d1; + data_[2] = d2; + data_[3] = d3; if (this->is_invalid()) { ERR(InvalidUsageError, "invalid dims given: <", d0, ", ", d1, ", ", d2, ", ", d3, ">"); @@ -26,7 +29,7 @@ Dims::Dims(const Dims &dims_) { ERR(InvalidUsageError, "invalid dims given"); } for (int i = 0; i < DIMS_LEN; ++i) { - this->data[i] = dims_.data[i]; + data_[i] = dims_.data_[i]; } } @@ -51,25 +54,24 @@ Dims::Dims(const std::vector<DimType> &vec) { } else if (v < 0) { invalid_seen = true; } - this->data[i] = v; + data_[i] = v; } for (; i < DIMS_LEN; ++i) { - this->data[i] = NO_DIM; + data_[i] = NO_DIM; } } // Return the volume of dimensions. If the dimensions are invalid, return -1. DimType Dims::size() const { - const DimType *v = this->data; - if (v[0] == NO_DIM) { + if (data_[0] == NO_DIM) { return -1; } - DimType ret = v[0]; + DimType ret = data_[0]; for (int i = 1; i < DIMS_LEN; ++i) { - if (v[i] == NO_DIM) { + if (data_[i] == NO_DIM) { break; } else { - ret *= v[i]; + ret *= data_[i]; } } return ret; @@ -77,10 +79,9 @@ DimType Dims::size() const { // Return the number of valid dimensions. int Dims::ndims() const { - const DimType *v = this->data; int ret = 0; for (; ret < DIMS_LEN; ++ret) { - if (v[ret] == NO_DIM) { + if (data_[ret] == NO_DIM) { break; } } @@ -89,23 +90,33 @@ int Dims::ndims() const { // Return a new Dims object with 4 valid dimensions by prepending 1s. Dims Dims::dims4() const { - const DimType *v = this->data; int nd = this->ndims(); Dims ret; for (int i = 0; i < DIMS_LEN - nd; ++i) { - ret.data[i] = 1; + ret.data_[i] = 1; } for (int i = 0; i < nd; ++i) { - ret.data[DIMS_LEN - nd + i] = v[i]; + ret.data_[DIMS_LEN - nd + i] = data_[i]; } return ret; } +// Return true if all valid dimensions are zero. +bool Dims::is_zeros() const { + if (this->is_invalid()) { + return false; + } + for (int i = 0; i < DIMS_LEN; ++i) { + if (data_[i] == NO_DIM) break; + if (data_[i] != 0) return false; + } + return true; +} + // Return true if the dimensions are empty. bool Dims::is_no_dim() const { - const DimType *v = this->data; for (int i = 0; i < DIMS_LEN; ++i) { - if (v[i] != NO_DIM) { + if (data_[i] != NO_DIM) { return false; } } @@ -116,16 +127,15 @@ bool Dims::is_no_dim() const { bool Dims::is_invalid() const { // NO_DIM should not appear before a valid dimension.
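// e.g., <4, NO_DIM, 2, NO_DIM> is invalid (NO_DIM precedes a valid dim), // while <4, 2, NO_DIM, NO_DIM> and an all-NO_DIM Dims are not.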
bool invalid_seen = false; - const DimType *v = this->data; for (int i = 0; i < DIMS_LEN; ++i) { if (invalid_seen) { - if (v[i] != NO_DIM) { + if (data_[i] != NO_DIM) { return true; } } else { - if (v[i] == NO_DIM) { + if (data_[i] == NO_DIM) { invalid_seen = true; - } else if (v[i] < 0) { + } else if (data_[i] < 0) { return true; } } @@ -133,6 +143,17 @@ return false; } +std::vector<DimType> Dims::vector() const { + std::vector<DimType> ret; + for (int i = 0; i < DIMS_LEN; ++i) { + if (data_[i] == NO_DIM) { + break; + } + ret.push_back(data_[i]); + } + return ret; +} + void Dims::insert(int idx, DimType dim) { int nd = this->ndims(); if (nd >= DIMS_LEN) { @@ -145,9 +166,9 @@ idx += nd + 1; } for (int i = nd; i > idx; --i) { - this->data[i] = this->data[i - 1]; + data_[i] = data_[i - 1]; } - this->data[idx] = dim; + data_[idx] = dim; } DimType Dims::erase(int idx) { @@ -158,20 +179,14 @@ if (idx < 0) { idx += nd; } - DimType ret = this->data[idx]; + DimType ret = data_[idx]; for (int i = idx; i < nd - 1; ++i) { - this->data[i] = this->data[i + 1]; + data_[i] = data_[i + 1]; } - this->data[nd - 1] = NO_DIM; + data_[nd - 1] = NO_DIM; return ret; } -std::string Dims::serialize() const { - std::stringstream ss; - ss << *this; - return ss.str(); -} - DimType &Dims::operator[](int idx) { int nd = this->ndims(); if (idx >= nd || -idx > nd) { @@ -180,7 +195,7 @@ if (idx < 0) { idx += nd; } - return this->data[idx]; + return data_[idx]; } const DimType &Dims::operator[](int idx) const { @@ -191,12 +206,12 @@ if (idx < 0) { idx += nd; } - return this->data[idx]; + return data_[idx]; } bool operator==(const Dims &a, const Dims &b) { for (int i = 0; i < DIMS_LEN; ++i) { - if (a.data[i] != b.data[i]) { + if (a.data_[i] != b.data_[i]) { return false; } } @@ -209,14 +224,12 @@ std::ostream &operator<<(std::ostream &os, const Dims &dims) { if (dims.is_invalid()) { ERR(InvalidUsageError, "invalid dims given"); } - os << '<'; - if (dims.data[0] != NO_DIM) { - os << dims.data[0]; - for (int i = 1; i < DIMS_LEN; ++i) { - if (dims.data[i] == NO_DIM) { - break; - } - os << ", " << dims.data[i]; + int ndims = dims.ndims(); + os << "<"; + if (ndims > 0) { + os << dims[0]; + for (int i = 1; i < ndims; ++i) { + os << ", " << dims[i]; } } os << '>'; diff --git a/ark/dims_test.cc b/ark/dims_test.cpp similarity index 99% rename from ark/dims_test.cc rename to ark/dims_test.cpp index f7cfb819f..a4e4aa087 100644 --- a/ark/dims_test.cc +++ b/ark/dims_test.cpp @@ -1,9 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#include "ark/dims.hpp" + #include -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_dims_basic() { @@ -180,7 +181,6 @@ ark::unittest::State test_dims_ostream() { } int main() { - ark::init(); UNITTEST(test_dims_basic); UNITTEST(test_dims_no_dim); UNITTEST(test_dims_zero); diff --git a/ark/env.cc b/ark/env.cpp similarity index 94% rename from ark/env.cc rename to ark/env.cpp index 3cf90c2a1..3cdcb36fd 100644 --- a/ark/env.cc +++ b/ark/env.cpp @@ -19,7 +19,7 @@ #define DEFAULT_ARK_DISABLE_GRAPH_OPT false #define DEFAULT_ARK_IGNORE_BINARY_CACHE false #define DEFAULT_ARK_SHM_NAME_PREFIX "ark."
-#define DEFAULT_ARK_ENFORCE_KERNEL_CODE_PATH "" +#define DEFAULT_ARK_ENFORCE_PLAN_PATH "" #define DEFAULT_ARK_MSCCLPP_PORT 50051 template <typename T> @@ -75,8 +75,8 @@ Env::Env() { this->shm_name_prefix = env("ARK_SHM_NAME_PREFIX", DEFAULT_ARK_SHM_NAME_PREFIX); // - this->enforce_kernel_code_path = env( - "ARK_ENFORCE_KERNEL_CODE_PATH", DEFAULT_ARK_ENFORCE_KERNEL_CODE_PATH); + this->enforce_plan_path = env("ARK_ENFORCE_PLAN_PATH", + DEFAULT_ARK_ENFORCE_PLAN_PATH); // Get the port number of MSCCLPP. this->mscclpp_port = env("ARK_MSCCLPP_PORT", DEFAULT_ARK_MSCCLPP_PORT); } diff --git a/ark/env.h b/ark/env.h index 677b4eaa4..2b86704e6 100644 --- a/ark/env.h +++ b/ark/env.h @@ -35,8 +35,8 @@ struct Env { bool ignore_binary_cache; // Prefix of shared memory file names. std::string shm_name_prefix; - // Enforce to compile a specific kernel code file. - std::string enforce_kernel_code_path; + // Enforce to compile a specific plan file. - std::string enforce_plan_path; + std::string enforce_plan_path; // MSCCL++ bootstrap port. int mscclpp_port; }; diff --git a/ark/error.hpp b/ark/error.hpp new file mode 100644 index 000000000..6e7b77024 --- /dev/null +++ b/ark/error.hpp @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_ERROR_HPP_ +#define ARK_ERROR_HPP_ + +#include <stdexcept> +#include <string> + +namespace ark { + +#define REGISTER_ERROR_TYPE(_name) \ + class _name : public std::runtime_error { \ + public: \ + _name(const std::string &msg) : std::runtime_error(msg) {} \ + }; + +REGISTER_ERROR_TYPE(InvalidUsageError) +REGISTER_ERROR_TYPE(ModelError) +REGISTER_ERROR_TYPE(SchedulerError) +REGISTER_ERROR_TYPE(ExecutorError) +REGISTER_ERROR_TYPE(SystemError) +REGISTER_ERROR_TYPE(GpuError) +REGISTER_ERROR_TYPE(RuntimeError) +REGISTER_ERROR_TYPE(UnitTestError) + +} // namespace ark + +#endif // ARK_ERROR_HPP_ diff --git a/ark/executor.cc b/ark/executor.cc index f9487dd71..a1805e9d7 100644 --- a/ark/executor.cc +++ b/ark/executor.cc @@ -34,7 +34,6 @@ class Executor::Impl { std::shared_ptr<GpuContext> ctx_; std::unique_ptr<BaseScheduler> sched_; std::unique_ptr<GpuLoopKernel> glk_; - std::shared_ptr<GpuStream> stream_; }; Executor::Impl::Impl(int rank, int world_size, Model &model, @@ -47,7 +46,6 @@ Executor::Impl::Impl(int rank, int world_size, Model &model, sched_->schedule(); ctx_ = sched_->create_context(); const GpuManager::Info &ginfo = ctx_->get_gpu_manager()->info(); - stream_ = ctx_->get_gpu_manager()->create_stream(); glk_ = std::make_unique<GpuLoopKernel>( ctx_, name, sched_->gen_code(), ginfo.num_sm, num_warps_per_sm, (unsigned int)ginfo.smem_block_total); diff --git a/ark/executor/executor.cpp b/ark/executor/executor.cpp new file mode 100644 index 000000000..03031f06f --- /dev/null +++ b/ark/executor/executor.cpp @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license.
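+ +// Executor::Impl below is a thin wrapper around GpuLoopKernel: it derives +// the local GPU ID from the rank and builds the kernel from the given plan +// string, or from an enforced plan file when ARK_ENFORCE_PLAN_PATH is set.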
+ +#include "ark/executor.hpp" + +#include "codegen/codegen.hpp" +#include "env.h" +#include "file_io.h" +#include "gpu/gpu_loop_kernel.h" +#include "logging.h" + +namespace ark { + +class Executor::Impl { + public: + Impl(int rank, int world_size, const std::string &plan, + const std::string &name); + ~Impl() = default; + + void compile(); + void launch(); + void run(int iter); + void wait(); + float stop(); + + private: + const int rank_; + const int world_size_; + int gpu_id_; + + std::shared_ptr ctx_; + std::unique_ptr glk_; +}; + +Executor::Impl::Impl(int rank, int world_size, const std::string &plan, + const std::string &name) + : rank_(rank), world_size_(world_size) { + gpu_id_ = rank_ % get_env().num_ranks_per_host; + auto gpu_mgr = GpuManager::get_instance(gpu_id_); + size_t smem_block_total = + static_cast(gpu_mgr->info().smem_block_total); + + auto &plan_path = get_env().enforce_plan_path; + if (!plan_path.empty()) { + LOG(INFO, "Enforce executor plan path: ", plan_path); + glk_ = std::make_unique(gpu_id_, read_file(plan_path), + name, smem_block_total); + } else { + glk_ = std::make_unique(gpu_id_, plan, name, + smem_block_total); + } +} + +void Executor::Impl::compile() { glk_->compile(); } + +void Executor::Impl::launch() { + glk_->load(); + glk_->launch(false); +} + +void Executor::Impl::run(int iter) { glk_->run(iter); } + +void Executor::Impl::wait() { glk_->wait(); } + +float Executor::Impl::stop() { + glk_->stop(); + return glk_->get_elapsed_msec(); +} + +Executor::Executor(int rank, int world_size, const std::string &plan, + const std::string &name) + : impl_(std::make_unique(rank, world_size, plan, name)) {} + +Executor::~Executor() = default; + +void Executor::compile() { impl_->compile(); } + +void Executor::launch() { impl_->launch(); } + +void Executor::run(int iter) { impl_->run(iter); } + +void Executor::wait() { impl_->wait(); } + +float Executor::stop() { return impl_->stop(); } + +} // namespace ark diff --git a/ark/executor/executor_test.cpp b/ark/executor/executor_test.cpp new file mode 100644 index 000000000..0ab06bc43 --- /dev/null +++ b/ark/executor/executor_test.cpp @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/executor.hpp" + +#include "ark/model.hpp" +#include "codegen/codegen.hpp" +#include "gpu/gpu_context.h" +#include "model/model_data_type.hpp" +#include "nlohmann/json.hpp" +#include "unittest/unittest_utils.h" + +ark::unittest::State test_executor_scale() { + ark::Model m; + ark::ModelTensorRef input = m.tensor({32}, ark::FP32); + ark::ModelTensorRef output = m.scale(input, 0.7); + + auto comp = m.compress(); + auto serialized = comp.serialize(2); + UNITTEST_LOG(serialized); + + auto comp_json = nlohmann::json::parse(serialized); + std::map buf_id_to_bytes; + for (auto &tns : comp_json["Tensors"]) { + size_t nelems; + if (tns.contains("Strides")) { + nelems = + ark::Dims(std::vector(tns["Strides"])).size(); + } else { + nelems = ark::Dims(std::vector(tns["Shape"])).size(); + UNITTEST_LOG("Shape: ", tns["Shape"].dump(), " ? 
", nelems); + } + size_t bytes = + nelems * ark::ModelDataT::from_name(tns["DataType"])->bytes(); + if (buf_id_to_bytes.find(tns["BufferId"]) != buf_id_to_bytes.end()) { + buf_id_to_bytes[tns["BufferId"]] = + std::max(buf_id_to_bytes[tns["BufferId"]], bytes); + } else { + buf_id_to_bytes[tns["BufferId"]] = bytes; + } + } + + nlohmann::json j; + j["NumProcessors"] = 1; + j["NumWarpsPerProcessor"] = 1; + + j["TaskInfos"] = {nlohmann::json()}; + j["TaskInfos"][0]["Id"] = 0; + j["TaskInfos"][0]["NumWarps"] = 1; + j["TaskInfos"][0]["SramBytes"] = 0; + j["TaskInfos"][0]["Ops"] = {nlohmann::json()}; + j["TaskInfos"][0]["Ops"][0]["Type"] = "Scale"; + j["TaskInfos"][0]["Ops"][0]["Name"] = "scale"; + j["TaskInfos"][0]["Ops"][0]["IsVirtual"] = false; + j["TaskInfos"][0]["Ops"][0]["ReadTensors"] = {comp_json["Tensors"][0]}; + j["TaskInfos"][0]["Ops"][0]["WriteTensors"] = {comp_json["Tensors"][1]}; + j["TaskInfos"][0]["Ops"][0]["ResultTensors"] = {comp_json["Tensors"][2]}; + j["TaskInfos"][0]["Ops"][0]["Args"] = { + {"Factor", {"FLOAT", 0.699999988079071}}}; + j["TaskInfos"][0]["Ops"][0]["Config"] = nlohmann::json(); + j["TaskInfos"][0]["Ops"][0]["Config"]["NumWarps"] = 1; + j["TaskInfos"][0]["Ops"][0]["Config"]["Tile"] = {1, 32}; + + j["ProcessorGroups"] = {nlohmann::json()}; + j["ProcessorGroups"][0]["ProcessorRange"] = {0, 1}; + j["ProcessorGroups"][0]["ResourceGroups"] = {nlohmann::json()}; + j["ProcessorGroups"][0]["ResourceGroups"][0]["ProcessorRange"] = {0, 1}; + j["ProcessorGroups"][0]["ResourceGroups"][0]["WarpRange"] = {0, 1}; + j["ProcessorGroups"][0]["ResourceGroups"][0]["SramRange"] = {0, 1}; + j["ProcessorGroups"][0]["ResourceGroups"][0]["TaskGroups"] = { + nlohmann::json()}; + j["ProcessorGroups"][0]["ResourceGroups"][0]["TaskGroups"][0]["TaskId"] = 0; + j["ProcessorGroups"][0]["ResourceGroups"][0]["TaskGroups"][0]["TaskRange"] = + {0, 1}; + j["ProcessorGroups"][0]["ResourceGroups"][0]["TaskGroups"][0] + ["Granularity"] = 1; + + auto ctx = ark::GpuContext::get_context(0, 1); + std::map> buf_id_to_buf; + for (auto &kv : buf_id_to_bytes) { + auto buf = ctx->allocate_buffer(kv.second, 1); + buf_id_to_buf[kv.first] = buf; + UNITTEST_LOG("Allocated buffer ", kv.first, ": offset ", + buf->get_offset(), ", bytes ", buf->get_bytes(), " ? 
", + kv.second); + } + ctx->freeze(); + + std::map tns_id_to_offset; + for (auto &tns : comp_json["Tensors"]) { + auto buf = buf_id_to_buf[tns["BufferId"]]; + auto offset = buf->get_offset(); + tns_id_to_offset[tns["Id"]] = offset; + UNITTEST_LOG("Tensor ", tns["Id"], ": offset ", offset); + } + + j["Context"] = nlohmann::json(); + j["Context"]["TensorIdToOffset"] = nlohmann::json(); + for (auto &kv : tns_id_to_offset) { + j["Context"]["TensorIdToOffset"][std::to_string(kv.first)] = kv.second; + } + + UNITTEST_LOG(j.dump(2)); + + ark::Executor exe(0, 1, j.dump(), "executor_test"); + // ark::CodeGenerator codegen(j.dump()); + // UNITTEST_LOG(codegen.code()); + exe.compile(); + exe.launch(); + exe.run(1); + exe.stop(); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_executor_scale); + return 0; +} diff --git a/ark/file_io.cc b/ark/file_io.cpp similarity index 98% rename from ark/file_io.cc rename to ark/file_io.cpp index d564d97fb..76bb9983e 100644 --- a/ark/file_io.cc +++ b/ark/file_io.cpp @@ -7,7 +7,6 @@ #include #include -#include "include/ark.h" #include "logging.h" namespace fs = std::filesystem; diff --git a/ark/file_io_test.cc b/ark/file_io_test.cpp similarity index 98% rename from ark/file_io_test.cc rename to ark/file_io_test.cpp index 9b69c0cde..3e56f3a42 100644 --- a/ark/file_io_test.cc +++ b/ark/file_io_test.cpp @@ -7,7 +7,6 @@ #include #include "env.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_is_exist() { @@ -101,7 +100,6 @@ ark::unittest::State test_read_write_file() { } int main() { - ark::init(); UNITTEST(test_is_exist); UNITTEST(test_is_dir); UNITTEST(test_is_file); diff --git a/ark/gpu/gpu.h b/ark/gpu/gpu.h index 2f1eba3ba..1d117e939 100644 --- a/ark/gpu/gpu.h +++ b/ark/gpu/gpu.h @@ -21,7 +21,7 @@ constexpr auto alias = cuda_const; #define ARK_GPU_DEFINE_FUNC_ALIAS(alias, cuda_func, rocm_func) \ template \ - inline auto alias(Args &&... args) { \ + inline auto alias(Args &&...args) { \ return cuda_func(std::forward(args)...); \ } @@ -35,7 +35,7 @@ constexpr auto alias = rocm_const; #define ARK_GPU_DEFINE_FUNC_ALIAS(alias, cuda_func, rocm_func) \ template \ - inline auto alias(Args &&... 
args) { \ + inline auto alias(Args &&...args) { \ return rocm_func(std::forward<Args>(args)...); \ } diff --git a/ark/gpu/gpu_buffer.cc b/ark/gpu/gpu_buffer.cpp similarity index 100% rename from ark/gpu/gpu_buffer.cc rename to ark/gpu/gpu_buffer.cpp diff --git a/ark/gpu/gpu_comm_sw.cc b/ark/gpu/gpu_comm_sw.cpp similarity index 99% rename from ark/gpu/gpu_comm_sw.cc rename to ark/gpu/gpu_comm_sw.cpp index 209a02c8b..5aa85a346 100644 --- a/ark/gpu/gpu_comm_sw.cc +++ b/ark/gpu/gpu_comm_sw.cpp @@ -13,7 +13,6 @@ #include "env.h" #include "gpu/gpu_logging.h" #include "gpu/gpu_manager.h" -#include "include/ark.h" #include "ipc/ipc_hosts.h" #include "ipc/ipc_socket.h" diff --git a/ark/gpu/gpu_compile.cc b/ark/gpu/gpu_compile.cpp similarity index 99% rename from ark/gpu/gpu_compile.cc rename to ark/gpu/gpu_compile.cpp index 3a5911870..88eb56e98 100644 --- a/ark/gpu/gpu_compile.cc +++ b/ark/gpu/gpu_compile.cpp @@ -18,12 +18,11 @@ #include #include +#include "ark/random.hpp" #include "cpu_timer.h" #include "env.h" #include "file_io.h" #include "gpu/gpu_logging.h" -#include "include/ark.h" -#include "random.h" #define ARK_DEBUG_KERNEL 0 diff --git a/ark/gpu/gpu_context.cc b/ark/gpu/gpu_context.cpp similarity index 100% rename from ark/gpu/gpu_context.cc rename to ark/gpu/gpu_context.cpp diff --git a/ark/gpu/gpu_context_test.cc b/ark/gpu/gpu_context_test.cpp similarity index 99% rename from ark/gpu/gpu_context_test.cc rename to ark/gpu/gpu_context_test.cpp index 27f796494..737353d1d 100644 --- a/ark/gpu/gpu_context_test.cc +++ b/ark/gpu/gpu_context_test.cpp @@ -5,7 +5,6 @@ #include -#include "include/ark.h" #include "unittest/unittest_utils.h" // Test initializing and destroying GpuContext @@ -178,7 +177,6 @@ ark::unittest::State test_gpu_context_remote() { } int main() { - ark::init(); UNITTEST(test_gpu_context_basic); UNITTEST(test_gpu_context_buffer_free); UNITTEST(test_gpu_context_buffer_alloc); diff --git a/ark/gpu/gpu_event.cc b/ark/gpu/gpu_event.cpp similarity index 100% rename from ark/gpu/gpu_event.cc rename to ark/gpu/gpu_event.cpp diff --git a/ark/gpu/gpu_kernel.cc b/ark/gpu/gpu_kernel.cc deleted file mode 100644 index 4b99bbfcf..000000000 --- a/ark/gpu/gpu_kernel.cc +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license.
- -#include "gpu/gpu_kernel.h" - -#include -#include - -#include "gpu/gpu.h" -#include "gpu/gpu_compile.h" -#include "gpu/gpu_logging.h" - -namespace ark { - -GpuKernel::GpuKernel( - std::shared_ptr ctx, const std::string& codes, - const std::array& block_dim, const std::array& grid_dim, - size_t smem_bytes, const std::string& kernel_name, - std::initializer_list, size_t>> args) - : ctx_(ctx), - codes_(codes), - block_dim_(block_dim), - grid_dim_(grid_dim), - smem_bytes_(smem_bytes), - kernel_name_(kernel_name), - params_ptr_(args.size(), nullptr), - args_(args.size(), nullptr) { - if (kernel_name_.size() == 0) { - ERR(InvalidUsageError, "Invalid kernel name: ", kernel_name_); - } - int idx = 0; - for (auto& pair : args) { - std::shared_ptr ptr = - std::shared_ptr(new uint8_t[pair.second]); - assert(ptr != nullptr); - if (pair.first != nullptr) { - std::memcpy(ptr.get(), pair.first.get(), pair.second); - } - // make sure the shared_ptr is not released - this->args_[idx] = ptr; - this->params_ptr_[idx++] = ptr.get(); - } -} - -void GpuKernel::compile() { - auto manager = ctx_->get_gpu_manager(); - int max_reg_per_block = manager->info().max_registers_per_block; - int max_reg_per_thread = manager->info().max_registers_per_thread; - int max_reg_cnt = - max_reg_per_block / (block_dim_[0] * block_dim_[1] * block_dim_[2]); - if (max_reg_cnt >= max_reg_per_thread) { - max_reg_cnt = max_reg_per_thread - 1; - } - bin_ = gpu_compile({codes_}, manager->info().arch, max_reg_cnt); - GLOG_DRV(gpuModuleLoadData(&module_, bin_.c_str())); - GLOG_DRV(gpuModuleGetFunction(&function_, module_, kernel_name_.c_str())); - - int static_smem_size_bytes; - GLOG_DRV(gpuFuncGetAttribute(&static_smem_size_bytes, - gpuFuncAttributeSharedSizeBytes, function_)); - int dynamic_smem_size_bytes = smem_bytes_ - static_smem_size_bytes; - GLOG_DRV(gpuFuncSetAttribute(function_, - gpuFuncAttributeMaxDynamicSharedSizeBytes, - dynamic_smem_size_bytes)); -} - -void GpuKernel::launch(std::shared_ptr stream) { - if (!this->is_compiled()) { - ERR(InvalidUsageError, "Kernel is not compiled yet."); - } - ctx_->get_gpu_manager()->launch(function_, grid_dim_, block_dim_, - smem_bytes_, stream, - this->params_ptr_.data(), nullptr); -} - -} // namespace ark diff --git a/ark/gpu/gpu_kernel.cpp b/ark/gpu/gpu_kernel.cpp new file mode 100644 index 000000000..743322d3a --- /dev/null +++ b/ark/gpu/gpu_kernel.cpp @@ -0,0 +1,78 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include "gpu/gpu_kernel.h" + +#include +#include + +#include "gpu/gpu.h" +#include "gpu/gpu_compile.h" +#include "gpu/gpu_logging.h" + +namespace ark { + +GpuKernel::GpuKernel(int gpu_id, const std::string& code, + const std::array& block_dim, + const std::array& grid_dim, size_t smem_bytes, + const std::string& kernel_name, + std::initializer_list> args) { + this->init(gpu_id, code, block_dim, grid_dim, smem_bytes, kernel_name, + args); +} + +void GpuKernel::init(int gpu_id, const std::string& code, + const std::array& block_dim, + const std::array& grid_dim, size_t smem_bytes, + const std::string& kernel_name, + std::initializer_list> args) { + gpu_manager_ = GpuManager::get_instance(gpu_id); + code_ = code; + block_dim_ = block_dim; + grid_dim_ = grid_dim; + smem_bytes_ = smem_bytes; + kernel_name_ = kernel_name; + params_ptr_.resize(args.size()); + args_.resize(args.size()); + if (kernel_name_.size() == 0) { + ERR(InvalidUsageError, "Invalid kernel name: ", kernel_name_); + } + size_t idx = 0; + for (auto& pair : args) { + args_[idx].reset(new uint8_t[pair.second]); + std::memcpy(args_[idx].get(), &(pair.first), pair.second); + params_ptr_[idx] = static_cast(args_[idx].get()); + idx++; + } +} + +void GpuKernel::compile() { + int max_reg_per_block = gpu_manager_->info().max_registers_per_block; + int max_reg_per_thread = gpu_manager_->info().max_registers_per_thread; + int max_reg_cnt = + max_reg_per_block / (block_dim_[0] * block_dim_[1] * block_dim_[2]); + if (max_reg_cnt >= max_reg_per_thread) { + max_reg_cnt = max_reg_per_thread - 1; + } + bin_ = gpu_compile({code_}, gpu_manager_->info().arch, max_reg_cnt); + GLOG_DRV(gpuModuleLoadData(&module_, bin_.c_str())); + GLOG_DRV(gpuModuleGetFunction(&function_, module_, kernel_name_.c_str())); + + int static_smem_size_bytes; + GLOG_DRV(gpuFuncGetAttribute(&static_smem_size_bytes, + gpuFuncAttributeSharedSizeBytes, function_)); + int dynamic_smem_size_bytes = smem_bytes_ - static_smem_size_bytes; + GLOG_DRV(gpuFuncSetAttribute(function_, + gpuFuncAttributeMaxDynamicSharedSizeBytes, + dynamic_smem_size_bytes)); +} + +void GpuKernel::launch(std::shared_ptr stream) { + if (!this->is_compiled()) { + ERR(InvalidUsageError, "Kernel is not compiled yet."); + } + gpu_manager_->launch(function_, grid_dim_, block_dim_, smem_bytes_, stream, + params_ptr_.data(), nullptr); +} + +} // namespace ark diff --git a/ark/gpu/gpu_kernel.h b/ark/gpu/gpu_kernel.h index 18149fc3f..ab08c4156 100644 --- a/ark/gpu/gpu_kernel.h +++ b/ark/gpu/gpu_kernel.h @@ -14,19 +14,24 @@ namespace ark { class GpuKernel { public: - GpuKernel(std::shared_ptr ctx, const std::string& codes, + GpuKernel() {} + GpuKernel(int gpu_id, const std::string& codes, const std::array& block_dim, const std::array& grid_dim, size_t smem_bytes, const std::string& kernel_name, - std::initializer_list, size_t>> - args = {}); + std::initializer_list> args = {}); + void init(int gpu_id, const std::string& codes, + const std::array& block_dim, + const std::array& grid_dim, size_t smem_bytes, + const std::string& kernel_name, + std::initializer_list> args = {}); void compile(); void launch(std::shared_ptr stream); protected: - std::shared_ptr ctx_; - std::string codes_; + std::shared_ptr gpu_manager_; + std::string code_; std::array block_dim_; std::array grid_dim_; int smem_bytes_; @@ -35,7 +40,7 @@ class GpuKernel { gpuModule module_; gpuFunction function_ = nullptr; std::vector params_ptr_; - std::vector> args_; + std::vector> args_; bool is_compiled() const { return function_ != nullptr; } }; 
diff --git a/ark/gpu/gpu_kernel_test.cc b/ark/gpu/gpu_kernel_test.cc deleted file mode 100644 index 3230f6326..000000000 --- a/ark/gpu/gpu_kernel_test.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include "gpu/gpu_kernel.h" - -#include "gpu/gpu_loop_kernel.h" -#include "include/ark.h" -#include "unittest/unittest_utils.h" - -const std::string void_kernel = "extern \"C\" __global__ void kernel() {}"; - -ark::unittest::State test_gpu_kernel() { - auto ctx = ark::GpuContext::get_context(0, 1); - ark::GpuKernel kernel(ctx, void_kernel, {1, 1, 1}, {1, 1, 1}, 0, "kernel"); - kernel.compile(); - return ark::unittest::SUCCESS; -} - -// -const std::string test_kernel_loop_void = - "__device__ void ark_loop_body(char *_buf, int _iter) {\n" - " // Do nothing. Print iteration counter.\n" - " if (threadIdx.x == 0 && blockIdx.x == 0) {\n" - " if (_iter % 50 == 49) {\n" - " printf(\".\\n\");\n" - " } else {\n" - " printf(\".\");\n" - " }\n" - " }\n" - "}\n"; - -ark::unittest::State test_gpu_loop_kernel() { - auto ctx = ark::GpuContext::get_context(0, 1); - ctx->freeze(); - - ark::GpuLoopKernel glk{ctx, - "test_kernel_loop_void", - {test_kernel_loop_void}, - ctx->get_gpu_manager()->info().num_sm, - 1, - 0}; - glk.compile(); - glk.load(); - - glk.launch(ctx->get_gpu_manager()->create_stream()); - glk.run(100); - glk.stop(); - - return ark::unittest::SUCCESS; -} - -int main() { - ark::init(); - UNITTEST(test_gpu_kernel); - UNITTEST(test_gpu_loop_kernel); - return 0; -} diff --git a/ark/gpu/gpu_kernel_test.cpp b/ark/gpu/gpu_kernel_test.cpp new file mode 100644 index 000000000..7e88db966 --- /dev/null +++ b/ark/gpu/gpu_kernel_test.cpp @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "gpu/gpu_kernel.h" + +#include "gpu/gpu_loop_kernel.h" +#include "unittest/unittest_utils.h" + +const std::string void_kernel = "extern \"C\" __global__ void kernel() {}"; + +ark::unittest::State test_gpu_kernel() { + ark::GpuKernel kernel(0, void_kernel, {1, 1, 1}, {1, 1, 1}, 0, "kernel"); + kernel.compile(); + return ark::unittest::SUCCESS; +} + +// +// const std::string test_kernel_loop_void = +// "__device__ void ark_loop_body(char *_buf, int _iter) {\n" +// " // Do nothing. 
Print iteration counter.\n" +// "    if (threadIdx.x == 0 && blockIdx.x == 0) {\n" +// "        if (_iter % 50 == 49) {\n" +// "            printf(\".\\n\");\n" +// "        } else {\n" +// "            printf(\".\");\n" +// "        }\n" +// "    }\n" +// "}\n"; + +// ark::unittest::State test_gpu_loop_kernel() { +//     int num_sm = ark::GpuManager::get_instance(0)->info().num_sm; +//     ark::GpuLoopKernel glk(0, "test_kernel_loop_void", test_kernel_loop_void, +//                            static_cast<size_t>(num_sm), 1, 0, 0); +//     glk.compile(); +//     glk.load(); + +//     glk.launch(); +//     glk.run(100); +//     glk.stop(); + +//     return ark::unittest::SUCCESS; +// } + +int main() { + UNITTEST(test_gpu_kernel); + // UNITTEST(test_gpu_loop_kernel); + return 0; +} diff --git a/ark/gpu/gpu_logging.h b/ark/gpu/gpu_logging.h index ac2a4abc6..b14435b8b 100644 --- a/ark/gpu/gpu_logging.h +++ b/ark/gpu/gpu_logging.h @@ -5,7 +5,6 @@ #define ARK_GPU_LOGGING_H_ #include "gpu/gpu.h" -#include "include/ark.h" #include "logging.h" #define GLOG(cmd) \ diff --git a/ark/gpu/gpu_loop_kernel.cc b/ark/gpu/gpu_loop_kernel.cc deleted file mode 100644 index 791a22b5a..000000000 --- a/ark/gpu/gpu_loop_kernel.cc +++ /dev/null @@ -1,278 +0,0 @@ -#include "gpu/gpu_loop_kernel.h" - -#include - -#include "env.h" -#include "file_io.h" -#include "gpu/gpu.h" -#include "gpu/gpu_event.h" -#include "gpu/gpu_logging.h" - -#define MAX_LOOP_COUNTER 10000000 - -#if defined(ARK_CUDA) -#include <cuda/atomic> -static int atomicLoadRelaxed(int* ptr) { - return cuda::atomic_ref<int, cuda::thread_scope_system>{*ptr}.load( - cuda::memory_order_relaxed); -} -static void atomicStoreRelaxed(int* ptr, int val) { - cuda::atomic_ref<int, cuda::thread_scope_system>{*ptr}.store( - val, cuda::memory_order_relaxed); -} -#elif defined(ARK_ROCM) -static int atomicLoadRelaxed(int* ptr) { - return __atomic_load_n(ptr, __ATOMIC_RELAXED); -} -static void atomicStoreRelaxed(int* ptr, int val) { - __atomic_store_n(ptr, val, __ATOMIC_RELAXED); -} -#endif // defined(ARK_ROCM) - -namespace ark { - -GpuLoopKernel::GpuLoopKernel(std::shared_ptr<GpuContext> ctx, - const std::string& name, - const std::vector<std::string>& codes_body, - int num_sm, int num_warp, unsigned int smem_bytes) - : GpuKernel( - ctx, {}, - {num_warp * ctx->get_gpu_manager()->info().threads_per_warp, 1, 1}, - {num_sm, 1, 1}, (smem_bytes < 4) ?
4 : smem_bytes, name, - {{nullptr, sizeof(GpuPtr)}}), - timer_begin_(ctx->get_gpu_manager()->create_event()), - timer_end_(ctx->get_gpu_manager()->create_event()) { - flag_ = ctx->get_gpu_manager()->malloc_host( - sizeof(int), gpuHostAllocMapped | gpuHostAllocWriteCombined); - *(int**)params_ptr_[0] = (int*)flag_->ref(); - - auto& code_path = get_env().enforce_kernel_code_path; - if (!code_path.empty()) { - LOG(INFO, "Enforce kernel code path: ", code_path); - codes_ = std::move(read_file(code_path)); - } else if (codes_body.size() > 0) { - const std::string* ark_loop_body_code = nullptr; - for (auto& code : codes_body) { - if (code.find("ark_loop_body") != std::string::npos) { - ark_loop_body_code = &code; - break; - } - } - assert(ark_loop_body_code != nullptr); - - std::stringstream ss; - // clang-format off - ss << - "// THIS KERNEL IS MACHINE-GENERATED BY ARK.\n" - "#define ARK_THREADS_PER_BLOCK " << block_dim_[0] << "\n" - "__device__ int _ITER = 0;\n" - "#include \"ark_kernels.h\"\n" - "using namespace ark;\n" - "__device__ sync::State " ARK_LSS_NAME ";\n" - "__device__ char *" ARK_BUF_NAME ";\n" - << *ark_loop_body_code << - "extern \"C\" __global__ __launch_bounds__(" << block_dim_[0] << ", 1)\n" - "void " << kernel_name_ << "(int *_it)\n" - "{\n" - " char *_buf = " ARK_BUF_NAME ";\n" - " int *shared_mem = (int *)_ARK_SMEM;\n" - " for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) {\n" - " shared_mem[i] = 0;\n" - " }\n" - " for (;;) {\n" - " if (threadIdx.x == 0 && blockIdx.x == 0) {\n" - " int iter;\n" - " while ((iter = atomicLoadRelaxed(_it)) == 0) {}\n" - " _ITER = iter;\n" - " }\n" - " sync_gpu<" << num_sm << ">(" ARK_LSS_NAME ");\n" - " if (_ITER < 0) {\n" - " return;\n" - " }\n" - " for (int _i = 0; _i < _ITER; ++_i) {\n" - " ark_loop_body(_buf, _i);\n" - " sync_gpu<" << num_sm << ">(" ARK_LSS_NAME ");\n" - " }\n" - " if (threadIdx.x == 0 && blockIdx.x == 0) {\n" - " atomicStoreRelaxed(_it, 0);\n" - " }\n" - " sync_gpu<" << num_sm << ">(" ARK_LSS_NAME ");\n" - " }\n" - "}\n"; - // clang-format on - codes_ = std::move(ss.str()); - } -} - -void GpuLoopKernel::load() { - if (!is_compiled()) { - ERR(InvalidUsageError, "Need to compile first before initialization."); - } - if (stream_ != nullptr) { - // Wait until previous works finish. - wait(); - return; - } - // Initialize global variables in the loop kernel. 
- std::shared_ptr<GpuManager> manager = ctx_->get_gpu_manager(); - void* buf_ptr_val = ctx_->get_data_memory()->ref(); - GpuPtr lss_ptr_addr; - GpuPtr buf_ptr_addr; - size_t tmp = 0; - GLOG_DRV(gpuModuleGetGlobal(&lss_ptr_addr, &tmp, module_, ARK_LSS_NAME)); - GLOG_DRV(gpuModuleGetGlobal(&buf_ptr_addr, &tmp, module_, ARK_BUF_NAME)); - std::array<int, 4> data = {0, 0, 0, 0}; - manager->memcpy_htod((void*)lss_ptr_addr, 0, data.data(), 0, - sizeof(int) * data.size()); - manager->memcpy_htod((void*)buf_ptr_addr, 0, &buf_ptr_val, 0, - sizeof(GpuPtr)); - // TODO: remove this hack - GpuPtr lss_0_ptr_addr; - GpuPtr lss_1_ptr_addr; - gpuDrvError ret = - gpuModuleGetGlobal(&lss_0_ptr_addr, &tmp, module_, ARK_LSS_NAME "_0"); - if (ret == gpuDrvSuccess) { - manager->memcpy_htod((void*)lss_0_ptr_addr, 0, data.data(), 0, - sizeof(int) * data.size()); - } else if (ret != gpuErrorNotFound) { - GLOG_DRV(ret); - } - ret = gpuModuleGetGlobal(&lss_1_ptr_addr, &tmp, module_, ARK_LSS_NAME "_1"); - if (ret == gpuDrvSuccess) { - manager->memcpy_htod((void*)lss_1_ptr_addr, 0, data.data(), 0, - sizeof(int) * data.size()); - } else if (ret != gpuErrorNotFound) { - GLOG_DRV(ret); - } - // set the data buffer pointers of remote gpus - int nrph = get_env().num_ranks_per_host; - int nodes_id = ctx_->gpu_id() / nrph; - // only set the GPU remote data buf pointers of the GPUs on the same node - for (int i = nodes_id * nrph; - i < (nodes_id + 1) * nrph && i < ctx_->world_size(); i++) { - void* data_buf_value = ctx_->get_data_memory(i)->ref(); - if (data_buf_value == 0) { - continue; - } - GpuPtr data_buf_ptr; - std::string data_buf_name = ARK_BUF_NAME + std::to_string(i); - gpuDrvError _e = gpuModuleGetGlobal(&data_buf_ptr, &tmp, module_, - data_buf_name.c_str()); - if (_e == gpuErrorNotFound) { - LOG(DEBUG, "global variable ", data_buf_name, " not found"); - continue; - } - LOG(DEBUG, data_buf_name, " data_buf_ptr=", std::hex, data_buf_ptr, - " data_buf_value=", data_buf_value); - manager->memcpy_htod((void*)data_buf_ptr, 0, &data_buf_value, 0, - sizeof(GpuPtr)); - } - - std::shared_ptr<GpuCommSw> comm = ctx_->get_comm_sw(); - if (comm->get_proxy_channels_num() > 0) { - GpuPtr channel_addr; - GLOG_DRV(gpuModuleGetGlobal(&channel_addr, &tmp, module_, - "_ARK_PROXY_CHANS")); - const void* chans_ref = comm->get_proxy_channels_ref(); - size_t chans_bytes = comm->get_proxy_channels_bytes(); - manager->memcpy_htod((void*)channel_addr, 0, - const_cast<void*>(chans_ref), 0, chans_bytes); - } - if (comm->get_sm_channels_num() > 0) { - GpuPtr channel_addr; - GLOG_DRV( - gpuModuleGetGlobal(&channel_addr, &tmp, module_, "_ARK_SM_CHANS")); - const void* chans_ref = comm->get_sm_channels_ref(); - size_t chans_bytes = comm->get_sm_channels_bytes(); - manager->memcpy_htod((void*)channel_addr, 0, - const_cast<void*>(chans_ref), 0, chans_bytes); - } -} - -void GpuLoopKernel::launch(std::shared_ptr<GpuStream> stream, - bool disable_timing) { - elapsed_msec_ = -1; - if (!is_compiled()) { - ERR(InvalidUsageError, "Need to compile first before initialization."); - } else if (stream == nullptr) { - ERR(InvalidUsageError, "Given an invalid stream."); - } else if (stream_ != nullptr) { - if (stream_ == stream) { - LOG(WARN, "Ignore launching twice."); - return; - } else { - ERR(InvalidUsageError, "This loop kernel is already running."); - } - } - if (!disable_timing) { - timer_begin_->record(stream); - } - - ctx_->get_comm_sw()->launch_request_loop(); - - // Initialize loop flags.
- atomicStoreRelaxed(flag_->ref(), 0); - GpuKernel::launch(stream); - stream_ = stream; - if (!disable_timing) { - timer_end_->record(stream); - is_recording_ = true; - } -} - -void GpuLoopKernel::run(int iter) { - if (iter > 0) { - while (atomicLoadRelaxed(flag_->ref()) > 0) { - } - atomicStoreRelaxed(flag_->ref(), iter); - } -} - -bool GpuLoopKernel::poll() { return atomicLoadRelaxed(flag_->ref()) <= 0; } - -void GpuLoopKernel::wait() { - int cnt = MAX_LOOP_COUNTER; - while (atomicLoadRelaxed(flag_->ref()) > 0) { - if (--cnt > 0) { - continue; - } - // Check if the kernel encountered an error. - gpuError res = stream_->query(); - if (res == gpuSuccess) { - if (atomicLoadRelaxed(flag_->ref()) > 0) { - LOG(WARN, "Stream is finished but the loop flag is still set."); - break; - } else { - LOG(WARN, - "wait() is delayed by a stream query. Regarding " - "timing measurements may be inaccurate."); - break; - } - } else if (res == gpuErrorNotReady) { - cnt = MAX_LOOP_COUNTER; - } else { - GLOG(res); - } - } -} - -void GpuLoopKernel::stop() { - wait(); - atomicStoreRelaxed(flag_->ref(), -1); - stream_->sync(); - if (is_recording_) { - elapsed_msec_ = timer_end_->elapsed_msec(*timer_begin_); - is_recording_ = false; - } - stream_ = nullptr; - ctx_->get_comm_sw()->stop_request_loop(); -} - -float GpuLoopKernel::get_elapsed_msec() const { - if (is_recording_) { - ERR(InvalidUsageError, "Need to stop the kernel first."); - } - return elapsed_msec_; -} - -} // namespace ark diff --git a/ark/gpu/gpu_loop_kernel.cpp b/ark/gpu/gpu_loop_kernel.cpp new file mode 100644 index 000000000..32610c20b --- /dev/null +++ b/ark/gpu/gpu_loop_kernel.cpp @@ -0,0 +1,246 @@ +#include "gpu/gpu_loop_kernel.h" + +#include + +#include "codegen/codegen.hpp" +#include "env.h" +#include "file_io.h" +#include "gpu/gpu.h" +#include "gpu/gpu_event.h" +#include "gpu/gpu_logging.h" + +#define MAX_LOOP_COUNTER 10000000 + +#if defined(ARK_CUDA) +#include <cuda/atomic> +static int atomicLoadRelaxed(int* ptr) { + return cuda::atomic_ref<int, cuda::thread_scope_system>{*ptr}.load( + cuda::memory_order_relaxed); +} +static void atomicStoreRelaxed(int* ptr, int val) { + cuda::atomic_ref<int, cuda::thread_scope_system>{*ptr}.store( + val, cuda::memory_order_relaxed); +} +#elif defined(ARK_ROCM) +static int atomicLoadRelaxed(int* ptr) { + return __atomic_load_n(ptr, __ATOMIC_RELAXED); +} +static void atomicStoreRelaxed(int* ptr, int val) { + __atomic_store_n(ptr, val, __ATOMIC_RELAXED); +} +#endif // defined(ARK_ROCM) + +namespace ark { + +class GpuLoopKernel::Impl { + public: + Impl(int gpu_id, const std::string& plan, const std::string& name, + size_t smem_bytes); + ~Impl() = default; + + protected: + friend class GpuLoopKernel; + + std::shared_ptr<CodeGenerator> codegen_; + std::shared_ptr<GpuEvent> timer_begin_; + std::shared_ptr<GpuEvent> timer_end_; + std::shared_ptr<GpuMemory> buffer_; + std::shared_ptr<GpuHostMemory> flag_; + std::shared_ptr<GpuStream> stream_; +}; + +GpuLoopKernel::Impl::Impl(int gpu_id, const std::string& plan, + const std::string& name, + [[maybe_unused]] size_t smem_bytes) { + auto gpu_manager = GpuManager::get_instance(gpu_id); + codegen_ = std::make_shared<CodeGenerator>(plan, name); + timer_begin_ = gpu_manager->create_event(); + timer_end_ = gpu_manager->create_event(); + buffer_ = gpu_manager->malloc(codegen_->total_memory_bytes()); + flag_ = gpu_manager->malloc_host( + sizeof(int), gpuHostAllocMapped | gpuHostAllocWriteCombined); + stream_ = gpu_manager->create_stream(); +} + +GpuLoopKernel::GpuLoopKernel(int gpu_id, const std::string& plan, + const std::string& name, size_t smem_bytes) + : GpuKernel(), + impl_(std::make_shared<Impl>(gpu_id, plan, name, smem_bytes)) {
auto gpu_manager = GpuManager::get_instance(gpu_id); + int threads_per_block = + static_cast<int>(impl_->codegen_->num_warps_per_proc() * + gpu_manager->info().threads_per_warp); + int num_sm = static_cast<int>(impl_->codegen_->num_procs()); + int *flag = impl_->flag_->ref(); + this->init(gpu_id, impl_->codegen_->code(), {threads_per_block, 1, 1}, + {num_sm, 1, 1}, (smem_bytes < 4) ? 4 : smem_bytes, name, + {{flag, sizeof(flag)}}); +} + +void GpuLoopKernel::load() { + if (!is_compiled()) { + ERR(InvalidUsageError, "Need to compile first before initialization."); + } + if (is_launched_) { + // Wait until previous works finish. + wait(); + return; + } + // Initialize global variables in the loop kernel. + void* buf_ptr_val = impl_->buffer_->ref(); + GpuPtr lss_ptr_addr; + GpuPtr buf_ptr_addr; + size_t tmp = 0; + GLOG_DRV(gpuModuleGetGlobal(&lss_ptr_addr, &tmp, module_, ARK_LSS_NAME)); + GLOG_DRV(gpuModuleGetGlobal(&buf_ptr_addr, &tmp, module_, ARK_BUF_NAME)); + std::array<int, 4> data = {0, 0, 0, 0}; + gpu_manager_->memcpy_htod((void*)lss_ptr_addr, 0, data.data(), 0, + sizeof(int) * data.size()); + gpu_manager_->memcpy_htod((void*)buf_ptr_addr, 0, &buf_ptr_val, 0, + sizeof(GpuPtr)); + // TODO: remove this hack + GpuPtr lss_0_ptr_addr; + GpuPtr lss_1_ptr_addr; + gpuDrvError ret = + gpuModuleGetGlobal(&lss_0_ptr_addr, &tmp, module_, ARK_LSS_NAME "_0"); + if (ret == gpuDrvSuccess) { + gpu_manager_->memcpy_htod((void*)lss_0_ptr_addr, 0, data.data(), 0, + sizeof(int) * data.size()); + } else if (ret != gpuErrorNotFound) { + GLOG_DRV(ret); + } + ret = gpuModuleGetGlobal(&lss_1_ptr_addr, &tmp, module_, ARK_LSS_NAME "_1"); + if (ret == gpuDrvSuccess) { + gpu_manager_->memcpy_htod((void*)lss_1_ptr_addr, 0, data.data(), 0, + sizeof(int) * data.size()); + } else if (ret != gpuErrorNotFound) { + GLOG_DRV(ret); + } + // set the data buffer pointers of remote gpus + // int nrph = get_env().num_ranks_per_host; + // int nodes_id = gpu_manager_->get_gpu_id() / nrph; + // // only set the GPU remote data buf pointers of the GPUs on the same node + // for (int i = nodes_id * nrph; + // i < (nodes_id + 1) * nrph && i < ctx_->world_size(); i++) { + // void* data_buf_value = ctx_->get_data_memory(i)->ref(); + // if (data_buf_value == 0) { + // continue; + // } + // GpuPtr data_buf_ptr; + // std::string data_buf_name = ARK_BUF_NAME + std::to_string(i); + // gpuDrvError _e = gpuModuleGetGlobal(&data_buf_ptr, &tmp, module_, + // data_buf_name.c_str()); + // if (_e == gpuErrorNotFound) { + // LOG(DEBUG, "global variable ", data_buf_name, " not found"); + // continue; + // } + // LOG(DEBUG, data_buf_name, " data_buf_ptr=", std::hex, data_buf_ptr, + // " data_buf_value=", data_buf_value); + // gpu_manager_->memcpy_htod((void*)data_buf_ptr, 0, &data_buf_value, 0, + // sizeof(GpuPtr)); + // } + + // std::shared_ptr<GpuCommSw> comm = ctx_->get_comm_sw(); + // if (comm->get_proxy_channels_num() > 0) { + // GpuPtr channel_addr; + // GLOG_DRV(gpuModuleGetGlobal(&channel_addr, &tmp, module_, + // "_ARK_PROXY_CHANS")); + // const void* chans_ref = comm->get_proxy_channels_ref(); + // size_t chans_bytes = comm->get_proxy_channels_bytes(); + // gpu_manager_->memcpy_htod((void*)channel_addr, 0, + // const_cast<void*>(chans_ref), 0, chans_bytes); + // } + // if (comm->get_sm_channels_num() > 0) { + // GpuPtr channel_addr; + // GLOG_DRV( + // gpuModuleGetGlobal(&channel_addr, &tmp, module_, + // "_ARK_SM_CHANS")); + // const void* chans_ref = comm->get_sm_channels_ref(); + // size_t chans_bytes = comm->get_sm_channels_bytes(); + // 
gpu_manager_->memcpy_htod((void*)channel_addr, 0, + // const_cast<void*>(chans_ref), 0, chans_bytes); + // } +} + +void GpuLoopKernel::launch(bool disable_timing) { + elapsed_msec_ = -1; + if (!is_compiled()) { + ERR(InvalidUsageError, "Need to compile first before initialization."); + } else if (is_launched_) { + LOG(WARN, "Ignore launching twice."); + return; + } + if (!disable_timing) { + impl_->timer_begin_->record(impl_->stream_); + } + + // ctx_->get_comm_sw()->launch_request_loop(); + + // Initialize loop flags. + atomicStoreRelaxed(impl_->flag_->ref(), 0); + GpuKernel::launch(impl_->stream_); + if (!disable_timing) { + impl_->timer_end_->record(impl_->stream_); + is_recording_ = true; + } + is_launched_ = true; +} + +void GpuLoopKernel::run(int iter) { + if (iter > 0) { + while (atomicLoadRelaxed(impl_->flag_->ref()) > 0) { + } + atomicStoreRelaxed(impl_->flag_->ref(), iter); + } +} + +bool GpuLoopKernel::poll() { + return atomicLoadRelaxed(impl_->flag_->ref()) <= 0; +} + +void GpuLoopKernel::wait() { + int cnt = MAX_LOOP_COUNTER; + while (atomicLoadRelaxed(impl_->flag_->ref()) > 0) { + if (--cnt > 0) { + continue; + } + // Check if the kernel encountered an error. + gpuError res = impl_->stream_->query(); + if (res == gpuSuccess) { + if (atomicLoadRelaxed(impl_->flag_->ref()) > 0) { + LOG(WARN, "Stream is finished but the loop flag is still set."); + break; + } else { + LOG(WARN, + "wait() is delayed by a stream query. Resulting " + "timing measurements may be inaccurate."); + break; + } + } else if (res == gpuErrorNotReady) { + cnt = MAX_LOOP_COUNTER; + } else { + GLOG(res); + } + } +} + +void GpuLoopKernel::stop() { + wait(); + atomicStoreRelaxed(impl_->flag_->ref(), -1); + impl_->stream_->sync(); + if (is_recording_) { + elapsed_msec_ = impl_->timer_end_->elapsed_msec(*impl_->timer_begin_); + is_recording_ = false; + } + is_launched_ = false; + // ctx_->get_comm_sw()->stop_request_loop(); +} + +float GpuLoopKernel::get_elapsed_msec() const { + if (is_recording_) { + ERR(InvalidUsageError, "Need to stop the kernel first."); + } + return elapsed_msec_; +} + +} // namespace ark diff --git a/ark/gpu/gpu_loop_kernel.h b/ark/gpu/gpu_loop_kernel.h index 514dfa746..ec9beb626 100644 --- a/ark/gpu/gpu_loop_kernel.h +++ b/ark/gpu/gpu_loop_kernel.h @@ -15,11 +15,10 @@ namespace ark { class GpuLoopKernel : public GpuKernel { public: - GpuLoopKernel(std::shared_ptr<GpuContext> ctx, const std::string &name, - const std::vector<std::string> &codes, int num_sm, - int num_warp, unsigned int smem_bytes); + GpuLoopKernel(int gpu_id, const std::string &plan, const std::string &name, + size_t smem_bytes); - void launch(std::shared_ptr<GpuStream> stream, bool disable_timing = true); + void launch(bool disable_timing = true); void load(); void run(int iter = 1); bool poll(); @@ -29,13 +28,10 @@ class GpuLoopKernel : public GpuKernel { float get_elapsed_msec() const; private: - std::shared_ptr<GpuEvent> timer_begin_; - std::shared_ptr<GpuEvent> timer_end_; + class Impl; + std::shared_ptr<Impl> impl_; - int threads_per_warp_ = -1; - std::shared_ptr<GpuHostMemory> flag_ = nullptr; - - std::shared_ptr<GpuStream> stream_ = nullptr; + bool is_launched_ = false; bool is_recording_ = false; float elapsed_msec_ = -1; }; diff --git a/ark/gpu/gpu_manager.cc b/ark/gpu/gpu_manager.cpp similarity index 100% rename from ark/gpu/gpu_manager.cc rename to ark/gpu/gpu_manager.cpp diff --git a/ark/gpu/gpu_memory.cc b/ark/gpu/gpu_memory.cpp similarity index 100% rename from ark/gpu/gpu_memory.cc rename to ark/gpu/gpu_memory.cpp
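For orientation, the host-side flag protocol implemented above reduces to the following sketch. This is hypothetical driver code, not part of this diff: `plan_json` is a placeholder for a serialized plan string, and compilation through the GpuKernel base class is assumed to have already happened, since load() checks is_compiled().

    // Sketch only; names match the GpuLoopKernel API introduced in this diff.
    ark::GpuLoopKernel kernel(/*gpu_id=*/0, plan_json, "loop_kernel",
                              /*smem_bytes=*/0);  // clamped to >= 4 internally
    kernel.load();                            // write ARK_LSS/ARK_BUF device globals
    kernel.launch(/*disable_timing=*/false);  // start the persistent kernel, begin timing
    kernel.run(100);                          // request 100 iterations via the host-mapped flag
    kernel.wait();                            // spin until the flag drops to <= 0 (or the stream errors)
    kernel.stop();                            // flag := -1, sync the stream, finish timing
    float ms = kernel.get_elapsed_msec();     // only valid once stop() has run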
diff --git a/ark/gpu/gpu_stream.cc b/ark/gpu/gpu_stream.cpp similarity index 100% rename from ark/gpu/gpu_stream.cc rename to ark/gpu/gpu_stream.cpp diff --git a/ark/half.cc b/ark/half.cpp similarity index 100% rename from ark/half.cc rename to ark/half.cpp diff --git a/ark/half.h b/ark/half.h index 6be1a29b1..1820bb3d3 100644 --- a/ark/half.h +++ b/ark/half.h @@ -234,6 +234,8 @@ struct alignas(2) half_t { int mantissa() const { return int(storage & 0x3ff); } }; +using fp16 = half_t; + /// Assignment from half_t template <> half_t& half_t::operator=(half_t const& x); diff --git a/ark/half_test.cc b/ark/half_test.cpp similarity index 99% rename from ark/half_test.cc rename to ark/half_test.cpp index a94c21759..2a73a942d 100644 --- a/ark/half_test.cc +++ b/ark/half_test.cpp @@ -3,7 +3,6 @@ #include "half.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_half() { @@ -245,7 +244,6 @@ ark::unittest::State test_half_error() { } int main() { - ark::init(); UNITTEST(test_half); UNITTEST(test_half_error); return 0; diff --git a/ark/include/ark.h b/ark/include/ark.hpp similarity index 85% rename from ark/include/ark.h rename to ark/include/ark.hpp index 9b77122e4..cc096c457 100644 --- a/ark/include/ark.h +++ b/ark/include/ark.hpp @@ -11,10 +11,10 @@ #define ARK_PATCH 0 #define ARK_VERSION (ARK_MAJOR * 10000 + ARK_MINOR * 100 + ARK_PATCH) -#include "ark/dims.h" -#include "ark/error.h" -#include "ark/executor.h" -#include "ark/model.h" +#include "ark/dims.hpp" +#include "ark/error.hpp" +// #include "ark/executor.hpp" +#include "ark/model.hpp" namespace ark { diff --git a/ark/include/ark/dims.h b/ark/include/ark/dims.hpp similarity index 76% rename from ark/include/ark/dims.h rename to ark/include/ark/dims.hpp index c15d7537e..ffda8a649 100644 --- a/ark/include/ark/dims.h +++ b/ark/include/ark/dims.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_DIMS_H -#define ARK_DIMS_H +#ifndef ARK_DIMS_HPP +#define ARK_DIMS_HPP #include #include @@ -11,16 +11,17 @@ namespace ark { // Data type for dimension. -typedef long long int DimType; +typedef int64_t DimType; // DIMS_LEN is the maximum number of dimensions of a tensor. If a tensor // has less than DIMS_LEN dimensions, the remaining dimensions will be NO_DIM. -enum { DIMS_LEN = 4, NO_DIM = -1 }; +constexpr DimType NO_DIM = -1; +constexpr DimType DIMS_LEN = 4; // Up-to-`DIMS_LEN`-dimensional vector. class Dims { private: - DimType data[DIMS_LEN]; + DimType data_[DIMS_LEN]; public: // Construct with given four dimensions. @@ -39,16 +40,22 @@ class Dims { int ndims() const; // Return a new Dims object with 4 valid dimensions by prepending 1s. Dims dims4() const; + // Return true if all valid dimensions are zero. + bool is_zeros() const; // Return true if the dimensions are empty. bool is_no_dim() const; // Return true if the dimensions are invalid. bool is_invalid() const; + // Return a vector of valid dimensions. + std::vector<DimType> vector() const; // Insert a dimension at the given index. void insert(int idx, DimType dim); // Erase the dimension at the given index and return the erased dimension.
DimType erase(int idx); - std::string serialize() const; + std::string serialize(int indent = -1) const; + + static Dims deserialize(const std::string &serialized); DimType &operator[](int idx); @@ -58,10 +65,10 @@ class Dims { friend bool operator==(const Dims &a, const Dims &b); friend bool operator!=(const Dims &a, const Dims &b); - - friend std::ostream &operator<<(std::ostream &os, const Dims &dims); }; +std::ostream &operator<<(std::ostream &os, const Dims &dims); + } // namespace ark -#endif // ARK_DIMS_H +#endif // ARK_DIMS_HPP diff --git a/ark/include/ark/error.h b/ark/include/ark/error.h deleted file mode 100644 index 2326c47f4..000000000 --- a/ark/include/ark/error.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#ifndef ARK_ERROR_H -#define ARK_ERROR_H - -#include -#include - -namespace ark { - -class InvalidUsageError : public std::runtime_error { - public: - InvalidUsageError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class ModelError : public std::runtime_error { - public: - ModelError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class SchedulerError : public std::runtime_error { - public: - SchedulerError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class ExecutorError : public std::runtime_error { - public: - ExecutorError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class SystemError : public std::runtime_error { - public: - SystemError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class GpuError : public std::runtime_error { - public: - GpuError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class RuntimeError : public std::runtime_error { - public: - RuntimeError(const std::string &msg) : std::runtime_error(msg) {} -}; - -class UnitTestError : public std::runtime_error { - public: - UnitTestError(const std::string &msg) : std::runtime_error(msg) {} -}; - -} // namespace ark - -#endif // ARK_ERROR_H diff --git a/ark/include/ark/executor.h b/ark/include/ark/executor.hpp similarity index 76% rename from ark/include/ark/executor.h rename to ark/include/ark/executor.hpp index 4bfa3a9dc..e0419ad83 100644 --- a/ark/include/ark/executor.h +++ b/ark/include/ark/executor.hpp @@ -1,10 +1,11 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_EXECUTOR_H -#define ARK_EXECUTOR_H +#ifndef ARK_EXECUTOR_HPP +#define ARK_EXECUTOR_HPP -#include "model.h" +#include +#include namespace ark { @@ -12,18 +13,24 @@ namespace ark { class Executor { public: /// Constructor. - Executor(int rank, int world_size, Model &model, const std::string &name, - int num_warps_per_sm = 16); + Executor(int rank, int world_size, const std::string &plan, + const std::string &name = "DefaultExecutor"); + ~Executor(); + /// Compile the model. This must be called before `launch()`. void compile(); + /// Launch the model (not running yet). This must be called after /// `compile()`. void launch(); + /// Run the model for `iter` iterations. void run(int iter); + /// Wait for the previous run to finish. void wait(); + /// Stop the model and return the elapsed time in milliseconds. /// Once this is called, we need to call `launch()` again to run the model /// again. 
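Taken together with ark::init() from the new init.hpp below, the intended lifecycle of this Executor interface is roughly the following. This is a minimal sketch, not code from this diff: `plan` is a placeholder for a serialized plan string, stop() is assumed to return the elapsed milliseconds as its comment states, and note that ark.hpp still has this header commented out at this point.

    #include "ark/executor.hpp"
    #include "ark/init.hpp"

    int main() {
        ark::init();
        std::string plan = "...";  // placeholder: a serialized plan
        ark::Executor exe(/*rank=*/0, /*world_size=*/1, plan);
        exe.compile();             // must precede launch()
        exe.launch();              // resident but idle until run()
        exe.run(/*iter=*/10);
        exe.wait();
        float elapsed = exe.stop();  // launch() is required again afterwards
        return 0;
    }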
@@ -36,4 +43,4 @@ class Executor { } // namespace ark -#endif // ARK_EXECUTOR_H +#endif // ARK_EXECUTOR_HPP diff --git a/ark/include/ark/init.hpp b/ark/include/ark/init.hpp new file mode 100644 index 000000000..00382f747 --- /dev/null +++ b/ark/include/ark/init.hpp @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_INIT_HPP +#define ARK_INIT_HPP + +namespace ark { + +/// Initialize the ARK runtime. +/// +/// This function should be called by the user before any other functions are +/// called. It is safe to call this function multiple times. +void init(); + +} // namespace ark + +#endif // ARK_INIT_HPP diff --git a/ark/include/ark/model.h b/ark/include/ark/model.h deleted file mode 100644 index a3bbc08cd..000000000 --- a/ark/include/ark/model.h +++ /dev/null @@ -1,1125 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#ifndef ARK_MODEL_H -#define ARK_MODEL_H - -#include -#include -#include -#include -#include - -#include "dims.h" - -namespace ark { - -class Tensor; -class CodeGenerator; -class BaseScheduler; -class SchedOp; -class Model; - -/// Type of tensor data. -class TensorType { - private: - const std::string name_; - const int bytes_; - const std::string type_str_; - - public: - TensorType(const std::string &name = "none", int bytes = 0, - const std::string &type_str = "void *"); - - bool operator==(const TensorType &other) const; - bool operator!=(const TensorType &other) const; - - int bytes() const; - const std::string &name() const; - const std::string &type_str() const; -}; - -const TensorType NONE; - -std::ostream &operator<<(std::ostream &os, const TensorType &type); - -#define REGISTER_TENSOR_TYPE(_type_name, _bytes, _type_str) \ - class TensorType_##_type_name : public TensorType { \ - public: \ - TensorType_##_type_name() \ - : TensorType{#_type_name, _bytes, _type_str} {} \ - }; \ - const TensorType_##_type_name _type_name; - -REGISTER_TENSOR_TYPE(FP32, 4, "fp32") -REGISTER_TENSOR_TYPE(FP16, 2, "fp16") -REGISTER_TENSOR_TYPE(BF16, 2, "bf16") -REGISTER_TENSOR_TYPE(INT32, 4, "i32") -REGISTER_TENSOR_TYPE(UINT32, 4, "ui32") -REGISTER_TENSOR_TYPE(INT8, 1, "i8") -REGISTER_TENSOR_TYPE(UINT8, 1, "ui8") -REGISTER_TENSOR_TYPE(BYTE, 1, "unsigned char") - -class GpuBuffer; -// TensorBuf refers to a data array that can be shared by multiple tensors. -class TensorBuf { - public: - TensorBuf(const DimType &bytes = 0, int id = -1); - TensorBuf(const TensorBuf &) = default; - - size_t get_buf_offset() const; - - DimType bytes; - int id; - bool immutable = true; - - protected: - std::shared_ptr buf = nullptr; - - friend class Tensor; - friend class DefaultScheduler; -}; - -/// Tensor is a view of a TensorBuf. -/// -/// Illustration of a single axis of a tensor: -/// -/// 0 off ldim -/// |------------|-------------shape-------------|---------------------------| -/// ^ <-----------------------------> ^ -/// | data range of this tensor | -/// +------------------------------------------+-----------+ -/// | -/// We call these "padding". -/// -class Tensor { - public: - /// Tensor constructor. - Tensor(const Dims &shape, const TensorType &type, TensorBuf *buf, - const Dims &ldims, const Dims &offs, const Dims &pads, bool exported, - int imported_rank, int id, const std::string &name); - Tensor(const Tensor &) = default; - - /// Copy contiguous data from a host buffer to the given tensor's (possibly - /// non-contiguous) data range. 
- /// - /// For example, say the tensor is a 2D float tensor with shape [2, 3], - /// ldims [2, 4], offs [0, 0], and pads [1, 1], then the data in the host - /// buffer is 0, 1, ..., 5. After writing, the data in the tensor will be: - /// - /// [[0, 1, 2, ?], - /// [3, 4, 5, ?]] - /// - /// where ? means the original unmodified value. - /// - /// @param buf The host buffer to copy from. The buffer must be large enough - /// to hold the data. - /// - void write(const void *buf); - - /// Copy (possibly non-contiguous) data from a tensor on GPU to a contiguous - /// host buffer. - /// - /// The given number of bytes is copied, in order of appearance on the - /// memory. This function assumes that @p buf is large enough to hold the - /// data. For example, say the tensor is a 2D float tensor with shape [2, - /// 3], ldims [2, 4], offs [0, 0], and pads [1, 1], then the data in the - /// tensor is: - /// - /// [[0, 1, 2, 3], - /// [4, 5, 6, 7]] - /// - /// After read, the data in the host buffer will be 0, 1, 2, 4, 5, 6. - /// - /// @param buf The host buffer to copy to. The buffer must be large enough - /// to hold the data. If @p buf is nullptr, a new buffer will be allocated. - /// @return The host buffer that holds the data. - /// - void *read(void *buf = nullptr); - - /// Copy all the underlying buffer data (including padding) to a contiguous - /// host buffer. - /// - /// This function is mainly for debugging purposes. - /// - /// @param buf The host buffer to copy to. The buffer must be large enough - /// to hold the data. If @p buf is nullptr, a new buffer will be allocated. - /// @return The host buffer that holds the data. - /// - void *read_raw(void *buf = nullptr); - - /// Set all bytes of the tensor buffer to 0. - void clear(); - - /// Offset to the element [i0][i1][i2][i3] of this tensor in the TensorBuf. - /// @param i0, i1, i2, i3 The indices of the element. - /// @return The offset in the number of elements. - DimType offset(DimType i0 = 0, DimType i1 = 0, DimType i2 = 0, - DimType i3 = 0) const; - - /// Number of elements in the tensor excluding padding. - /// @return The number of elements. - DimType size() const; - - /// Number of dimensions of the tensor. - /// @return The number of dimensions. - int ndims() const; - - /// Number of bytes of each element in the tensor. - /// @return The number of bytes. - int type_bytes() const; - - /// Number of bytes in the tensor's data range. - /// @return The number of bytes. - DimType shape_bytes() const; - - /// Equivalent as the number of bytes of the underlying @ref TensorBuf. - /// @return The number of bytes. - DimType ldims_bytes() const; - - /// Offset in bytes. - /// @param i0, i1, i2, i3 The indices of the element. - /// @return The offset in bytes. - DimType offset_bytes(DimType i0 = 0, DimType i1 = 0, DimType i2 = 0, - DimType i3 = 0) const; - - /// Checks if the tensor has the actually memory allocated. - /// @return True if the tensor has the memory allocated. - bool is_alloced() const; - - /// Checks if the tensor's data range is sequential in memory. - /// @return True if the tensor is sequential in memory. - bool is_sequential() const; - - /// TensorBuf that this tensor is associated with - TensorBuf *buf; - /// Data type of each element in the tensor - TensorType type; - /// Shape of the tensor - Dims shape; - /// Leading dimensions of the underlying data array - Dims ldims; - /// Offset of the tensor in the underlying data array - Dims offs; - /// Unit dimensions of the underlying data array. 
ldims[x] should be always - /// divided by pads[x]. - Dims pads; - /// Whether this tensor is local and accessed by remote devices. - bool exported; - /// If `imported_rank` is non-negative, the tensor is imported from another - /// rank and don't need to allocate a TensorBuf for it. - int imported_rank; - /// Unique id of this tensor - int id; - /// Name of this tensor - const std::string name; - - protected: - bool update_pads(const Dims &tile, const Tensor *ref_tensor = nullptr, - const Dims &ref_orig_ldims = {}); - - friend class DefaultScheduler; - friend class SchedOp; -}; - -/// Type of operator argument. -struct OpArgType { - OpArgType(size_t id, const std::string &name) : id(id), name(name) {} - size_t id; - std::string name; -}; - -bool operator==(const OpArgType &lhs, const OpArgType &rhs); - -bool operator!=(const OpArgType &lhs, const OpArgType &rhs); - -std::ostream &operator<<(std::ostream &os, const OpArgType &type); - -const OpArgType OP_ARG_INT(0, "int"); -const OpArgType OP_ARG_INT64(1, "int64"); -const OpArgType OP_ARG_UINT64(2, "uint64"); -const OpArgType OP_ARG_BOOL(3, "bool"); -const OpArgType OP_ARG_FLOAT(4, "float"); -const OpArgType OP_ARG_DIMS(5, "dims"); -const OpArgType OP_ARG_TENSOR(6, "tensor"); - -/// Stores an arbitrary type of argument given to an operator. -struct OpArg { - OpArg(int arg); - OpArg(long long int arg); - OpArg(uint64_t arg); - OpArg(bool arg); - OpArg(float arg); - OpArg(const Dims &arg); - OpArg(Tensor *arg); - OpArg(const OpArg &); - ~OpArg(); - - void get(int *arg) const; - void get(long long int *arg) const; - void get(uint64_t *arg) const; - void get(bool *arg) const; - void get(float *arg) const; - void get(Dims *arg) const; - void get(Tensor **arg) const; - - OpArgType type; - void *val; - - friend bool operator<(const OpArg &oa1, const OpArg &oa2); - friend bool operator==(const OpArg &oa1, const OpArg &oa2); -}; - -/// Stores a list of @ref OpArg. -class OpArgs { - public: - OpArgs(const std::vector &args = {}); - OpArgs(const OpArgs &) = default; - ~OpArgs(){}; - - OpArgs &operator=(const OpArgs &opargs); - - void put(const OpArg &arg); - - void get(int *arg, size_t idx) const; - void get(long long int *arg, size_t idx) const; - void get(uint64_t *arg, size_t idx) const; - void get(bool *arg, size_t idx) const; - void get(float *arg, size_t idx) const; - void get(Dims *arg, size_t idx) const; - void get(Tensor **arg, size_t idx) const; - - const std::vector &get_args() const; - - protected: - std::vector args; - - friend class Op; - friend bool operator<(const OpArgs &opargs1, const OpArgs &opargs2); - friend bool operator==(const OpArgs &opargs1, const OpArgs &opargs2); - friend bool operator!=(const OpArgs &opargs1, const OpArgs &opargs2); -}; - -/// Type of @ref Op. 
-struct OpType { - OpType(size_t id, const std::string &name) : id(id), name(name) {} - const size_t id; - std::string name; -}; - -bool operator==(const OpType &lhs, const OpType &rhs); - -const OpType OP_UNKNOWN(0, "unknown"); -const OpType OP_TENSOR(1, "tensor"); -const OpType OP_REFER(2, "refer"); -const OpType OP_RESHAPE(3, "reshape"); -const OpType OP_MERGE(4, "merge"); -const OpType OP_REDUCE_E_SUM(5, "reduce_e_sum"); -const OpType OP_REDUCE_E_MEAN(6, "reduce_e_mean"); -const OpType OP_REDUCE_E_MAX(7, "reduce_e_max"); -const OpType OP_REDUCE_W_SUM(8, "reduce_w_sum"); -const OpType OP_REDUCE_W_MEAN(9, "reduce_w_mean"); -const OpType OP_REDUCE_W_MAX(10, "reduce_w_max"); -const OpType OP_LAYERNORM(11, "layernorm"); -const OpType OP_SCALE(12, "scale"); -const OpType OP_RELU(13, "relu"); -const OpType OP_COPY(14, "copy"); -const OpType OP_GELU(15, "gelu"); -const OpType OP_SIGMOID(16, "sigmoid"); -const OpType OP_EXP(17, "exp"); -const OpType OP_SQRT(18, "sqrt"); -const OpType OP_RSQRT(19, "rsqrt"); -const OpType OP_MATMUL(20, "matmul"); -const OpType OP_MAX_POOL(21, "max_pool"); -const OpType OP_ADD(22, "add"); -const OpType OP_SUB(23, "sub"); -const OpType OP_MUL(24, "mul"); -const OpType OP_DIV(25, "div"); -const OpType OP_ROPE(26, "rope"); -const OpType OP_IM2COL(27, "im2col"); -const OpType OP_TRANSPOSE(28, "transpose"); -const OpType OP_SEND(29, "send"); -const OpType OP_SEND_DONE(30, "send_done"); -const OpType OP_RECV(31, "recv"); -const OpType OP_EMBEDDING(32, "embedding"); -const OpType OP_DEVICE_SYNC(33, "device_sync"); -const OpType OP_READ_AND_REDUCE(34, "read_and_reduce"); -const OpType OP_GATHER_FROM_PEERS(35, "gather_from_peers"); -const OpType OP_CAST(36, "cast"); -const OpType OP_PUT_PACKET(37, "put_packet"); -const OpType OP_REDUCE_AND_WRITE_PACKET(38, "reduce_and_write_packet"); -const OpType OP_GET_FROM_PACKET(39, "get_from_packet"); - -/// Type of hardware architecture support. -typedef enum { - OP_ARCH_UNKNOWN = 0, - OP_ARCH_CUDA_60 = 0x1, - OP_ARCH_CUDA_70 = 0x2, - OP_ARCH_CUDA_80 = 0x4, - OP_ARCH_CUDA_90 = 0x8, - OP_ARCH_CUDA_ANY = 0x0f, - OP_ARCH_ROCM_90A = 0x10, - OP_ARCH_ROCM_942 = 0x20, - OP_ARCH_ROCM_ANY = 0xf0, - OP_ARCH_ANY = -1, -} OpArchType; - -OpArchType op_arch_from_string(const std::string &arch); - -/// 2-dimensional op tile -struct OpTile { - DimType x; - DimType y; -}; - -/// Configurations for execution of a @ref Op. -struct OpConfig { - int num_warps = 0; - int smem_bytes = 0; - std::vector input_tiles; - std::vector output_tiles; - bool sync_pre = false; - bool sync_post = false; -}; - -/// Key to find a list of OpConfigs from OpConfigMap. -struct OpConfigKey { - OpArchType arch_type; - std::string prec_type; -}; - -bool operator<(const OpConfigKey &ops1, const OpConfigKey &ops2); - -bool operator==(const OpConfigKey &ops1, const OpConfigKey &ops2); - -/// Map from OpConfigKey to a list of OpConfigs. -class OpConfigMap { - public: - OpConfigMap(std::initializer_list< - std::pair>> - ilist); - ~OpConfigMap(){}; - - const std::vector &get(const OpConfigKey &key) const; - - private: - const std::map> cfg_map; -}; - -/// Operator. -class Op { - public: - /// Construct an operator. - Op() = default; - - /// Construct an operator. - /// @param type the type of the @ref Op. - /// @param prec_type the precision type of the @ref Op. - /// @param inputs the input tensors of the @ref Op, including execution - /// dependencies. - /// @param output_refs the output reference tensors of the @ref Op. Output - /// tensors are created based on these references. 
- /// @param args the arguments of the @ref Op. - /// @param name the name of the @ref Op. - /// @param cfg_map the configuration map of the @ref Op - /// @param gran_lev the granularity level of the @ref Op. Larger values - /// should indicate finer-grained Ops. If it is -1, the granularity level - /// will be automatically determined by the scheduler. - /// @param force_inline whether to force inline the kernel of @ref Op. - Op(const OpType &type, const std::string &prec_type, - const std::vector &inputs, - const std::vector &output_refs, const OpArgs &args, - const std::string &name, const OpConfigMap *cfg_map = nullptr, - int gran_lev = -1, bool force_inline = false); - - /// Construct an operator. - Op(const Op &) = default; - - /// Destruct the operator. - ~Op(){}; - - /// Return the kernel function name of the operator. Includes the template - /// arguments of the kernel, if any. - /// @param cfg the configuration of the operator. - /// @return the kernel function name of the operator. - std::string function_name(const OpConfig &) const; - - /// Return the kernel function's runtime arguments of the operator. - /// @param cfg the configuration of the operator. - /// @return the runtime arguments of the kernel function. - OpArgs function_call_args(const OpConfig &) const; - - /// Returns true if the operator is virtual (i.e., performs no computation). - bool is_virtual() const; - - /// Returns true if the operator is a communication operator. - bool is_comm() const; - - /// Type of the operator. - OpType type; - /// Precision type of the operator. - std::string prec_type; - /// The input tensors of the operator. - std::vector inputs; - /// The output tensors of the operator. - std::vector outputs; - /// The reference tensors of the output tensors. - std::vector output_refs; - /// Additional arguments of the operator. - OpArgs args; - /// Name of the operator. - std::string name; - /// Map from OpConfigKey to a list of OpConfigs. - const OpConfigMap *cfg_map; - /// Granularity level of the operator. - int gran_lev; - /// Force inlining of the operator kernel. - bool force_inline; - - friend bool operator<(const Op &op1, const Op &op2); - friend bool operator==(const Op &op1, const Op &op2); - - protected: - static std::string function_name(const std::string &kernel_name, - const OpArgs &template_args); -}; - -/// List all operator classes below. 
- -class ArithmeticOp : public Op { - public: - ArithmeticOp(const OpType &type, const std::string &prec_type, - Tensor *input, Tensor *other, Tensor *output, - const std::string &name); - - protected: - std::string function_name(const OpConfig &cfg, - const std::string &type) const; -}; - -class AddOp : public ArithmeticOp { - public: - AddOp(const std::string &prec_type, Tensor *input, Tensor *other, - Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class SubOp : public ArithmeticOp { - public: - SubOp(const std::string &prec_type, Tensor *input, Tensor *other, - Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class MulOp : public ArithmeticOp { - public: - MulOp(const std::string &prec_type, Tensor *input, Tensor *other, - Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class DivOp : public ArithmeticOp { - public: - DivOp(const std::string &prec_type, Tensor *input, Tensor *other, - Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class MathOp : public Op { - public: - MathOp(const OpType &type, const std::string &prec_type, Tensor *input, - Tensor *output, const std::string &name); - - protected: - std::string function_name(const OpConfig &cfg, - const std::string &type) const; -}; - -class GeluOp : public MathOp { - public: - GeluOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ExpOp : public MathOp { - public: - ExpOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReluOp : public MathOp { - public: - ReluOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class RsqrtOp : public MathOp { - public: - RsqrtOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class SigmoidOp : public MathOp { - public: - SigmoidOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class SqrtOp : public MathOp { - public: - SqrtOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class RopeOp : public Op { - public: - RopeOp(const std::string &prec_type, Tensor *input, Tensor *other, - Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class Im2colOp : public Op { - public: - Im2colOp(const std::string &prec_type, Tensor *input, Tensor *output, - int kernel_height, int kernel_width, int stride_height, - int stride_width, int pad_height, int pad_width, - int dilation_height, int dilation_width, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class LayernormOp : public Op { - public: - LayernormOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class MatmulOp : public Op { - public: - MatmulOp(const std::string &prec_type, Tensor *mat_a, Tensor *mat_b, - Tensor *mat_y, 
Dims nca, Dims ncb, Dims problem_size, - Dims leading_dims, bool is_column_a, bool is_column_b, - const std::string &name, int gran_lev); - std::string function_name(const OpConfig &cfg) const; -}; - -class MaxPoolOp : public Op { - public: - MaxPoolOp(const std::string &prec_type, Tensor *input, Tensor *output, - DimType kernel_size, DimType stride, const std::string &name); -}; - -class ReduceOp : public Op { - public: - ReduceOp(const OpType &type, const std::string &prec_type, - const std::vector &inputs, - const std::vector &outputs, const OpArgs &args, - const std::string &name, const OpConfigMap *cfg_map, int gran_lev); - - protected: - std::string function_name(const OpConfig &cfg, - const std::string &type) const; -}; - -class ReduceWSumOp : public ReduceOp { - public: - ReduceWSumOp(const std::string &prec_type, Tensor *input, Tensor *output, - int axis, bool keepdims, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReduceESumOp : public ReduceOp { - public: - ReduceESumOp(const std::string &prec_type, Tensor *input, Tensor *output, - int axis, bool keepdims, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReduceWMaxOp : public ReduceOp { - public: - ReduceWMaxOp(const std::string &prec_type, Tensor *input, Tensor *output, - int axis, bool keepdims, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReduceEMaxOp : public ReduceOp { - public: - ReduceEMaxOp(const std::string &prec_type, Tensor *input, Tensor *output, - int axis, bool keepdims, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReduceWMeanOp : public ReduceOp { - public: - ReduceWMeanOp(const std::string &prec_type, Tensor *input, Tensor *output, - int axis, bool keepdims, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReduceEMeanOp : public ReduceOp { - public: - ReduceEMeanOp(const std::string &prec_type, Tensor *input, Tensor *output, - int axis, bool keepdims, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class CopyOp : public Op { - public: - CopyOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class ReshapeOp : public Op { - public: - ReshapeOp(const std::string &prec_type, Tensor *input, Tensor *output, - const std::string &name); -}; - -class ScaleOp : public Op { - public: - ScaleOp(const std::string &prec_type, Tensor *input, Tensor *output, - float val, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &) const; -}; - -class SendOp : public Op { - public: - SendOp(const std::string &prec_type, Tensor *input, Tensor *recvbuf, - int sid, int rank, int dst_rank, size_t bytes, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; - // The args determined by the scheduler. 
- OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class RecvOp : public Op { - public: - RecvOp(const std::string &prec_type, Tensor *output, int sid, int rank, - int src_rank, size_t bytes, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class SendDoneOp : public Op { - public: - SendDoneOp(const std::string &prec_type, Tensor *input, int rank, - int dst_rank, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class DeviceSyncOp : public Op { - public: - DeviceSyncOp(const std::string &prec_type, Tensor *input, Tensor *output, - int nranks, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class ReadAndReduceOp : public Op { - public: - ReadAndReduceOp(const std::string &prec_type, Tensor *local_buf, - Tensor *cal_region_local, std::vector remote_bufs, - int sid, int rank, int npeers, size_t offset, size_t bytes, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class GatherFromPeersOp : public Op { - public: - GatherFromPeersOp(const std::string &prec_type, Tensor *local_buf, - Tensor *trans_region_local, - std::vector remote_bufs, int sid, int rank, - int npeers, size_t stride, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class PutPacketOp : public Op { - public: - PutPacketOp(const std::string &prec_type, Tensor *input, - Tensor *local_tmp_buf, Tensor *recv_buf, int id, int rank, - int dst_rank, size_t dst_offset, int flag, - const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class ReduceAndWritePacketOp : public Op { - public: - ReduceAndWritePacketOp(const std::string &prec_type, - std::vector inputs, Tensor *output, int id, - int rank, int npeers, size_t elems_per_rank, - size_t scratch_offset, size_t remote_dst_offset, - int flag, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class GetFromPacketOp : public Op { - public: - GetFromPacketOp(const std::string &prec_type, Tensor *input, Tensor *output, - size_t src_offset, size_t dst_offset, size_t npackets, - int flag, const std::string &name); - std::string function_name(const OpConfig &cfg) const; - OpArgs function_call_args(const OpConfig &cfg) const; -}; - -class TensorOp : public Op { - public: - TensorOp(const std::vector &deps, Tensor *output, - const std::string &name); -}; - -class TransposeOp : public Op { - public: - TransposeOp(const std::string &prec_type, Tensor *input, Tensor *output, - int tp_type, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class EmbeddingOp : public Op { - public: - EmbeddingOp(const std::string &prec_type, Tensor *input, Tensor *weight, - Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -class CastOp : public Op { - public: - CastOp(Tensor *input, Tensor *output, const std::string &name); - std::string function_name(const OpConfig &cfg) const; -}; - -/// A node of @ref Model. 
-class OpNode { - public: - /// Construct an empty @ref OpNode. - OpNode(){}; - - /// Destruct an @ref OpNode. - ~OpNode(){}; - - /// The list of @ref Op that this @ref OpNode contains. Sorted in the - /// execution order. - std::vector ops; - - /// The list of @ref OpNode that depends on this @ref OpNode. - std::set users; - - /// The list of @ref OpNode that this @ref OpNode depends on. - std::set producers; - - /// Remove this @ref OpNode from the graph. - void remove_self(); - - /// Get the name of this @ref OpNode. - std::string get_name() const; -}; - -class Model { - public: - // Constructors. - Model(int rank_ = 0); - Model(const Model &) = delete; - Model &operator=(const Model &) = delete; - - ~Model(); - - /// Verify if this model is valid. - /// @return true if the model is valid, false otherwise. - bool verify() const; - - void create_nodes(); - void clear_nodes(); - - /// Get the @ref OpNode list. - /// @return The @ref OpNode list. - const std::list> &get_nodes() const; - - /// Break a @ref OpNode into two @ref OpNode. - /// - /// The original node will have the first @p op_idx ops, and the new node - /// will have the rest. - /// - /// @param node The @ref OpNode to break. - /// @param op_idx The index of the first op in the new @ref OpNode. - /// @return The new @ref OpNode. - OpNode *break_node(OpNode *node, int op_idx); - - /// Check dependencies between two @ref OpNode. - /// - /// @param node1 The first @ref OpNode. - /// @param node2 The second @ref OpNode. - /// @return True if @p node1 depends on @p node2. - bool depends_on(OpNode *node1, OpNode *node2) const; - - std::string serialize(int indent = -1) const; - - /// Returns a tensor object. - /// - /// @param shape Shape of the tensor, where the data of interest is. - /// @param ttype Type of the tensor data. - /// @param buf The @ref TensorBuf that holds the entire data including the - /// padding. - /// @param ldims Leading dimensions (ldim) of the tensor, which may be - /// different from the shape. @p ldims can be considered as the actual shape - /// of the underlying data buffer (@ref TensorBuf). - /// @param offs Offsets of the tensor. The data of interest starts at - /// @p offs and ends at @p offs + @p shape. - /// @param pads If a dimension of @p pads is set to larger than 1, the - /// corresponding ldim will be set to the minimum multiple of @p pads that - /// is larger than or equal to the previous ldim. Padding is accumulated - /// across all tensors that share the same @ref TensorBuf. For example, if - /// one tensor sets the last dimension of @p pads to 2, and another tensor - /// sets the last dimension of @p pads to 3, then the corresponding ldim - /// will be the minimum multiple of 2x3=6 that is larger than or equal to - /// the corresponding dimension of @p offs + @p shape. - /// @param exported Whether the tensor is exported to other processes. This - /// should be set to true if the tensor is used as an input or output of a - /// remote process. - /// @param imported_rank The rank of the process that exports the tensor. - /// If @p imported_rank is set to a non-negative value, the tensor will be - /// considered as a remote tensor, hence no memory will be allocated for it - /// on the local. @p imported_rank should be set to -1 if the tensor resides - /// on the local. - /// @param name Name of the tensor. - /// @return Pointer to a tensor object. 
- /// - Tensor *tensor(const Dims &shape, const TensorType &ttype, - TensorBuf *buf = nullptr, const Dims &ldims = {}, - const Dims &offs = {}, const Dims &pads = {}, - const std::vector &deps = {}, - bool exported = false, int imported_rank = -1, - const std::string &name = "tensor"); - - Tensor *reshape(Tensor *input, const Dims &shape, bool allowzero = false, - Tensor *output = nullptr, - const std::string &name = "reshape"); - Tensor *reshape(Tensor *input, const std::initializer_list &shape, - bool allowzero = false, Tensor *output = nullptr, - const std::string &name = "reshape"); - // Reshape `input` to `shape`. If one dimension of `shape` is -1, it will be - // inferred from the `input`. If one dimension of `shape` is 0, by default - // (`allowzero` is false), that dimension is unchanged from the - // corresponding one of `input`. If `allowzero` is true, that dimension is - // set to 0, which means that the reshaped tensor is an empty tensor, i.e., - // `input` should also be an empty tensor. If `allowzero` is true, `shape` - // should not include both 0 and -1 at the same time. If `shape` is an empty - // vector, `input` will be converted to a scalar. - Tensor *reshape(Tensor *input, const std::vector &shape, - bool allowzero = false, Tensor *output = nullptr, - const std::string &name = "reshape"); - // Returns an identical tensor of `input` with execution dependencies - // `deps`. - Tensor *identity(Tensor *input, const std::vector &deps = {}, - const std::string &name = "identity"); - - // Shard `input` along `axis` into `dim_per_shard`-dimensional shards. - std::vector sharding(Tensor *input, DimType axis, - DimType dim_per_shard, - const std::string &name = "sharding"); - // Performs reduction along the `axis` of the `input` tensor and stores the - // result in `output`. - // Currently, only reduction along the last dimension is supported. - template - Tensor *reduce(Tensor *input, int axis, bool keepdims = true, - Tensor *output = nullptr, - const std::string &name = "reduce"); - Tensor *reduce_sum(Tensor *input, int axis, bool keepdims = true, - Tensor *output = nullptr, - const std::string &name = "reduce_sum"); - Tensor *reduce_mean(Tensor *input, int axis, bool keepdims = true, - Tensor *output = nullptr, - const std::string &name = "reduce_mean"); - Tensor *reduce_max(Tensor *input, int axis, bool keepdims = true, - Tensor *output = nullptr, - const std::string &name = "reduce_max"); - // Applies layer normalization to the `input` tensor and returns the - // normalized tensor as `output`. - Tensor *layernorm(Tensor *input, Tensor *output = nullptr, - const std::string &name = "layernorm"); - // Transposes the `input` tensor according to the given `perm` permutation. - // For example, transpose(input, {0, 1 ,3, 2}) will swap the last two - // dimensions of the input tensor. Currently, only 4D tensors are supported. - Tensor *transpose(Tensor *input, Dims perm, Tensor *output = nullptr, - const std::string &name = "transpose"); - // Performs matrix multiplication between the `input` tensor and another - // `other` tensor, storing the result in `output`. - Tensor *matmul(Tensor *input, Tensor *other, Tensor *output = nullptr, - DimType splitk = 1, bool trans_input = false, - bool trans_other = false, const std::string &name = "matmul", - int gran_lev = -1); - // Implements the 'im2col' method for 2D convolution layers, which takes an - // `input` tensor and reshapes it to a 2D matrix by extracting image patches - // from the input tensor based on the provided parameters. 
- Tensor *im2col(Tensor *input, int kernel_height, int kernel_width, - int stride_height, int stride_width, int pad_height, - int pad_width, int dilation_height, int dilation_width, - Tensor *output = nullptr, - const std::string &name = "im2col"); - // Applies max-pooling on the `input` tensor using `kernel_size` and - // `stride`, reducing its spatial size. The output shape is calculated based - // on the input tensor's shape and the stride value as follows: {is[0], - // (is[1] + stride - 1) / stride, (is[2] + stride - 1) / stride, is[3]}, - // where 'is' represents the input tensor's shape. - Tensor *max_pool(Tensor *input, DimType kernel_size, DimType stride, - Tensor *output = nullptr, - const std::string &name = "max_pool"); - // Multiplies the `input` tensor by a scalar `val`, element-wise. - Tensor *scale(Tensor *input, float val, Tensor *output = nullptr, - const std::string &name = "scale"); - // - template - Tensor *math(Tensor *input, Tensor *output = nullptr, - const std::string &name = "math"); - // Calculates the exponential of the `input` tensor, element-wise. - Tensor *exp(Tensor *input, Tensor *output = nullptr, - const std::string &name = "exp"); - // Calculates the square root of the `input` tensor, element-wise. - Tensor *sqrt(Tensor *input, Tensor *output = nullptr, - const std::string &name = "sqrt"); - // Calculates the reverse square root of the `input` tensor, element-wise. - Tensor *rsqrt(Tensor *input, Tensor *output = nullptr, - const std::string &name = "rsqrt"); - // ReLU activation - Tensor *relu(Tensor *input, Tensor *output = nullptr, - const std::string &name = "relu"); - // Copy the `input` tensor to `output` tensor - Tensor *copy(Tensor *input, Tensor *output = nullptr, - const std::string &name = "copy"); - // Applies the Gaussian Error Linear Unit (GELU) activation function to the - // `input` tensor, element-wise. GELU is a smooth approximation of the - // rectifier function and is widely used in deep learning models. - Tensor *gelu(Tensor *input, Tensor *output = nullptr, - const std::string &name = "gelu"); - // Sigmoid activation - Tensor *sigmoid(Tensor *input, Tensor *output = nullptr, - const std::string &name = "sigmoid"); - // Performs rotary position embedding (RoPE) on the `input` tensor - Tensor *rope(Tensor *input, Tensor *other, Tensor *output = nullptr, - const std::string &name = "rope"); - // Template for broadcated arithmetic operators. - template - Tensor *arithmetic(Tensor *input, Tensor *other, Tensor *output = nullptr, - const std::string &name = "arithmeitc"); - // Performs an element-wise addition operator between the `input` tensor - // and the `other` tensor - Tensor *add(Tensor *input, Tensor *other, Tensor *output = nullptr, - const std::string &name = "add"); - // Performs an element-wise subtraction operator between the `input` tensor - // and the `other` tensor - Tensor *sub(Tensor *input, Tensor *other, Tensor *output = nullptr, - const std::string &name = "sub"); - // Performs an element-wise multiplication operator between the `input` - // tensor and the `other` tensor, - Tensor *mul(Tensor *input, Tensor *other, Tensor *output = nullptr, - const std::string &name = "mul"); - // Performs an element-wise division operator between the `input` - // tensor and the `other` tensor, - Tensor *div(Tensor *input, Tensor *other, Tensor *output = nullptr, - const std::string &name = "div"); - /// Sends a tensor to a destination rank (@p dst_rank). 
Multiple tensors can - /// be sent to the same rank,so an identifier `id` is required to - /// distinguish the tensor. Each 'send' operator must have a corresponding - /// 'recv' operator that have the same id in another rank's model. - /// - /// @param input - /// @param id - /// @param dst_rank Rank of the GPU to send to. - /// @param bytes - /// @param name - /// @return - Tensor *send(Tensor *input, int sid, int dst_rank, std::size_t bytes = 0, - const std::string &name = "send"); - // Blocks the execution until the corresponding 'send' operator with the - // specified `id` is completed. - Tensor *send_done(Tensor *input, int sid, int dst_rank, - const std::string &name = "send_done"); - // Receives a tensor from a source rank (@p src_rank), identified by the - // `id` parameter. Blocks the execution until the corresponding 'recv' - // operator is completed. - Tensor *recv(int sid, int src_rank, std::size_t bytes = 0, - Tensor *output = nullptr, const std::string &name = "recv"); - // - Tensor *put_packet(Tensor *input, Tensor *local_tmp_buf, Tensor *recv_buf, - int id, int rank, int dst_rank, size_t dst_offset, - int flag, const std::string &name = "put_packet"); - // Performs an all-reduce operator across all ranks, aggregating the input - // tensors. Takes the `input` tensor, the current GPU's rank, and the - // total number of ranks `rank_num`. - Tensor *all_reduce(Tensor *input, int rank, int rank_num, - Tensor *output = nullptr, - const std::string &name = "all_reduce"); - // Performs an all-gather operator across all ranks, aggregating the input - // tensors. Takes the `input` tensor, the current GPU's rank, and the - // total number of ranks `rank_num`. Returns a vector of tensors, each - // containing the aggregated data from all ranks. - std::vector all_gather(Tensor *input, int rank, int rank_num, - const std::vector &output = {}, - const std::string &name = "all_gather"); - /// Embedding layer. - Tensor *embedding(Tensor *input, Tensor *weight, Tensor *output = nullptr, - const std::string &name = "embedding"); - /// Tensor type casting. 
- Tensor *cast(Tensor *input, const TensorType &ttype, - Tensor *output = nullptr, const std::string &name = "cast"); - - // sync across multi devices - Tensor *device_sync(Tensor *input, int npeers, - const std::string &name = "device_sync"); - - // local reduce scatter - Tensor *local_reduce_scatter( - Tensor *input, int gpu_id, int ngpus_per_node, - const std::string &name = "local_reduce_scatter"); - - // local all gather - Tensor *local_all_gather(Tensor *input, int gpu_id, int ngpus_per_node, - int axis = 0, - const std::string &name = "local_all_gather"); - // read data from remote and reduce to current buffer - Tensor *read_and_reduce(Tensor *input, int sid, int npeers, size_t offset, - size_t bytes, - const std::string &name = "read_and_reduce"); - // gather from peers - Tensor *gather_from_peers(Tensor *input, Tensor *tile, int sid, int npeers, - size_t chunkBytes, - const std::string &name = "gather_from_peers"); - - Tensor *local_all_reduce(Tensor *input, int gpu_id, int gpu_num, - const std::string &name = "local_all_reduce"); - Tensor *local_all_reduce_packet( - Tensor *input, int gpu_id, int gpu_num, - const std::string &name = "local_all_reduce_packet"); - - Tensor *reduce_and_write_packet( - Tensor *input, Tensor *scratch, Tensor *output, - const std::vector<Tensor *> &remote_peer_bufs, int id, int rank, - int npeers, size_t elems_per_rank, size_t scratch_offset, - size_t remote_dst_offset, int flag, - const std::string &name = "reduce_and_write_packet"); - Tensor *get_packet(Tensor *input, Tensor *output, size_t src_offset, - size_t dst_offset, size_t npackets, int flag, - const std::string &name = "get_packet"); - - protected: - class Impl; - friend class DefaultScheduler; - - private: - std::unique_ptr<Impl> impl; -}; - -} // namespace ark - -#endif // ARK_MODEL_H diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp new file mode 100644 index 000000000..bc9fa63b1 --- /dev/null +++ b/ark/include/ark/model.hpp @@ -0,0 +1,315 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_MODEL_HPP +#define ARK_MODEL_HPP + +#include +#include + +#include "dims.hpp" +#include "model_graph.hpp" +#include "model_ref.hpp" + +namespace ark { + +class ModelDataT; +using ModelDataType = std::shared_ptr<ModelDataT>; + +extern const ModelDataType NONE; +extern const ModelDataType FP32; +extern const ModelDataType FP16; +extern const ModelDataType BF16; +extern const ModelDataType INT32; +extern const ModelDataType UINT32; +extern const ModelDataType INT8; +extern const ModelDataType UINT8; +extern const ModelDataType BYTE; + +class Model : public ModelGraph { + private: + int rank_; + + public: + Model(int rank = 0) : rank_(rank) {} + Model(const Model &other) : ModelGraph(other), rank_(other.rank()) {} + ~Model() {} + + Model &operator=(const Model &other) = default; + + int rank() const { return rank_; } + + Model compress() const; + + /// Returns a tensor object. + /// + /// @param shape Shape of the tensor, where the data of interest is. + /// @param data_type Type of the tensor data. + /// @param strides Leading dimensions (ldim) of the tensor, which may be + /// different from the shape. @p strides can be considered as the actual + /// shape of the underlying data buffer (@ref TensorBuf). + /// @param offsets Offsets of the tensor. The data of interest starts at + /// @p offsets and ends at @p offsets + @p shape.
+ /// @param pads If a dimension of @p pads is set to larger than 1, the + /// corresponding ldim will be set to the minimum multiple of @p pads that + /// is larger than or equal to the previous ldim. Padding is accumulated + /// across all tensors that share the same @ref TensorBuf. For example, if + /// one tensor sets the last dimension of @p pads to 2, and another tensor + /// sets the last dimension of @p pads to 3, then the corresponding ldim + /// will be the minimum multiple of 2x3=6 that is larger than or equal to + /// the corresponding dimension of @p offsets + @p shape. + /// @param exported Whether the tensor is exported to other processes. This + /// should be set to true if the tensor is used as an input or output of a + /// remote process. + /// @param imported_rank The rank of the process that exports the tensor. + /// If @p imported_rank is set to a non-negative value, the tensor will be + /// considered as a remote tensor, hence no memory will be allocated for it + /// locally. @p imported_rank should be set to -1 if the tensor resides + /// locally. + /// @param name Name of the tensor. + /// @return Pointer to a tensor object. + /// + ModelTensorRef tensor(const Dims &shape, ModelDataType data_type, + const Dims &strides = {}, const Dims &offsets = {}, + const Dims &pads = {}, bool exported = false, + int imported_rank = -1, const std::string &name = ""); + + ModelTensorRef refer(ModelTensorRef input, const Dims &shape = {}, + const Dims &strides = {}, const Dims &offsets = {}, + const Dims &pads = {}, const std::string &name = ""); + + ModelTensorRef reshape(ModelTensorRef input, const Dims &shape, + bool allowzero = false, + ModelTensorRef output = nullptr, + const std::string &name = ""); + ModelTensorRef reshape(ModelTensorRef input, + const std::initializer_list<DimType> &shape, + bool allowzero = false, + ModelTensorRef output = nullptr, + const std::string &name = ""); + // Reshape `input` to `shape`. If one dimension of `shape` is -1, it will be + // inferred from the `input`. If one dimension of `shape` is 0, by default + // (`allowzero` is false), that dimension is unchanged from the + // corresponding one of `input`. If `allowzero` is true, that dimension is + // set to 0, which means that the reshaped tensor is an empty tensor, i.e., + // `input` should also be an empty tensor. If `allowzero` is true, `shape` + // should not include both 0 and -1 at the same time. If `shape` is an empty + // vector, `input` will be converted to a scalar. + ModelTensorRef reshape(ModelTensorRef input, + const std::vector<DimType> &shape, + bool allowzero = false, + ModelTensorRef output = nullptr, + const std::string &name = ""); + // Returns an identical tensor of `input` with execution dependencies + // `deps`. + ModelTensorRef identity(ModelTensorRef input, + const std::vector<ModelTensorRef> &deps = {}, + const std::string &name = ""); + + // Shard `input` along `axis` into `dim_per_shard`-dimensional shards. + std::vector<ModelTensorRef> sharding(ModelTensorRef input, DimType axis, + DimType dim_per_shard, + const std::string &name = "");
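To make the pads rule above concrete, a hypothetical example (numbers invented for illustration): a tensor of shape {2, 10} created with pads {1, 4} forces the last leading dimension up to the smallest multiple of 4 that is >= 10, so the underlying buffer is laid out as if the shape were {2, 12}.

    ark::Model model;
    ark::ModelTensorRef t = model.tensor(
        /*shape=*/{2, 10}, ark::FP32,
        /*strides=*/{}, /*offsets=*/{}, /*pads=*/{1, 4});
    // The buffer behind `t` spans 2 x 12 floats; elements [i][10..11] are padding.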
+    template <typename ReduceOpT>
+    ModelTensorRef reduce(ModelTensorRef input, int axis, bool keepdims = true,
+                          ModelTensorRef output = nullptr,
+                          const std::string &name = "");
+    ModelTensorRef reduce_sum(ModelTensorRef input, int axis,
+                              bool keepdims = true,
+                              ModelTensorRef output = nullptr,
+                              const std::string &name = "");
+    ModelTensorRef reduce_mean(ModelTensorRef input, int axis,
+                               bool keepdims = true,
+                               ModelTensorRef output = nullptr,
+                               const std::string &name = "");
+    ModelTensorRef reduce_max(ModelTensorRef input, int axis,
+                              bool keepdims = true,
+                              ModelTensorRef output = nullptr,
+                              const std::string &name = "");
+    // Applies layer normalization to the `input` tensor and returns the
+    // normalized tensor as `output`.
+    ModelTensorRef layernorm(ModelTensorRef input,
+                             ModelTensorRef output = nullptr,
+                             const std::string &name = "");
+    // Transposes the `input` tensor according to the given `perm`
+    // permutation. For example, transpose(input, {0, 1, 3, 2}) will swap the
+    // last two dimensions of the input tensor. Currently, only 4D tensors are
+    // supported.
+    ModelTensorRef transpose(ModelTensorRef input, Dims perm,
+                             ModelTensorRef output = nullptr,
+                             const std::string &name = "");
+    // Performs matrix multiplication between the `input` tensor and the
+    // `other` tensor, storing the result in `output`.
+    ModelTensorRef matmul(ModelTensorRef input, ModelTensorRef other,
+                          ModelTensorRef output = nullptr,
+                          bool trans_input = false, bool trans_other = false,
+                          const std::string &name = "");
+    // Implements the 'im2col' method for 2D convolution layers, which takes
+    // an `input` tensor and reshapes it to a 2D matrix by extracting image
+    // patches from the input tensor based on the provided parameters.
+    ModelTensorRef im2col(ModelTensorRef input, int kernel_height,
+                          int kernel_width, int stride_height,
+                          int stride_width, int pad_height, int pad_width,
+                          int dilation_height, int dilation_width,
+                          ModelTensorRef output = nullptr,
+                          const std::string &name = "");
+    // Applies max-pooling on the `input` tensor using `kernel_size` and
+    // `stride`, reducing its spatial size. The output shape is calculated
+    // based on the input tensor's shape and the stride value as follows:
+    // {is[0], (is[1] + stride - 1) / stride, (is[2] + stride - 1) / stride,
+    // is[3]}, where 'is' represents the input tensor's shape.
+    ModelTensorRef max_pool(ModelTensorRef input, DimType kernel_size,
+                            DimType stride, ModelTensorRef output = nullptr,
+                            const std::string &name = "");
+    // Multiplies the `input` tensor by a scalar `val`, element-wise.
+    ModelTensorRef scale(ModelTensorRef input, float val,
+                         ModelTensorRef output = nullptr,
+                         const std::string &name = "");
+    // Applies the element-wise math operator `MathOpT` to the `input` tensor.
+    template <typename MathOpT>
+    ModelTensorRef math(ModelTensorRef input, ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+    // Calculates the exponential of the `input` tensor, element-wise.
+    ModelTensorRef exp(ModelTensorRef input, ModelTensorRef output = nullptr,
+                       const std::string &name = "");
+    // Calculates the square root of the `input` tensor, element-wise.
+    ModelTensorRef sqrt(ModelTensorRef input, ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+    // Calculates the reciprocal square root of the `input` tensor,
+    // element-wise.
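+    // (i.e., rsqrt(x) = 1 / sqrt(x))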
+    ModelTensorRef rsqrt(ModelTensorRef input, ModelTensorRef output = nullptr,
+                         const std::string &name = "");
+    // ReLU activation.
+    ModelTensorRef relu(ModelTensorRef input, ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+    // Copies the `input` tensor to the `output` tensor.
+    ModelTensorRef copy(ModelTensorRef input, ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+    // Applies the Gaussian Error Linear Unit (GELU) activation function to
+    // the `input` tensor, element-wise. GELU is a smooth approximation of the
+    // rectifier function and is widely used in deep learning models.
+    ModelTensorRef gelu(ModelTensorRef input, ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+    // Sigmoid activation.
+    ModelTensorRef sigmoid(ModelTensorRef input,
+                           ModelTensorRef output = nullptr,
+                           const std::string &name = "");
+    // Performs rotary position embedding (RoPE) on the `input` tensor.
+    ModelTensorRef rope(ModelTensorRef input, ModelTensorRef other,
+                        ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+
+    // Performs element-wise addition between the `input` tensor and the
+    // `other` tensor.
+    ModelTensorRef add(ModelTensorRef input, ModelTensorRef other,
+                       ModelTensorRef output = nullptr,
+                       const std::string &name = "");
+    // Performs element-wise subtraction between the `input` tensor and the
+    // `other` tensor.
+    ModelTensorRef sub(ModelTensorRef input, ModelTensorRef other,
+                       ModelTensorRef output = nullptr,
+                       const std::string &name = "");
+    // Performs element-wise multiplication between the `input` tensor and the
+    // `other` tensor.
+    ModelTensorRef mul(ModelTensorRef input, ModelTensorRef other,
+                       ModelTensorRef output = nullptr,
+                       const std::string &name = "");
+    // Performs element-wise division between the `input` tensor and the
+    // `other` tensor.
+    ModelTensorRef div(ModelTensorRef input, ModelTensorRef other,
+                       ModelTensorRef output = nullptr,
+                       const std::string &name = "");
+    /// Sends a tensor to a destination rank (@p dst_rank). Multiple tensors
+    /// can be sent to the same rank, so an identifier @p sid is required to
+    /// distinguish the tensors. Each 'send' operator must have a
+    /// corresponding 'recv' operator that has the same id in another rank's
+    /// model.
+    ///
+    /// @param input Tensor to send.
+    /// @param sid Identifier of this send/recv pair.
+    /// @param dst_rank Rank of the GPU to send to.
+    /// @param bytes Number of bytes to send. If 0, the whole tensor is sent.
+    /// @param name Name of the operator.
+    /// @return
+    ModelTensorRef send(ModelTensorRef input, int sid, int dst_rank,
+                        DimType bytes = 0, const std::string &name = "");
+    // Blocks the execution until the corresponding 'send' operator with the
+    // specified `sid` is completed.
+    ModelTensorRef send_done(ModelTensorRef input, int sid, int dst_rank,
+                             const std::string &name = "");
+    // Receives a tensor from a source rank (@p src_rank), identified by the
+    // `sid` parameter. Blocks the execution until the data is fully received.
+    ModelTensorRef recv(int sid, int src_rank, DimType bytes = 0,
+                        ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+    // Writes `input` as packets into the receive buffer `recv_buf` of rank
+    // `dst_rank` at `dst_offset`, tagging each packet with `flag`.
+    ModelTensorRef put_packet(ModelTensorRef input,
+                              ModelTensorRef local_tmp_buf,
+                              ModelTensorRef recv_buf, int id, int rank,
+                              int dst_rank, size_t dst_offset, int flag,
+                              const std::string &name = "");
+    // Performs an all-reduce operator across all ranks, aggregating the input
+    // tensors. Takes the `input` tensor, the current GPU's rank, and the
+    // total number of ranks `rank_num`.
+    ModelTensorRef all_reduce(ModelTensorRef input, int rank, int rank_num,
+                              ModelTensorRef output = nullptr,
+                              const std::string &name = "");
+    // Performs an all-gather operator across all ranks, aggregating the
+    // input tensors. Takes the `input` tensor, the current GPU's rank, and
+    // the total number of ranks `rank_num`. Returns a vector of tensors,
+    // each containing the aggregated data from all ranks.
+    std::vector<ModelTensorRef> all_gather(
+        ModelTensorRef input, int rank, int rank_num,
+        const std::vector<ModelTensorRef> &output = {},
+        const std::string &name = "");
+    /// Embedding layer.
+    ModelTensorRef embedding(ModelTensorRef input, ModelTensorRef weight,
+                             ModelTensorRef output = nullptr,
+                             const std::string &name = "");
+    /// Tensor type casting.
+    ModelTensorRef cast(ModelTensorRef input, ModelDataType data_type,
+                        ModelTensorRef output = nullptr,
+                        const std::string &name = "");
+
+    // Synchronizes across multiple devices.
+    ModelTensorRef device_sync(ModelTensorRef input, int npeers,
+                               const std::string &name = "");
+
+    // Local reduce-scatter.
+    ModelTensorRef local_reduce_scatter(ModelTensorRef input, int gpu_id,
+                                        int ngpus_per_node,
+                                        const std::string &name = "");
+
+    // Local all-gather.
+    ModelTensorRef local_all_gather(ModelTensorRef input, int gpu_id,
+                                    int ngpus_per_node, int axis = 0,
+                                    const std::string &name = "");
+    // Reads data from remote peers and reduces it into the current buffer.
+    ModelTensorRef read_and_reduce(ModelTensorRef input, int sid, int npeers,
+                                   size_t offset, size_t bytes,
+                                   const std::string &name = "");
+    // Gathers tiles from peers.
+    ModelTensorRef gather_from_peers(ModelTensorRef input, ModelTensorRef tile,
+                                     int sid, int npeers, size_t chunkBytes,
+                                     const std::string &name = "");
+
+    ModelTensorRef local_all_reduce(ModelTensorRef input, int gpu_id,
+                                    int gpu_num, const std::string &name = "");
+    ModelTensorRef local_all_reduce_packet(ModelTensorRef input, int gpu_id,
+                                           int gpu_num,
+                                           const std::string &name = "");
+
+    ModelTensorRef reduce_and_write_packet(
+        ModelTensorRef input, ModelTensorRef scratch, ModelTensorRef output,
+        const std::vector<ModelTensorRef> &remote_peer_bufs, int id, int rank,
+        int npeers, size_t elems_per_rank, size_t scratch_offset,
+        size_t remote_dst_offset, int flag, const std::string &name = "");
+    ModelTensorRef get_packet(ModelTensorRef input, ModelTensorRef output,
+                              size_t src_offset, size_t dst_offset,
+                              size_t npackets, int flag,
+                              const std::string &name = "");
+};
+
+}  // namespace ark
+
+#endif  // ARK_MODEL_HPP
diff --git a/ark/include/ark/model_graph.hpp b/ark/include/ark/model_graph.hpp
new file mode 100644
index 000000000..21ee4c328
--- /dev/null
+++ b/ark/include/ark/model_graph.hpp
@@ -0,0 +1,53 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_MODEL_GRAPH_HPP
+#define ARK_MODEL_GRAPH_HPP
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "model_ref.hpp"
+
+namespace ark {
+
+class ModelGraph {
+   public:
+    ModelGraph();
+
+    ModelGraph(const ModelGraph &other);
+
+    ~ModelGraph();
+
+    ModelGraph &operator=(const ModelGraph &other);
+
+    /// Break a @ref ModelNode into two @ref ModelNode objects.
+    ///
+    /// The original node will have the first @p op_idx ops, and the new node
+    /// will have the rest.
+    ///
+    /// @param node The @ref ModelNode to break.
+    /// @param op_idx The index of the first op in the new @ref ModelNode.
+    /// @return The new @ref ModelNode.
+    ModelNodeRef break_node(ModelNodeRef node, size_t op_idx);
+
+    void compress_nodes();
+
+    bool verify() const;
+
+    std::string serialize(int indent = -1) const;
+
+    /// Get the list of @ref ModelNode in the graph.
+    std::vector<ModelNodeRef> nodes() const;
+
+   protected:
+    friend class Model;
+
+    class Impl;
+    std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace ark
+
+#endif  // ARK_MODEL_GRAPH_HPP
diff --git a/ark/include/ark/model_ref.hpp b/ark/include/ark/model_ref.hpp
new file mode 100644
index 000000000..594a95772
--- /dev/null
+++ b/ark/include/ark/model_ref.hpp
@@ -0,0 +1,25 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_MODEL_REF_HPP
+#define ARK_MODEL_REF_HPP
+
+#include <memory>
+
+namespace ark {
+
+class ModelOp;
+using ModelOpRef = std::shared_ptr<ModelOp>;
+
+class ModelBuffer;
+using ModelBufferRef = std::shared_ptr<ModelBuffer>;
+
+class ModelTensor;
+using ModelTensorRef = std::shared_ptr<ModelTensor>;
+
+class ModelNode;
+using ModelNodeRef = std::shared_ptr<ModelNode>;
+
+}  // namespace ark
+
+#endif  // ARK_MODEL_REF_HPP
diff --git a/ark/include/ark/random.hpp b/ark/include/ark/random.hpp
new file mode 100644
index 000000000..2b1a6d8a8
--- /dev/null
+++ b/ark/include/ark/random.hpp
@@ -0,0 +1,26 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_RANDOM_HPP
+#define ARK_RANDOM_HPP
+
+#include <cstdlib>
+
+namespace ark {
+
+// Set the random seed.
+void srand(int seed = -1);
+
+// Get a random number.
+int rand();
+
+/// Generate a random value.
+template <typename T>
+T rand(float min_val, float max_val) {
+    int mid = RAND_MAX / 2;
+    return T((ark::rand() - mid) / (float)mid * (max_val - min_val) + min_val);
+}
+
+}  // namespace ark
+
+#endif  // ARK_RANDOM_HPP
diff --git a/ark/schedule/schedule.h b/ark/include/ark/schedule.hpp
similarity index 92%
rename from ark/schedule/schedule.h
rename to ark/include/ark/schedule.hpp
index 76465a3e2..0f809e54c 100644
--- a/ark/schedule/schedule.h
+++ b/ark/include/ark/schedule.hpp
@@ -1,14 +1,14 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_SCHEDULE_H_
-#define ARK_SCHEDULE_H_
+#ifndef ARK_SCHEDULE_HPP
+#define ARK_SCHEDULE_HPP
 
 #include
 #include
 #include
 
-#include "range.h"
+#include "range.hpp"
 
 namespace ark {
 
@@ -68,4 +68,4 @@ class Schedule {
 }  // namespace ark
 
-#endif  // ARK_SCHEDULE_H_
+#endif  // ARK_SCHEDULE_HPP
diff --git a/ark/include/ark/version.hpp b/ark/include/ark/version.hpp
new file mode 100644
index 000000000..e8ba54583
--- /dev/null
+++ b/ark/include/ark/version.hpp
@@ -0,0 +1,21 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_VERSION_HPP
+#define ARK_VERSION_HPP
+
+#include <string>
+
+#define ARK_MAJOR 0
+#define ARK_MINOR 5
+#define ARK_PATCH 0
+#define ARK_VERSION (ARK_MAJOR * 10000 + ARK_MINOR * 100 + ARK_PATCH)
+
+namespace ark {
+
+/// Return a version string.
+std::string version();
+
+}  // namespace ark
+
+#endif  // ARK_VERSION_HPP
diff --git a/ark/include/kernels/common/arch.h b/ark/include/kernels/common/arch.h
index e268ad78c..7eff95c7b 100644
--- a/ark/include/kernels/common/arch.h
+++ b/ark/include/kernels/common/arch.h
@@ -32,13 +32,13 @@ DEVICE int warp_id() {
 #if defined(ARK_TARGET_CUDA_ARCH)
 #define ARCH_ALIAS_FUNC(alias, cuda_func, hip_func)    \
     template <typename... Args>                        \
-    inline auto alias(Args &&... args) {               \
+    inline auto alias(Args &&...args) {                \
         return cuda_func(std::forward<Args>(args)...); \
     }
 #elif defined(ARK_TARGET_ROCM_ARCH)
 #define ARCH_ALIAS_FUNC(alias, cuda_func, hip_func)    \
     template <typename... Args>                        \
-    inline auto alias(Args &&... args) {               \
+    inline auto alias(Args &&...args) {                \
         return hip_func(std::forward<Args>(args)...);  \
     }
 #endif
diff --git a/ark/include/kernels/common/static_math.h b/ark/include/kernels/common/static_math.h
index 06a4552cf..b7093b8b7 100644
--- a/ark/include/kernels/common/static_math.h
+++ b/ark/include/kernels/common/static_math.h
@@ -150,6 +150,28 @@ static DEVICE long long int gm(long long int x) {
     return math::div<Divisor>(x) * Divisor;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+
+template <size_t Rhs>
+DEVICE bool geq(size_t x) {
+    return x >= Rhs;
+}
+
+template <>
+DEVICE bool geq<0>(size_t x) {
+    return true;
+}
+
+template <size_t Rhs>
+DEVICE bool le(size_t x) {
+    return x < Rhs;
+}
+
+template <>
+DEVICE bool le<0>(size_t x) {
+    return false;
+}
+
 }  // namespace math
 }  // namespace ark
diff --git a/ark/include/kernels/common/sync.h b/ark/include/kernels/common/sync.h
index 0d07f8ed2..85f7639c9 100644
--- a/ark/include/kernels/common/sync.h
+++ b/ark/include/kernels/common/sync.h
@@ -86,7 +86,7 @@ DEVICE void sync_warps() {
         __syncwarp();
     } else if constexpr (NumWarps == 2) {
         static_assert(
-            ARK_THREADS_PER_BLOCK <= 512,
+            ARK_WARPS_PER_BLOCK <= 16,
             "2-warp barrier is not supported for block sizes larger than 512");
         asm volatile("barrier.sync %0, 64;" ::"r"((threadIdx.x >> 6) + 8));
     } else if constexpr (NumWarps == 4) {
diff --git a/ark/include/kernels/common/vector_type.h b/ark/include/kernels/common/vector_type.h
index e86a50145..379c79db1 100644
--- a/ark/include/kernels/common/vector_type.h
+++ b/ark/include/kernels/common/vector_type.h
@@ -77,8 +77,8 @@ struct IntrinsicCompute1Exists {
     template
     static auto test(...) -> std::false_type;
 
-    static constexpr bool value = decltype(
-        test(type::Constant::zero()))::value;
+    static constexpr bool value = decltype(test(
+        type::Constant::zero()))::value;
 };
 
 template
@@ -90,9 +90,9 @@ struct IntrinsicCompute2Exists {
     template
     static auto test(...) -> std::false_type;
 
-    static constexpr bool value = decltype(
-        test(type::Constant::zero(),
-             type::Constant::zero()))::value;
+    static constexpr bool value = decltype(test(
+        type::Constant::zero(),
+        type::Constant::zero()))::value;
 };
 
 template
@@ -197,11 +197,10 @@ struct DefaultNelemPerThread {
                                           : UnitOutDims::W;
 
     static const int value =
-        (sizeof(OutDataType) <= 2 && ConsecutiveDimLen % 8 == 0)
-            ? 8
-            : (ConsecutiveDimLen % 4 == 0)
-                  ? 4
-                  : (ConsecutiveDimLen % 2 == 0) ? 2 : 1;
+        (sizeof(OutDataType) <= 2 && ConsecutiveDimLen % 8 == 0) ? 8
+        : (ConsecutiveDimLen % 4 == 0)                           ? 4
+        : (ConsecutiveDimLen % 2 == 0)                           ? 2
+                                                                 : 1;
 };
diff --git a/ark/include/kernels/gemm_ck.h b/ark/include/kernels/gemm_ck.h
index 4054f2d37..05a6a23dc 100644
--- a/ark/include/kernels/gemm_ck.h
+++ b/ark/include/kernels/gemm_ck.h
@@ -90,13 +90,15 @@ struct CkGemmConfig::value;
 
     static constexpr auto MXdlPerWave =
         (TileSizeM == 16) ? 1
-                          : (TileSizeM < TileSizeN)
-                                ? 1 << (LogMNXdlPerWave / 2)
-                                : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2);
+        : (TileSizeM < TileSizeN)
+            ? 1 << (LogMNXdlPerWave / 2)
+            : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2);
     static constexpr auto NXdlPerWave = MNXdlPerWave / MXdlPerWave;
 
     static constexpr bool Is_256x256x128 =
@@ -197,13 +199,15 @@ struct CkGemmConfig
         , typename std::conditional, S<1, 0, 2>>::type,
         typename std::conditional, S<1, 0, 2>>::type,
-        (IsColA ? 1 : 2), (!IsColA ? 8 : Is_128x128x64 ? 4 : MXdlPerWave), 8,
-        true, S<4, NumThreads / 4, 1>,
+        (IsColA ? 1 : 2),
+        (!IsColA ? 8
+         : Is_128x128x64 ? 
4 + : MXdlPerWave), + 8, true, S<4, NumThreads / 4, 1>, typename std::conditional, S<0, 2, 1>>::type, typename std::conditional, S<0, 2, 1>>::type, (IsColB ? 2 : 1), - (IsColB ? 8 - : Is_128x32x256 - ? 8 - : (Is_128x32x128 || Is_128x64x128 || Is_128x128x128) - ? 4 - : (Is_128x32x64 || Is_64x32x32) ? 2 : NXdlPerWave), + (IsColB ? 8 + : Is_128x32x256 ? 8 + : (Is_128x32x128 || Is_128x64x128 || Is_128x128x128) ? 4 + : (Is_128x32x64 || Is_64x32x32) ? 2 + : NXdlPerWave), 8, true, 7, 1, 1, LoopSched, PipelineVer>; using ImplXdlCShuffle = @@ -234,16 +240,17 @@ struct CkGemmConfig, S<1, 0, 2>>::type, typename std::conditional, S<1, 0, 2>>::type, (IsColA ? 1 : 2), - (!IsColA ? 8 : (AK1 == 2 || Is_128x128x64) ? 4 : MXdlPerWave), AK1, - (AK1 == 8), S, + (!IsColA ? 8 + : (AK1 == 2 || Is_128x128x64) ? 4 + : MXdlPerWave), + AK1, (AK1 == 8), S, typename std::conditional, S<0, 2, 1>>::type, typename std::conditional, S<0, 2, 1>>::type, (IsColB ? 2 : 1), (IsColB ? 8 - : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || - Is_128x64x128) - ? 4 - : NXdlPerWave), + : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || Is_128x64x128) + ? 4 + : NXdlPerWave), BK1, (BK1 == 8), 1, 1, S<1, (Is_128x128x128 || Is_128x64x128 || Is_128x32x128 || @@ -255,16 +262,17 @@ struct CkGemmConfig; #if (DEBUG_CK != 0) - PrintDeviceGemmXdlCShuffle< - NumThreads, TileSizeM, TileSizeN, 32, AK1, BK1, 32, 32, MXdlPerWave, - NXdlPerWave, - (!IsColA ? 8 : (AK1 == 2 || Is_128x128x64) ? 4 : MXdlPerWave), - (IsColB - ? 8 - : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || Is_128x64x128) - ? 4 - : NXdlPerWave), - 1, 1> + PrintDeviceGemmXdlCShuffle p; #endif // (DEBUG_CK != 0) }; @@ -286,9 +294,9 @@ struct CkGemmConfig::value; static constexpr auto MXdlPerWave = (TileSizeM == 16) ? 1 - : (TileSizeM < TileSizeN) - ? 1 << (LogMNXdlPerWave / 2) - : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2); + : (TileSizeM < TileSizeN) + ? 1 << (LogMNXdlPerWave / 2) + : 1 << (LogMNXdlPerWave - LogMNXdlPerWave / 2); static constexpr auto NXdlPerWave = MNXdlPerWave / MXdlPerWave; static constexpr bool Is_256x256x128 = @@ -307,7 +315,8 @@ struct CkGemmConfig, S<1, 0, 2>>::type, typename std::conditional, S<1, 0, 2>>::type, (IsColA ? 1 : 2), - (!IsColA ? 8 : (AK1 == 2 || Is_128x128x64) ? 4 : MXdlPerWave), AK1, - (AK1 == 8), S, + (!IsColA ? 8 + : (AK1 == 2 || Is_128x128x64) ? 4 + : MXdlPerWave), + AK1, (AK1 == 8), S, typename std::conditional, S<0, 2, 1>>::type, typename std::conditional, S<0, 2, 1>>::type, (IsColB ? 2 : 1), (IsColB ? 8 - : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || - Is_128x64x128) - ? 4 - : NXdlPerWave), + : (BK1 == 2 || Is_256x128x256 || Is_128x128x128 || Is_128x64x128) + ? 4 + : NXdlPerWave), BK1, (BK1 == 8), 1, 1, S<1, (Is_128x128x128 || Is_128x64x128 || Is_128x32x128 || diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in new file mode 100644 index 000000000..13e1751fe --- /dev/null +++ b/ark/include/kernels/kernel_template.in @@ -0,0 +1,65 @@ +// THIS KERNEL IS MACHINE-GENERATED BY ARK. 
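+//
+// The @-delimited placeholders (@NUM_WARPS_PER_BLOCK@, @NUM_BLOCKS@, @NAME@,
+// @DEFINITIONS@, and @BODY@) are substituted by the code generator. task_seq()
+// below runs on the thread blocks whose indices lie in [ProcBegin, ProcEnd)
+// with stride ProcStep; each such block walks its share of task IDs and
+// invokes the generated task function for each of them.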
+#define ARK_WARPS_PER_BLOCK @NUM_WARPS_PER_BLOCK@
+#include "ark_kernels.h"
+using namespace ark;
+
+template <size_t ProcBegin, size_t ProcEnd, size_t ProcStep,
+          size_t ProcCurrent, size_t TaskBegin, size_t TaskEnd,
+          size_t TaskStep, size_t TaskGranularity, size_t NumSlots,
+          size_t SlotNumWarps, size_t SlotSramBytes,
+          void (*task)(char *, size_t, size_t)>
+__forceinline__ __device__ void task_seq(char *_buf) {
+    if (math::geq<ProcBegin>(blockIdx.x) && math::le<ProcEnd>(blockIdx.x) &&
+        ((blockIdx.x - ProcBegin) % ProcStep == 0)) {
+        constexpr size_t SlotNumThreads = SlotNumWarps * Arch::ThreadsPerWarp;
+        constexpr size_t NumProcs = (ProcEnd - ProcBegin + ProcStep - 1) / ProcStep;
+        constexpr size_t SramBytesPerWarp = SlotSramBytes / SlotNumWarps;
+        size_t p = ((blockIdx.x + gridDim.x - ProcCurrent) % gridDim.x) / ProcStep;
+        size_t k = threadIdx.x / SlotNumThreads;
+        size_t task_id_base = TaskBegin + p * TaskStep * TaskGranularity;
+        for (size_t t = k; ; t += NumSlots) {
+            size_t task_id = task_id_base + TaskStep *
+                (t % TaskGranularity + t / TaskGranularity * TaskGranularity * NumProcs);
+            if (task_id >= TaskEnd) break;
+            task(_buf, task_id, SramBytesPerWarp);
+        }
+        __syncthreads();
+    }
+}
+
+__device__ int ARK_ITER = 0;
+__device__ sync::State ARK_LOOP_SYNC_STATE;
+__device__ char *ARK_BUF;
+
+@DEFINITIONS@
+
+__device__ void ark_loop_body(char *_buf, int _iter) {
+@BODY@
+}
+
+extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1)
+void @NAME@(int *_iter) {
+    char *_buf = ARK_BUF;
+    int *shared_mem = (int *)_ARK_SMEM;
+    for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) {
+        shared_mem[i] = 0;
+    }
+    for (;;) {
+        if (threadIdx.x == 0 && blockIdx.x == 0) {
+            int iter;
+            while ((iter = atomicLoadRelaxed(_iter)) == 0) {}
+            ARK_ITER = iter;
+        }
+        sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
+        if (ARK_ITER < 0) return;
+
+        ark_loop_body(_buf, 0);
+        for (int _i = 1; _i < ARK_ITER; ++_i) {
+            sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
+            ark_loop_body(_buf, _i);
+        }
+        if (threadIdx.x == 0 && blockIdx.x == 0) {
+            atomicStoreRelaxed(_iter, 0);
+        }
+        sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
+    }
+}
diff --git a/ark/ark.cc b/ark/init.cpp
similarity index 91%
rename from ark/ark.cc
rename to ark/init.cpp
index 2392dcc83..0e6adcc4f 100644
--- a/ark/ark.cc
+++ b/ark/init.cpp
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "include/ark.h"
+#include "ark/init.hpp"
 
 #include
 #include
 
@@ -15,12 +15,6 @@
 
 namespace ark {
 
-std::string version() {
-    std::stringstream ss;
-    ss << ARK_MAJOR << "." << ARK_MINOR << "." << ARK_PATCH;
-    return ss.str();
-}
-
 void init() {
     LOG(DEBUG, "init ark");
 
diff --git a/ark/ark_test.cc b/ark/init_test.cpp
similarity index 50%
rename from ark/ark_test.cc
rename to ark/init_test.cpp
index 66dd6e8c2..5dede7138 100644
--- a/ark/ark_test.cc
+++ b/ark/init_test.cpp
@@ -1,40 +1,28 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "include/ark.h"
+#include "ark/init.hpp"
 
 #include "file_io.h"
 #include "unittest/unittest_utils.h"
 
-ark::unittest::State test_ark_version() {
-    auto version = ark::version();
-
-    // Check if the version string is in the correct format.
- auto dot1 = version.find('.'); - auto dot2 = version.find('.', dot1 + 1); - UNITTEST_NE(dot1, std::string::npos); - UNITTEST_NE(dot2, std::string::npos); - - return ark::unittest::SUCCESS; -} - -ark::unittest::State test_ark_init() { +ark::unittest::State test_init() { // invalid tmp directory ::setenv("ARK_TMP", "", 1); UNITTEST_THROW(ark::init(), ark::SystemError); // create a tmp directory - ::setenv("ARK_TMP", "/tmp/ark/.test_ark_init", 1); + ::setenv("ARK_TMP", "/tmp/ark/.test_init", 1); ::setenv("ARK_KEEP_TMP", "1", 1); ark::init(); // create a tmp file - ark::write_file("/tmp/ark/.test_ark_init/test", "test"); + ark::write_file("/tmp/ark/.test_init/test", "test"); // clear the tmp directory ::setenv("ARK_KEEP_TMP", "0", 1); ark::init(); - UNITTEST_TRUE(!ark::is_exist("/tmp/ark/.test_ark_init/test")); + UNITTEST_TRUE(!ark::is_exist("/tmp/ark/.test_init/test")); // given tmp directory is not a directory ::setenv("ARK_TMP", "/dev/null", 1); @@ -44,7 +32,6 @@ ark::unittest::State test_ark_init() { } int main() { - UNITTEST(test_ark_version); - UNITTEST(test_ark_init); + UNITTEST(test_init); return 0; } diff --git a/ark/ipc/ipc_coll.cc b/ark/ipc/ipc_coll.cpp similarity index 100% rename from ark/ipc/ipc_coll.cc rename to ark/ipc/ipc_coll.cpp diff --git a/ark/ipc/ipc_coll_test.cc b/ark/ipc/ipc_coll_test.cpp similarity index 96% rename from ark/ipc/ipc_coll_test.cc rename to ark/ipc/ipc_coll_test.cpp index 17ae39326..76adcedec 100644 --- a/ark/ipc/ipc_coll_test.cc +++ b/ark/ipc/ipc_coll_test.cpp @@ -3,7 +3,6 @@ #include "ipc/ipc_coll.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_ipc_coll_allgather() { @@ -31,7 +30,6 @@ ark::unittest::State test_ipc_coll_allgather() { } int main() { - ark::init(); UNITTEST(test_ipc_coll_allgather); return 0; } diff --git a/ark/ipc/ipc_hosts.cc b/ark/ipc/ipc_hosts.cpp similarity index 98% rename from ark/ipc/ipc_hosts.cc rename to ark/ipc/ipc_hosts.cpp index b41c315b0..a0a8bc417 100644 --- a/ark/ipc/ipc_hosts.cc +++ b/ark/ipc/ipc_hosts.cpp @@ -11,7 +11,6 @@ #include "env.h" #include "file_io.h" -#include "include/ark.h" #include "logging.h" namespace ark { diff --git a/ark/ipc/ipc_hosts_test.cc b/ark/ipc/ipc_hosts_test.cpp similarity index 97% rename from ark/ipc/ipc_hosts_test.cc rename to ark/ipc/ipc_hosts_test.cpp index 326f1c74b..410032852 100644 --- a/ark/ipc/ipc_hosts_test.cc +++ b/ark/ipc/ipc_hosts_test.cpp @@ -5,7 +5,6 @@ #include "env.h" #include "file_io.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_ipc_hosts() { @@ -42,7 +41,6 @@ ark::unittest::State test_ipc_hosts_unknown_host() { } int main() { - ark::init(); UNITTEST(test_ipc_hosts); UNITTEST(test_ipc_hosts_unknown_host); return 0; diff --git a/ark/ipc/ipc_lock.cc b/ark/ipc/ipc_lock.cpp similarity index 98% rename from ark/ipc/ipc_lock.cc rename to ark/ipc/ipc_lock.cpp index 50381a2e7..d66e91a61 100644 --- a/ark/ipc/ipc_lock.cc +++ b/ark/ipc/ipc_lock.cpp @@ -5,7 +5,6 @@ #include -#include "include/ark.h" #include "logging.h" namespace ark { diff --git a/ark/ipc/ipc_mem.cc b/ark/ipc/ipc_mem.cpp similarity index 99% rename from ark/ipc/ipc_mem.cc rename to ark/ipc/ipc_mem.cpp index babb4df4a..52bd033b1 100644 --- a/ark/ipc/ipc_mem.cc +++ b/ark/ipc/ipc_mem.cpp @@ -13,7 +13,6 @@ #include "cpu_timer.h" #include "env.h" -#include "include/ark.h" #include "ipc/ipc_shm.h" #include "logging.h" diff --git a/ark/ipc/ipc_mem_test.cc b/ark/ipc/ipc_mem_test.cpp similarity index 99% rename from 
ark/ipc/ipc_mem_test.cc rename to ark/ipc/ipc_mem_test.cpp index c25f90f88..e151b4d2f 100644 --- a/ark/ipc/ipc_mem_test.cc +++ b/ark/ipc/ipc_mem_test.cpp @@ -3,7 +3,6 @@ #include "ipc/ipc_mem.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_ipc_mem_lock_simple() { @@ -153,7 +152,6 @@ ark::unittest::State test_ipc_mem_realloc() { } int main() { - ark::init(); UNITTEST(test_ipc_mem_lock_simple); UNITTEST(test_ipc_mem_lock_many); UNITTEST(test_ipc_mem_finishing); diff --git a/ark/ipc/ipc_shm.cc b/ark/ipc/ipc_shm.cpp similarity index 99% rename from ark/ipc/ipc_shm.cc rename to ark/ipc/ipc_shm.cpp index d00075914..fa8641f74 100644 --- a/ark/ipc/ipc_shm.cc +++ b/ark/ipc/ipc_shm.cpp @@ -16,7 +16,6 @@ #include #include -#include "include/ark.h" #include "logging.h" #define SHM_DIR "/dev/shm/" diff --git a/ark/ipc/ipc_socket.cc b/ark/ipc/ipc_socket.cpp similarity index 99% rename from ark/ipc/ipc_socket.cc rename to ark/ipc/ipc_socket.cpp index 43f64d9b3..6bb0a7b98 100644 --- a/ark/ipc/ipc_socket.cc +++ b/ark/ipc/ipc_socket.cpp @@ -11,7 +11,6 @@ #include -#include "include/ark.h" #include "logging.h" #define MAX_LISTEN_LEN 4096 diff --git a/ark/ipc/ipc_socket_test.cc b/ark/ipc/ipc_socket_test.cpp similarity index 98% rename from ark/ipc/ipc_socket_test.cc rename to ark/ipc/ipc_socket_test.cpp index 26257a235..dd2dc8119 100644 --- a/ark/ipc/ipc_socket_test.cc +++ b/ark/ipc/ipc_socket_test.cpp @@ -4,7 +4,6 @@ #include "ipc/ipc_socket.h" #include "env.h" -#include "include/ark.h" #include "ipc/ipc_hosts.h" #include "logging.h" #include "unittest/unittest_utils.h" @@ -124,7 +123,6 @@ ark::unittest::State test_ipc_socket_no_item() { } int main() { - ark::init(); UNITTEST(test_ipc_socket_simple); UNITTEST(test_ipc_socket_no_item); return ark::unittest::SUCCESS; diff --git a/ark/logging.cc b/ark/logging.cpp similarity index 100% rename from ark/logging.cc rename to ark/logging.cpp diff --git a/ark/logging.h b/ark/logging.h index d84795315..d29793ff7 100644 --- a/ark/logging.h +++ b/ark/logging.h @@ -8,6 +8,8 @@ #include #include +#include "error.hpp" + namespace ark { typedef enum { DEBUG, INFO, WARN, ERROR } LogLevel; diff --git a/ark/math_utils.cc b/ark/math_utils.cpp similarity index 98% rename from ark/math_utils.cc rename to ark/math_utils.cpp index 2f49d870a..3efa2d6a1 100644 --- a/ark/math_utils.cc +++ b/ark/math_utils.cpp @@ -3,7 +3,6 @@ #include "math_utils.h" -#include "include/ark.h" #include "logging.h" namespace ark { diff --git a/ark/math_utils.h b/ark/math_utils.h index cf1759a39..1780876da 100644 --- a/ark/math_utils.h +++ b/ark/math_utils.h @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-#ifndef ARK_MATH_H_ -#define ARK_MATH_H_ +#ifndef ARK_MATH_UTILS_H_ +#define ARK_MATH_UTILS_H_ #include @@ -19,4 +19,4 @@ size_t lcm(size_t a, size_t b); } // namespace math } // namespace ark -#endif // ARK_MATH_H_ +#endif // ARK_MATH_UTILS_H_ diff --git a/ark/math_utils_test.cc b/ark/math_utils_test.cpp similarity index 98% rename from ark/math_utils_test.cc rename to ark/math_utils_test.cpp index d56b6c0d6..21c9f47a8 100644 --- a/ark/math_utils_test.cc +++ b/ark/math_utils_test.cpp @@ -3,7 +3,6 @@ #include "math_utils.h" -#include "include/ark.h" #include "unittest/unittest_utils.h" ark::unittest::State test_math() { @@ -102,7 +101,6 @@ ark::unittest::State test_math() { } int main() { - ark::init(); UNITTEST(test_math); return 0; } diff --git a/ark/model.cc b/ark/model.cc index 52c17d016..645b4ded4 100644 --- a/ark/model.cc +++ b/ark/model.cc @@ -19,19 +19,6 @@ namespace ark { -bool operator==(const OpArgType &lhs, const OpArgType &rhs) { - return lhs.id == rhs.id; -} - -bool operator!=(const OpArgType &lhs, const OpArgType &rhs) { - return !(lhs == rhs); -} - -std::ostream &operator<<(std::ostream &os, const OpArgType &type) { - os << type.name; - return os; -} - OpArchType op_arch_from_string(const std::string &arch) { if (arch == "cuda_60") { return OP_ARCH_CUDA_60; @@ -109,216 +96,14 @@ const std::vector &OpConfigMap::get(const OpConfigKey &key) const { return NoneConfigs; } -OpArg::OpArg(int arg) : type{OP_ARG_INT}, val{new int{arg}} { - assert(this->val != nullptr); -} -OpArg::OpArg(DimType arg) : type{OP_ARG_INT64}, val{new DimType{arg}} { - assert(this->val != nullptr); -} -OpArg::OpArg(uint64_t arg) : type{OP_ARG_UINT64}, val{new uint64_t{arg}} { - assert(this->val != nullptr); -} -OpArg::OpArg(bool arg) : type{OP_ARG_BOOL}, val{new bool{arg}} { - assert(this->val != nullptr); -} -OpArg::OpArg(float arg) : type{OP_ARG_FLOAT}, val{new float{arg}} { - assert(this->val != nullptr); -} -OpArg::OpArg(const Dims &arg) : type{OP_ARG_DIMS}, val{new Dims{arg}} { - assert(this->val != nullptr); -} -OpArg::OpArg(Tensor *arg) : type{OP_ARG_TENSOR}, val{arg} { - assert(this->val != nullptr); -} -OpArg::OpArg(const OpArg &arg) : type{arg.type} { - if (this->type == OP_ARG_INT) { - this->val = new int{*(int *)arg.val}; - } else if (this->type == OP_ARG_INT64) { - this->val = new DimType{*(DimType *)arg.val}; - } else if (this->type == OP_ARG_UINT64) { - this->val = new uint64_t{*(uint64_t *)arg.val}; - } else if (this->type == OP_ARG_BOOL) { - this->val = new bool{*(bool *)arg.val}; - } else if (this->type == OP_ARG_FLOAT) { - this->val = new float{*(float *)arg.val}; - } else if (this->type == OP_ARG_DIMS) { - this->val = new Dims{*(Dims *)arg.val}; - } else if (this->type == OP_ARG_TENSOR) { - this->val = arg.val; - } else { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } -} -OpArg::~OpArg() { - if (this->type == OP_ARG_INT) { - delete static_cast(this->val); - } else if (this->type == OP_ARG_INT64) { - delete static_cast(this->val); - } else if (this->type == OP_ARG_UINT64) { - delete static_cast(this->val); - } else if (this->type == OP_ARG_BOOL) { - delete static_cast(this->val); - } else if (this->type == OP_ARG_FLOAT) { - delete static_cast(this->val); - } else if (this->type == OP_ARG_DIMS) { - delete static_cast(this->val); - } else if (this->type == OP_ARG_TENSOR) { - // Do nothing - } -} -void OpArg::get(int *arg) const { - if (this->type != OP_ARG_INT) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = 
*static_cast(this->val); -} - -void OpArg::get(long long int *arg) const { - if (this->type != OP_ARG_INT64) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = *static_cast(this->val); -} - -void OpArg::get(uint64_t *arg) const { - if (this->type != OP_ARG_UINT64) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = *static_cast(this->val); -} - -void OpArg::get(bool *arg) const { - if (this->type != OP_ARG_BOOL) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = *static_cast(this->val); -} - -void OpArg::get(float *arg) const { - if (this->type != OP_ARG_FLOAT) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = *static_cast(this->val); -} - -void OpArg::get(Dims *arg) const { - if (this->type != OP_ARG_DIMS) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = *static_cast(this->val); -} - -void OpArg::get(Tensor **arg) const { - if (this->type != OP_ARG_TENSOR) { - ERR(InvalidUsageError, "invalid argument type ", this->type.name); - } - *arg = static_cast(this->val); -} - -OpArgs::OpArgs(const std::vector &args) : args{args} {} - -OpArgs &OpArgs::operator=(const OpArgs &opargs) { - if (this != &opargs) { - this->args = opargs.args; - } - return *this; -} - -void OpArgs::put(const OpArg &arg) { this->args.emplace_back(arg); } - -void OpArgs::get(int *arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_INT) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = *static_cast(this->args[idx].val); -} - -void OpArgs::get(long long int *arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_INT64) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = *static_cast(this->args[idx].val); -} - -void OpArgs::get(uint64_t *arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_UINT64) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = *static_cast(this->args[idx].val); -} - -void OpArgs::get(bool *arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_BOOL) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = *static_cast(this->args[idx].val); -} - -void OpArgs::get(float *arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_FLOAT) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = *static_cast(this->args[idx].val); -} - -void OpArgs::get(Dims *arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_DIMS) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = *static_cast(this->args[idx].val); -} 
- -void OpArgs::get(Tensor **arg, size_t idx) const { - if (this->args.size() <= idx) { - ERR(InvalidUsageError, "invalid argument index ", idx, " size ", - this->args.size()); - } - if (this->args[idx].type != OP_ARG_TENSOR) { - ERR(InvalidUsageError, "invalid argument type ", - this->args[idx].type.name); - } - *arg = static_cast(this->args[idx].val); -} - -const std::vector &OpArgs::get_args() const { return this->args; } - bool operator==(const OpType &lhs, const OpType &rhs) { return lhs.id == rhs.id; } Op::Op(const OpType &type_, const std::string &prec_type_, const std::vector &inputs_, - const std::vector &output_refs_, const OpArgs &args_, + const std::vector &output_refs_, + const std::map &args_, const std::string &name_, const OpConfigMap *cfg_map_, int gran_lev_, bool force_inline_) : type{type_}, @@ -422,7 +207,7 @@ std::string Op::function_name(const OpConfig &cfg) const { return ""; } -OpArgs Op::function_call_args(const OpConfig &cfg) const { +std::vector Op::function_call_args(const OpConfig &cfg) const { if (this->type.id == OP_SCALE.id) { return static_cast(this)->function_call_args(cfg); } else if (this->type.id == OP_SEND.id) { @@ -448,11 +233,11 @@ OpArgs Op::function_call_args(const OpConfig &cfg) const { return static_cast(this)->function_call_args( cfg); } else { - OpArgs opargs; + std::vector opargs; std::vector deps = this->outputs; deps.insert(deps.end(), this->inputs.begin(), this->inputs.end()); for (Tensor *tns : deps) { - opargs.put(tns); + opargs.emplace_back(ModelOpArg(tns)); } return opargs; } @@ -461,38 +246,39 @@ OpArgs Op::function_call_args(const OpConfig &cfg) const { } std::string Op::function_name(const std::string &kernel_name, - const OpArgs &template_args) { + const std::vector &template_args) { std::stringstream ss; ss << kernel_name; - size_t num_args = template_args.args.size(); + size_t num_args = template_args.size(); if (num_args == 0) { return ss.str(); } ss << "<"; for (size_t i = 0; i < num_args; ++i) { - auto &arg = template_args.args[i]; - if (arg.type == OP_ARG_INT) { + auto &arg = template_args[i]; + if (arg.type_name() == "INT") { int val; - template_args.get(&val, i); + arg.get_value(&val); ss << val; - } else if (arg.type == OP_ARG_INT64) { + } else if (arg.type_name() == "INT64") { long long int val; - template_args.get(&val, i); + arg.get_value(&val); ss << val; - } else if (arg.type == OP_ARG_UINT64) { + } else if (arg.type_name() == "UINT64") { uint64_t val; - template_args.get(&val, i); + arg.get_value(&val); ss << val; - } else if (arg.type == OP_ARG_BOOL) { + } else if (arg.type_name() == "BOOL") { bool val; - template_args.get(&val, i); + arg.get_value(&val); ss << (val ? 
"true" : "false"); - } else if (arg.type == OP_ARG_FLOAT) { + } else if (arg.type_name() == "FLOAT") { ERR(ModelError, "float template args are not supported"); - } else if (arg.type == OP_ARG_DIMS) { + } else if (arg.type_name() == "DIMS") { Dims val; - template_args.get(&val, i); + arg.get_value(&val); ss << "ark::Vec" << val; + } else { } if (i < num_args - 1) { ss << ", "; @@ -542,8 +328,8 @@ void Model::Impl::destroy_tensor_buf(const TensorBuf *buf) { std::vector Model::Impl::add_op( const OpType type, const std::string &prec_type, const std::vector &inputs, const std::vector &outputs, - const OpArgs &args, const std::string &name, const OpConfigMap *cfg_map, - int gran_lev) { + const std::map &args, const std::string &name, + const OpConfigMap *cfg_map, int gran_lev) { Op op{type, prec_type, inputs, outputs, args, name, cfg_map, gran_lev}; return this->add_op(op); } @@ -1163,6 +949,17 @@ bool Model::Impl::depends_on(OpNode *node1, OpNode *node2) const { return false; } +nlohmann::json to_json(const Dims &dims) { + if (dims.is_invalid()) { + ERR(InvalidUsageError, "invalid dims given"); + } + nlohmann::json j; + for (auto i = 0; i < dims.ndims(); ++i) { + j[i] = dims[i]; + } + return j; +} + nlohmann::json to_json(const TensorBuf &tensor_buf) { nlohmann::json j; j["Id"] = tensor_buf.id; @@ -1173,51 +970,54 @@ nlohmann::json to_json(const TensorBuf &tensor_buf) { nlohmann::json to_json(const Tensor &tensor) { nlohmann::json j; j["Id"] = tensor.id; - j["TensorBuf"] = to_json(*(tensor.buf)); + j["BufferId"] = tensor.buf->id; j["TensorType"] = tensor.type.type_str(); - j["Shape"] = tensor.shape.serialize(); - j["Strides"] = tensor.ldims.serialize(); - j["Offsets"] = tensor.offs.serialize(); + j["Shape"] = to_json(tensor.shape); + if (tensor.shape != tensor.ldims) { + j["Strides"] = to_json(tensor.ldims); + } + if (!tensor.offs.is_zeros()) { + j["Offsets"] = to_json(tensor.offs); + } if (tensor.imported_rank >= 0) { j["ImportedRank"] = tensor.imported_rank; } return j; } -nlohmann::json to_json(const OpArg &op_arg) { +nlohmann::json to_json(const ModelOpArgT &op_arg) { nlohmann::json j; - j["Type"] = op_arg.type.name; - if (op_arg.type == OP_ARG_TENSOR) { + auto type_name = op_arg.type_name(); + if (type_name == "TENSOR") { Tensor *tns; - op_arg.get(&tns); - j["Value"] = tns->id; - } else if (op_arg.type == OP_ARG_FLOAT) { + op_arg.get_value(&tns); + j[type_name] = tns->id; + } else if (type_name == "FLOAT") { float val; - op_arg.get(&val); - j["Value"] = val; - } else if (op_arg.type == OP_ARG_INT) { + op_arg.get_value(&val); + j[type_name] = val; + } else if (type_name == "INT") { int val; - op_arg.get(&val); - j["Value"] = val; - } else if (op_arg.type == OP_ARG_BOOL) { + op_arg.get_value(&val); + j[type_name] = val; + } else if (type_name == "BOOL") { bool val; - op_arg.get(&val); - j["Value"] = val; - } else if (op_arg.type == OP_ARG_INT64) { + op_arg.get_value(&val); + j[type_name] = val; + } else if (type_name == "INT64") { long long int val; - op_arg.get(&val); - j["Value"] = val; - } else if (op_arg.type == OP_ARG_UINT64) { + op_arg.get_value(&val); + j[type_name] = val; + } else if (type_name == "UINT64") { uint64_t val; - op_arg.get(&val); - j["Value"] = val; - } else if (op_arg.type == OP_ARG_DIMS) { + op_arg.get_value(&val); + j[type_name] = val; + } else if (type_name == "DIMS") { Dims dims; - op_arg.get(&dims); - j["Value"] = dims.serialize(); + op_arg.get_value(&dims); + j[type_name] = to_json(dims); } else { - throw std::runtime_error("unexpected OpArg: " + - 
std::string(op_arg.type.name)); + throw std::runtime_error("unexpected OpArg: " + type_name); } return j; } @@ -1225,22 +1025,22 @@ nlohmann::json to_json(const OpArg &op_arg) { nlohmann::json to_json(const Op &op) { nlohmann::json j; j["Type"] = op.type.name; - j["PrecisionType"] = op.prec_type; - j["InputTensors"] = nlohmann::json(); + j["Precision"] = op.prec_type; + j["InputTensorIds"] = nlohmann::json::array(); for (auto tensor : op.inputs) { - j["InputTensors"].emplace_back(to_json(*tensor)); + j["InputTensorIds"].emplace_back(tensor->id); } - j["OutputTensors"] = nlohmann::json(); + j["OutputTensorIds"] = nlohmann::json::array(); for (auto tensor : op.inputs) { - j["OutputTensors"].emplace_back(to_json(*tensor)); + j["OutputTensorIds"].emplace_back(tensor->id); } - j["OutputRefTensors"] = nlohmann::json(); + j["OutputRefTensorIds"] = nlohmann::json::array(); for (auto tensor : op.inputs) { - j["OutputRefTensors"].emplace_back(to_json(*tensor)); + j["OutputRefTensorIds"].emplace_back(tensor->id); } - j["Args"] = nlohmann::json(); - for (auto arg : op.args.get_args()) { - j["Args"].emplace_back(to_json(arg)); + j["Args"] = nlohmann::json::object(); + for (const auto &p : op.args) { + j["Args"][p.first] = to_json(p.second); } return j; } @@ -1249,15 +1049,11 @@ nlohmann::json to_json(const OpNode &node, const std::map &node2id) { nlohmann::json j; j["Id"] = node2id.at(&node); - j["Ops"] = nlohmann::json(); + j["Ops"] = nlohmann::json::array(); for (auto op : node.ops) { j["Ops"].emplace_back(to_json(*op)); } - j["ConsumerNodeIds"] = nlohmann::json(); - for (auto user : node.users) { - j["ConsumerNodeIds"].emplace_back(node2id.at(user)); - } - j["ProducerNodeIds"] = nlohmann::json(); + j["ProducerNodeIds"] = nlohmann::json::array(); for (auto producer : node.producers) { j["ProducerNodeIds"].emplace_back(node2id.at(producer)); } @@ -1271,10 +1067,18 @@ std::string Model::Impl::serialize(int indent) const { node2id[node.get()] = id++; } nlohmann::json j; - j["Nodes"] = nlohmann::json(); + j["Nodes"] = nlohmann::json::array(); for (const auto &node : this->get_nodes()) { j["Nodes"].emplace_back(to_json(*node, node2id)); } + j["TensorBufs"] = nlohmann::json::array(); + for (const auto &tbuf : this->tns_bufs_storage) { + j["TensorBufs"].emplace_back(to_json(*tbuf)); + } + j["Tensors"] = nlohmann::json::array(); + for (const auto &tns : this->tns_storage) { + j["Tensors"].emplace_back(to_json(*tns)); + } return j.dump(indent); } diff --git a/ark/model.h b/ark/model.h index f97aae4ec..63f50d848 100644 --- a/ark/model.h +++ b/ark/model.h @@ -48,7 +48,8 @@ class Model::Impl { const std::string &prec_type, const std::vector &inputs, const std::vector &output_refs, - const OpArgs &args, const std::string &name, + const std::map &args, + const std::string &name, const OpConfigMap *cfg_map, int gran_lev = -1); /// Add a new @ref Op to the model. diff --git a/ark/model/model.cpp b/ark/model/model.cpp new file mode 100644 index 000000000..e3111ad45 --- /dev/null +++ b/ark/model/model.cpp @@ -0,0 +1,14 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/model.hpp" + +namespace ark { + +Model Model::compress() const { + Model model(*this); + model.compress_nodes(); + return model; +} + +} // namespace ark diff --git a/ark/model/model_data_type.cpp b/ark/model/model_data_type.cpp new file mode 100644 index 000000000..a62873ab2 --- /dev/null +++ b/ark/model/model_data_type.cpp @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. 
+// Licensed under the MIT license.
+
+#include "model_data_type.hpp"
+
+#include <map>
+
+#include "bfloat16.h"
+#include "half.h"
+#include "logging.h"
+
+namespace ark {
+
+///
+/// NOTE: how to add a new data type
+/// 1. Add an instance using the `MODEL_DATA_TYPE_INSTANCE()` macro.
+/// 2. Add a registration using the `MODEL_DATA_TYPE_REGISTER()` macro.
+/// 3. Expose the symbol in `include/ark/model.hpp`.
+///
+
+#define MODEL_DATA_TYPE_INSTANCE(_name, _type) \
+    extern const ModelDataType _name =         \
+        std::make_shared<ModelDataT>(#_name, #_type, sizeof(_type));
+
+#define MODEL_DATA_TYPE_REGISTER(_name) instances[#_name] = _name;
+
+extern const ModelDataType NONE =
+    std::make_shared<ModelDataT>("NONE", "void", 0);
+MODEL_DATA_TYPE_INSTANCE(FP32, float);
+MODEL_DATA_TYPE_INSTANCE(FP16, fp16);
+MODEL_DATA_TYPE_INSTANCE(BF16, bf16);
+MODEL_DATA_TYPE_INSTANCE(INT32, int32_t);
+MODEL_DATA_TYPE_INSTANCE(UINT32, uint32_t);
+MODEL_DATA_TYPE_INSTANCE(INT8, int8_t);
+MODEL_DATA_TYPE_INSTANCE(UINT8, uint8_t);
+MODEL_DATA_TYPE_INSTANCE(BYTE, char);
+
+const ModelDataType ModelDataT::from_name(const std::string &type_name) {
+    static std::map<std::string, ModelDataType> instances;
+    if (instances.empty()) {
+        MODEL_DATA_TYPE_REGISTER(NONE);
+        MODEL_DATA_TYPE_REGISTER(FP32);
+        MODEL_DATA_TYPE_REGISTER(FP16);
+        MODEL_DATA_TYPE_REGISTER(BF16);
+        MODEL_DATA_TYPE_REGISTER(INT32);
+        MODEL_DATA_TYPE_REGISTER(UINT32);
+        MODEL_DATA_TYPE_REGISTER(INT8);
+        MODEL_DATA_TYPE_REGISTER(UINT8);
+        MODEL_DATA_TYPE_REGISTER(BYTE);
+    }
+    auto it = instances.find(type_name);
+    if (it == instances.end()) {
+        ERR(InvalidUsageError, "Unknown data type: ", type_name);
+    }
+    return it->second;
+}
+
+}  // namespace ark
diff --git a/ark/model/model_data_type.hpp b/ark/model/model_data_type.hpp
new file mode 100644
index 000000000..c7127a34a
--- /dev/null
+++ b/ark/model/model_data_type.hpp
@@ -0,0 +1,40 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_MODEL_DATA_TYPE_HPP_
+#define ARK_MODEL_DATA_TYPE_HPP_
+
+#include <memory>
+#include <string>
+
+#include "named_type.hpp"
+
+namespace ark {
+
+class ModelDataT;
+using ModelDataType = std::shared_ptr<ModelDataT>;
+
+class ModelDataT : public NamedT {
+   public:
+    ModelDataT(const std::string &type_name, const std::string &type_str,
+               size_t bytes)
+        : NamedT(type_name), type_str_(type_str), bytes_(bytes) {}
+
+    ModelDataT(const ModelDataT &) = default;
+
+    const std::string &type_str() const { return type_str_; }
+
+    size_t bytes() const { return bytes_; }
+
+    static const ModelDataType from_name(const std::string &type_name);
+
+   private:
+    std::string type_str_;
+    size_t bytes_;
+};
+
+using ModelDataType = std::shared_ptr<ModelDataT>;
+
+}  // namespace ark
+
+#endif  // ARK_MODEL_DATA_TYPE_HPP_
diff --git a/ark/model/model_graph_impl.cpp b/ark/model/model_graph_impl.cpp
new file mode 100644
index 000000000..c97d048ea
--- /dev/null
+++ b/ark/model/model_graph_impl.cpp
@@ -0,0 +1,521 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "model_graph_impl.hpp"
+
+#include "logging.h"
+#include "model_node.hpp"
+#include "model_tensor.hpp"
+
+#define DEBUG_MODEL_GRAPH 0
+#define MODEL_GRAPH_DEBUG(...) 
\ + do { \ + if (DEBUG_MODEL_GRAPH) { \ + LOG(DEBUG, __VA_ARGS__); \ + } \ + } while (0); + +namespace ark { + +ModelGraph::Impl::Impl(const ModelGraph::Impl &other) { *this = other; } + +ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) { + std::map node_map; + nodes_.clear(); + for (const auto &node : other.nodes_) { + ModelNodeRef new_node = std::make_shared(); + new_node->ops = node->ops; + node_map.emplace(node, new_node); + nodes_.push_back(new_node); + } + for (const auto &node : other.nodes_) { + auto it = node_map.find(node); + if (it == node_map.end()) { + ERR(ModelError, "unexpected error"); + } + ModelNodeRef new_node = it->second; + for (auto &producer : node->producers) { + auto it2 = node_map.find(producer); + if (it2 == node_map.end()) { + ERR(ModelError, "unexpected error"); + } + new_node->producers.push_back(it2->second); + } + for (auto &consumer : node->consumers) { + auto it2 = node_map.find(consumer); + if (it2 == node_map.end()) { + ERR(ModelError, "unexpected error"); + } + new_node->consumers.push_back(it2->second); + } + } + op_to_node_.clear(); + for (const auto &p : other.op_to_node_) { + auto it = node_map.find(p.second); + if (it == node_map.end()) { + ERR(ModelError, "unexpected error"); + } + op_to_node_[p.first] = it->second; + } + tensor_to_producer_op_ = other.tensor_to_producer_op_; + return *this; +} + +ModelNodeRef ModelGraph::Impl::break_node(ModelNodeRef node, size_t op_idx) { + if (op_idx == 0) { + return node; + } + if (op_idx >= node->ops.size()) { + ERR(ModelError, "unexpected error: op_idx out of range"); + } + ModelNodeRef new_node = std::make_shared(); + nodes_.push_back(new_node); + new_node->ops.insert(new_node->ops.end(), node->ops.begin() + op_idx, + node->ops.end()); + for (auto &op : new_node->ops) { + op_to_node_[op] = new_node; + } + new_node->consumers = node->consumers; + new_node->producers.push_back(node); + for (auto &consumer : node->consumers) { + consumer->producers.erase(node); + consumer->producers.push_back(new_node); + } + node->ops.erase(node->ops.begin() + op_idx, node->ops.end()); + node->consumers.clear(); + node->consumers.push_back(new_node); + return new_node; +} + +void ModelGraph::Impl::compress_nodes() { + this->recursive_remove_virtual_nodes(); + this->recursive_merge_nodes(); +} + +bool ModelGraph::Impl::verify() const { + for (auto &node : nodes_) { + if (node->ops.size() == 0) { + LOG(DEBUG, "node has no ops"); + return false; + } + for (auto &op : node->ops) { + if (op_to_node_.find(op) == op_to_node_.end()) { + LOG(DEBUG, "op has not been added to the graph"); + return false; + } + if (op_to_node_.at(op) != node) { + LOG(DEBUG, "op is not in the correct node"); + return false; + } + op->verify(); + for (auto &tns : op->result_tensors()) { + if (tensor_to_producer_op_.find(tns) == + tensor_to_producer_op_.end()) { + LOG(DEBUG, "result tensor has not been produced by any op"); + return false; + } + if (tensor_to_producer_op_.at(tns) != op) { + LOG(DEBUG, "result tensor has been produced by another op"); + return false; + } + } + for (auto &tns : op->input_tensors()) { + if (tensor_to_producer_op_.find(tns) == + tensor_to_producer_op_.end()) { + LOG(DEBUG, "input tensor has not been produced by any op"); + return false; + } + } + } + for (auto &producer : node->producers) { + if (producer->consumers.find(node) == producer->consumers.end()) { + LOG(DEBUG, "producer does not have this node as consumer"); + return false; + } + } + for (auto &consumer : node->consumers) { + if 
(consumer->producers.find(node) == consumer->producers.end()) { + LOG(DEBUG, "consumer does not have this node as producer"); + return false; + } + } + } + return true; +} + +ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) { + for (auto &tns : op->input_tensors()) { + if (tensor_to_producer_op_.find(tns) == tensor_to_producer_op_.end()) { + // This tensor has not been produced by any op - assume it is a + // Tensor op. + ModelOpRef tensor_op = std::make_shared("Tensor", true); + tensor_op->result_tensors_ = {tns}; + this->add_op(tensor_op); + } + } + for (auto &tns : op->result_tensors()) { + if (tensor_to_producer_op_.find(tns) != tensor_to_producer_op_.end()) { + ERR(ModelError, "Tensor has already been produced by an op. ", + tns->serialize().dump(), "; ", + tensor_to_producer_op_.at(tns)->serialize().dump()); + } + tensor_to_producer_op_.emplace(tns, op); + } + + ModelNodeRef node = std::make_shared(); + node->ops.push_back(op); + op_to_node_[op] = node; + + for (auto &tns : op->input_tensors()) { + auto it = tensor_to_producer_op_.find(tns); + if (it == tensor_to_producer_op_.end()) { + ERR(ModelError, "Tensor has not been produced by any op. ", + tns->serialize().dump(), " ", tns.get()); + } + auto it2 = op_to_node_.find(it->second); + if (it2 == op_to_node_.end()) { + ERR(ModelError, "Op has not been added to the graph"); + } + auto producer = it2->second; + node->producers.push_back(producer); + producer->consumers.push_back(node); + } + + nodes_.push_back(node); + return node; +} + +void ModelGraph::Impl::remove_node(ModelNodeRef node) { + auto it = nodes_.find(node); + if (it == nodes_.end()) { + ERR(ModelError, "attempted to remove a node that is not in the graph"); + } + // Remove node from consumers and producers. + for (auto &consumer : node->consumers) { + consumer->producers.erase(node); + } + for (auto &producer : node->producers) { + producer->consumers.erase(node); + } + // Connect consumers and producers. 
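+    // Each former consumer inherits every producer of the removed node, so
+    // any ordering constraint that passed through this node is preserved.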
+ for (auto &consumer : node->consumers) { + for (auto &producer : node->producers) { + consumer->producers.push_back(producer); + producer->consumers.push_back(consumer); + } + } + for (auto &op : node->ops) { + auto it = op_to_node_.find(op); + if (it == op_to_node_.end()) { + ERR(ModelError, "unexpected error"); + } + if (it->second == node) { + op_to_node_.erase(it); + } + } + nodes_.erase(it); +} + +bool ModelGraph::Impl::depends_on(ModelNodeRef node1, + ModelNodeRef node2) const { + if (node1 == node2) { + return false; + } + std::set seen_nodes; + std::vector boundary_nodes; + boundary_nodes.emplace_back(node1); + while (boundary_nodes.size() > 0) { + std::vector new_boundary_nodes; + for (auto &boundary_node : boundary_nodes) { + if (boundary_node == node2) { + return true; + } + for (auto &producer : boundary_node->producers) { + if (seen_nodes.find(producer) != seen_nodes.end()) { + continue; + } + new_boundary_nodes.emplace_back(producer); + } + } + boundary_nodes = new_boundary_nodes; + } + return false; +} + +void ModelGraph::Impl::recursive_remove_virtual_nodes() { + std::vector leaf_nodes; + for (auto &node : nodes_) { + if (node->consumers.empty()) { + leaf_nodes.emplace_back(node); + } + } + UniqueList seen_nodes; + this->recursive_remove_virtual_nodes(seen_nodes, leaf_nodes); +} + +void ModelGraph::Impl::recursive_remove_virtual_nodes( + UniqueList &seen_nodes, + const std::vector &boundary_nodes) { + if (boundary_nodes.size() == 0) { + return; + } + MODEL_GRAPH_DEBUG("remove virtual nodes"); + std::vector new_boundary_nodes; + for (auto &boundary_node : boundary_nodes) { + if (boundary_node->ops.size() == 0) { + ERR(ModelError, "unexpected error: empty node"); + } else if (boundary_node->ops.size() > 1) { + ERR(ModelError, "unexpected error: multiple ops in node"); + } + MODEL_GRAPH_DEBUG(" boundary node"); + MODEL_GRAPH_DEBUG(" node: ", to_json(boundary_node).dump()); + for (auto &producer : boundary_node->producers) { + // Exception: if any consumer of the producer (rather than the + // current boundary_node) is unseen, we should not add the producer + // to the next boundary. + bool should_add = true; + for (auto &consumer : producer->consumers) { + if (consumer == boundary_node) { + continue; + } + if (!seen_nodes.contains(consumer)) { + should_add = false; + break; + } + } + if (!should_add) { + continue; + } + if (seen_nodes.contains(producer)) { + ERR(ModelError, + "circular dependency detected: ", to_json(producer).dump()); + } + MODEL_GRAPH_DEBUG(" added to next boundary: ", + to_json(producer).dump()); + new_boundary_nodes.emplace_back(producer); + } + if (boundary_node->ops[0]->is_virtual()) { + MODEL_GRAPH_DEBUG(" remove node: ", + to_json(boundary_node).dump()); + // Remove this node from the graph. 
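+            // A virtual op performs no computation, so dropping its node
+            // preserves the model's semantics; remove_node() reconnects the
+            // node's producers directly to its consumers.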
+ this->remove_node(boundary_node); + MODEL_GRAPH_DEBUG(" nodes.size() ", nodes_.size()); + } else { + seen_nodes.push_back(boundary_node); + } + } + this->recursive_remove_virtual_nodes(seen_nodes, new_boundary_nodes); +} + +void ModelGraph::Impl::recursive_merge_nodes() { + std::vector leaf_nodes; + for (auto &node : nodes_) { + if (node->consumers.empty()) { + leaf_nodes.emplace_back(node); + } + } + UniqueList seen_nodes; + this->recursive_merge_nodes(seen_nodes, leaf_nodes); +} + +void ModelGraph::Impl::recursive_merge_nodes( + UniqueList &seen_nodes, + const std::vector &boundary_nodes) { + if (boundary_nodes.size() == 0) { + return; + } + MODEL_GRAPH_DEBUG("merge ops"); + std::vector new_boundary_nodes; + for (auto &boundary_node : boundary_nodes) { + MODEL_GRAPH_DEBUG(" boundary node"); + MODEL_GRAPH_DEBUG(" node: ", to_json(boundary_node).dump()); + if (boundary_node->producers.size() == 0) { + // This node is a root. + seen_nodes.push_back(boundary_node); + MODEL_GRAPH_DEBUG(" root"); + continue; + } + // Add all producers of this node to the next boundary. + for (auto &producer : boundary_node->producers) { + // Exception: if any consumer of the producer (rather than the + // current boundary_node) is unseen, we should not add the producer + // to the next boundary. + bool should_add = true; + for (auto &consumer : producer->consumers) { + if (consumer == boundary_node) { + continue; + } + if (!seen_nodes.contains(consumer)) { + should_add = false; + break; + } + } + if (!should_add) { + continue; + } + if (seen_nodes.contains(producer)) { + ERR(ModelError, + "unexpected error: circular dependency detected"); + } + new_boundary_nodes.emplace_back(producer); + } + ModelNodeRef merge_candidate; + if (boundary_node->producers.size() > 1) { + // This node has multiple producers. We can merge only if one + // producer depends on all other producers. + for (auto &producer : boundary_node->producers) { + bool depends_on_all = true; + for (auto &other_producer : boundary_node->producers) { + if (other_producer == producer) { + continue; + } + if (!this->depends_on(producer, other_producer)) { + depends_on_all = false; + break; + } + } + if (depends_on_all) { + merge_candidate = producer; + break; + } + } + if (!merge_candidate) { + // At least one producer does not depend on others. + // Cannot merge. + seen_nodes.push_back(boundary_node); + MODEL_GRAPH_DEBUG(" multiple producers"); + continue; + } + } else { + // This node has only one producer. + merge_candidate = *(boundary_node->producers.begin()); + } + if (merge_candidate->consumers.size() == 0) { + ERR(ModelError, "unexpected error: graph is incomplete"); + } + if (merge_candidate->consumers.size() > 1) { + // The candidate has multiple consumers. We can merge only if all + // other consumers depend on the current boundary_node. + bool depends_on_one = true; + for (auto &consumer : merge_candidate->consumers) { + if (consumer == boundary_node) { + continue; + } + if (!this->depends_on(consumer, boundary_node)) { + depends_on_one = false; + break; + } + } + if (!depends_on_one) { + // At least one consumer does not depend on the boundary_node. + // Cannot merge. + seen_nodes.push_back(boundary_node); + MODEL_GRAPH_DEBUG(" multiple consumers"); + continue; + } + } + // We can merge the two nodes. + // Merge `boundary_node` into `merge_candidate`. 
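+        // The candidate's ops stay first in the merged execution order;
+        // boundary_node's ops are appended behind them before the edges
+        // are rewired below.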
+ MODEL_GRAPH_DEBUG(" merge: ", to_json(merge_candidate).dump(), " -> ", + to_json(boundary_node).dump()); + auto &ops = boundary_node->ops; + merge_candidate->ops.insert(merge_candidate->ops.end(), ops.begin(), + ops.end()); + for (auto &op : ops) { + op_to_node_[op] = merge_candidate; + } + for (auto &consumer : boundary_node->consumers) { + consumer->producers.erase(boundary_node); + consumer->producers.push_back(merge_candidate); + merge_candidate->consumers.push_back(consumer); + } + for (auto &producer : boundary_node->producers) { + if (producer == merge_candidate) { + continue; + } + producer->consumers.erase(boundary_node); + producer->consumers.push_back(merge_candidate); + merge_candidate->producers.push_back(producer); + } + merge_candidate->consumers.erase(boundary_node); + + // Remove `boundary_node` from `nodes_`. + auto it = nodes_.find(boundary_node); + if (it == nodes_.end()) { + ERR(ModelError, "unexpected error"); + } + nodes_.erase(it); + + // Since producer is already in the next boundary and boundary_node is + // merged into producer, we don't need to add anything to + // seen_nodes here. + } + this->recursive_merge_nodes(seen_nodes, new_boundary_nodes); +} + +nlohmann::ordered_json ModelGraph::Impl::to_json( + const ModelNodeRef &node) const { + nlohmann::ordered_json j; + j["Id"] = nodes_.index(node); + j["ProducerNodeIds"] = nlohmann::json::array(); + for (auto producer : node->producers) { + j["ProducerNodeIds"].emplace_back(nodes_.index(producer)); + } + j["ConsumerNodeIds"] = nlohmann::json::array(); + for (auto consumer : node->consumers) { + j["ConsumerNodeIds"].emplace_back(nodes_.index(consumer)); + } + j["Ops"] = nlohmann::json::array(); + for (auto op : node->ops) { + j["Ops"].emplace_back(op->serialize()); + } + return j; +} + +std::string ModelGraph::Impl::serialize(int indent) const { + nlohmann::ordered_json j; + j["Nodes"] = nlohmann::json::array(); + for (const auto &node : nodes_) { + j["Nodes"].emplace_back(this->to_json(node)); + } + j["Tensors"] = nlohmann::json::array(); + for (const auto &tensor_and_op : tensor_to_producer_op_) { + j["Tensors"].emplace_back(tensor_and_op.first->serialize()); + } + return j.dump(indent); +} + +std::vector ModelGraph::Impl::nodes() const { + std::vector vec; + vec.insert(vec.end(), nodes_.begin(), nodes_.end()); + return vec; +} + +ModelGraph::ModelGraph() : impl_(std::make_unique()) {} + +ModelGraph::ModelGraph(const ModelGraph &other) + : impl_(std::make_unique(*other.impl_)) {} + +ModelGraph::~ModelGraph() = default; + +ModelGraph &ModelGraph::operator=(const ModelGraph &other) { + *impl_ = *other.impl_; + return *this; +} + +ModelNodeRef ModelGraph::break_node(ModelNodeRef node, size_t op_idx) { + return impl_->break_node(node, op_idx); +} + +/// Get the list of @ref ModelNode in the graph. +std::vector ModelGraph::nodes() const { return impl_->nodes(); } + +std::string ModelGraph::serialize(int indent) const { + return impl_->serialize(indent); +} + +void ModelGraph::compress_nodes() { impl_->compress_nodes(); } + +bool ModelGraph::verify() const { return impl_->verify(); } + +} // namespace ark diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp new file mode 100644 index 000000000..0f557ce39 --- /dev/null +++ b/ark/model/model_graph_impl.hpp @@ -0,0 +1,92 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
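+//
+// Internal implementation of ModelGraph. A minimal usage sketch of the
+// public interface built on top of this class (illustrative only, mirroring
+// the cases in model_test.cpp):
+//
+//   ark::Model m;
+//   auto t0 = m.tensor({1}, ark::FP32);
+//   auto t1 = m.tensor({1}, ark::FP32);
+//   auto t2 = m.add(t0, t1);
+//   auto compressed = m.compress();  // drops virtual nodes, merges chains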
+ +#ifndef ARK_MODEL_GRAPH_IMPL_HPP_ +#define ARK_MODEL_GRAPH_IMPL_HPP_ + +#include +#include +#include +#include + +#include "ark/dims.hpp" +#include "ark/model_graph.hpp" +#include "model_op.hpp" +#include "nlohmann/json.hpp" +#include "unique_list.hpp" + +namespace ark { + +class ModelGraph::Impl { + public: + Impl(){}; + + Impl(const Impl &other); + + Impl &operator=(const Impl &other); + + template + ModelOpRef create_op(const std::string &name, Args &&...args) { + ModelOpRef op = std::make_shared(std::forward(args)...); + std::string name_copy; + if (name.empty()) { + name_copy = op->type()->type_name(); + } else { + name_copy = name; + } + size_t count = op_names_.count(name_copy); + if (count > 0) { + name_copy += "_" + std::to_string(count); + } + op_names_.insert(name_copy); + op->set_name(name_copy); + add_op(op); + return op; + } + + ModelNodeRef break_node(ModelNodeRef node, size_t op_idx); + + void compress_nodes(); + + bool verify() const; + + std::string serialize(int indent) const; + + std::vector nodes() const; + + private: + ModelNodeRef add_op(ModelOpRef op); + + void remove_node(ModelNodeRef node); + + bool depends_on(ModelNodeRef node1, ModelNodeRef node2) const; + + void recursive_remove_virtual_nodes(); + + void recursive_remove_virtual_nodes( + UniqueList &seen_nodes, + const std::vector &boundary_nodes); + + void recursive_merge_nodes(); + + void recursive_merge_nodes(UniqueList &seen_nodes, + const std::vector &boundary_nodes); + + nlohmann::ordered_json to_json(const ModelNodeRef &node) const; + + /// The list of @ref ModelNode in the graph. + UniqueList nodes_; + + /// The set of used names of @ref ModelOp. + std::multiset op_names_; + + /// The mapping from @ref ModelTensor to the @ref ModelOp that produces it. + std::map tensor_to_producer_op_; + + /// The mapping from @ref ModelOp to the @ref ModelNode that contains it. + std::map op_to_node_; +}; + +} // namespace ark + +#endif // ARK_MODEL_GRAPH_IMPL_HPP_ diff --git a/ark/model/model_node.cpp b/ark/model/model_node.cpp new file mode 100644 index 000000000..ef3a8158a --- /dev/null +++ b/ark/model/model_node.cpp @@ -0,0 +1,6 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "model_node.hpp" + +namespace ark {} // namespace ark diff --git a/ark/model/model_node.hpp b/ark/model/model_node.hpp new file mode 100644 index 000000000..7838ca120 --- /dev/null +++ b/ark/model/model_node.hpp @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_MODEL_NODE_HPP_ +#define ARK_MODEL_NODE_HPP_ + +#include +#include + +#include "ark/model_ref.hpp" +#include "unique_list.hpp" + +namespace ark { + +/// A node of @ref Model. +class ModelNode { + public: + ModelNode() = default; + + /// The list of @ref Op that this @ref ModelNode contains. Sorted in the + /// execution order. + std::vector ops; + + /// The list of @ref ModelNode that depends on this @ref ModelNode. + UniqueList consumers; + + /// The list of @ref ModelNode that this @ref ModelNode depends on. + UniqueList producers; +}; + +} // namespace ark + +#endif // ARK_MODEL_NODE_HPP_ diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp new file mode 100644 index 000000000..c34ddadd5 --- /dev/null +++ b/ark/model/model_op.cpp @@ -0,0 +1,187 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
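+//
+// Op-type registry: ModelOpT::from_name() lazily registers every concrete
+// op type with the global ModelOpFactory, so deserialize() can reconstruct
+// an op from the "Type" string stored in its serialized JSON.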
+ +#include "model_op.hpp" + +#include +#include + +#include "logging.h" +#include "model_tensor.hpp" +#include "ops/ops_arithmetic.hpp" +#include "ops/ops_math.hpp" +#include "ops/ops_matmul.hpp" +#include "ops/ops_refer.hpp" +#include "ops/ops_scale.hpp" +#include "ops/ops_sendrecv.hpp" +#include "ops/ops_tensor.hpp" + +namespace ark { + +std::shared_ptr model_op_factory() { + static auto factory = std::make_shared(); + return factory; +} + +#define MODEL_OP_TYPE_REGISTER(_name) \ + instances[#_name] = std::make_shared(#_name); \ + model_op_factory()->register_op(#_name); + +const ModelOpType ModelOpT::from_name(const std::string &type_name) { + static std::unordered_map instances; + if (instances.empty()) { + MODEL_OP_TYPE_REGISTER(Add); + MODEL_OP_TYPE_REGISTER(Div); + MODEL_OP_TYPE_REGISTER(Exp); + MODEL_OP_TYPE_REGISTER(Matmul); + MODEL_OP_TYPE_REGISTER(Mul); + MODEL_OP_TYPE_REGISTER(Recv); + MODEL_OP_TYPE_REGISTER(Relu); + MODEL_OP_TYPE_REGISTER(Scale); + MODEL_OP_TYPE_REGISTER(Send); + MODEL_OP_TYPE_REGISTER(SendDone); + MODEL_OP_TYPE_REGISTER(Sub); + MODEL_OP_TYPE_REGISTER(Tensor); + } + auto it = instances.find(type_name); + if (it == instances.end()) { + ERR(InvalidUsageError, "Unknown model op type: ", type_name); + } + return it->second; +} + +std::vector ModelOp::input_tensors() const { + // input_tensors = read_tensors || write_tensors + std::set input_tensors; + input_tensors.insert(read_tensors_.begin(), read_tensors_.end()); + input_tensors.insert(write_tensors_.begin(), write_tensors_.end()); + std::vector input_tensors_vec(input_tensors.begin(), + input_tensors.end()); + return input_tensors_vec; +} + +void ModelOp::verify() const { + std::set inputs; + inputs.insert(read_tensors_.begin(), read_tensors_.end()); + inputs.insert(write_tensors_.begin(), write_tensors_.end()); + + for (auto &input : inputs) { + if (input->buffer() == nullptr) { + ERR(InvalidUsageError, "input tensor buffer is null"); + } + } + + std::set outputs; + outputs.insert(result_tensors_.begin(), result_tensors_.end()); + + for (auto &output : outputs) { + if (output->buffer() == nullptr) { + ERR(InvalidUsageError, "output tensor buffer is null"); + } + } + + std::set intersect; + std::set_intersection(inputs.begin(), inputs.end(), outputs.begin(), + outputs.end(), + std::inserter(intersect, intersect.begin())); + if (!intersect.empty()) { + ERR(InvalidUsageError, "cyclic dependency detected"); + } +} + +std::string ModelOp::vec_string(const Dims &dims) { + if (dims.is_invalid()) { + ERR(InvalidUsageError, "invalid dims given"); + } + int ndims = dims.ndims(); + std::stringstream ss; + ss << "Vec<"; + if (ndims > 0) { + ss << dims[0]; + for (int i = 1; i < ndims; ++i) { + ss << ", " << dims[i]; + } + } + ss << '>'; + return ss.str(); +} + +std::string ModelOp::function_name_string( + const std::string &kernel_name, + const std::vector &template_args) { + std::stringstream ss; + ss << kernel_name; + if (!template_args.empty()) { + ss << "<" << template_args[0]; + for (size_t i = 1; i < template_args.size(); i++) { + ss << ", " << template_args[i]; + } + ss << ">"; + } + return ss.str(); +} + +nlohmann::ordered_json ModelOp::serialize() const { + nlohmann::ordered_json j; + j["Type"] = type_->type_name(); + j["Name"] = name_; + j["IsVirtual"] = is_virtual_; + j["ReadTensors"] = nlohmann::ordered_json::array(); + for (auto &t : read_tensors_) { + j["ReadTensors"].push_back(t->serialize()); + } + j["WriteTensors"] = nlohmann::ordered_json::array(); + for (auto &t : write_tensors_) { + 
j["WriteTensors"].push_back(t->serialize()); + } + j["ResultTensors"] = nlohmann::ordered_json::array(); + for (auto &t : result_tensors_) { + j["ResultTensors"].push_back(t->serialize()); + } + j["Args"] = nlohmann::ordered_json::object(); + for (auto &arg : args_) { + j["Args"][arg.first] = arg.second.serialize(); + } + return j; +} + +std::shared_ptr ModelOp::deserialize( + const nlohmann::json &serialized) { + if (!serialized.contains("Type")) { + ERR(InvalidUsageError, "ModelOp deserialization failed: missing Type"); + } else if (!serialized.contains("Name")) { + ERR(InvalidUsageError, "ModelOp deserialization failed: missing Name"); + } else if (!serialized.contains("IsVirtual")) { + ERR(InvalidUsageError, + "ModelOp deserialization failed: missing IsVirtual"); + } else if (!serialized.contains("ReadTensors")) { + ERR(InvalidUsageError, + "ModelOp deserialization failed: missing ReadTensors"); + } else if (!serialized.contains("WriteTensors")) { + ERR(InvalidUsageError, + "ModelOp deserialization failed: missing WriteTensors"); + } else if (!serialized.contains("ResultTensors")) { + ERR(InvalidUsageError, + "ModelOp deserialization failed: missing ResultTensors"); + } else if (!serialized.contains("Args")) { + ERR(InvalidUsageError, "ModelOp deserialization failed: missing Args"); + } + auto ret = model_op_factory()->construct(serialized["Type"]); + ret->name_ = serialized["Name"]; + ret->is_virtual_ = serialized["IsVirtual"]; + for (const auto &t : serialized["ReadTensors"]) { + ret->read_tensors_.push_back(ModelTensor::deserialize(t)); + } + for (const auto &t : serialized["WriteTensors"]) { + ret->write_tensors_.push_back(ModelTensor::deserialize(t)); + } + for (const auto &t : serialized["ResultTensors"]) { + ret->result_tensors_.push_back(ModelTensor::deserialize(t)); + } + for (const auto &arg : serialized["Args"].items()) { + ret->args_[arg.key()] = ModelOpArg::deserialize(arg.value()); + } + return ret; +} + +} // namespace ark diff --git a/ark/model/model_op.hpp b/ark/model/model_op.hpp new file mode 100644 index 000000000..e6237361a --- /dev/null +++ b/ark/model/model_op.hpp @@ -0,0 +1,140 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#ifndef ARK_MODEL_OP_HPP_ +#define ARK_MODEL_OP_HPP_ + +#include +#include +#include +#include +#include +#include + +#include "ark/model_ref.hpp" +#include "logging.h" +#include "model_op_arg.hpp" +#include "model_op_config.hpp" +#include "nlohmann/json.hpp" + +namespace ark { + +class ModelGraph; + +class ModelOpT; +using ModelOpType = std::shared_ptr; + +class ModelOp; + +class ModelOpT : public NamedT { + public: + ModelOpT(const std::string &type_name) : NamedT(type_name) {} + + ModelOpT(const ModelOpT &) = default; + + static const ModelOpType from_name(const std::string &type_name); +}; + +class ModelOp { + public: + ModelOp() = default; + + ModelOp(const std::string &type_name, bool is_virtual = false) + : type_(ModelOpT::from_name(type_name)), is_virtual_(is_virtual) {} + + ModelOp(const ModelOp &) = default; + + virtual std::string impl_name( + [[maybe_unused]] const nlohmann::json &config) const { + return ""; + } + + virtual std::vector impl_args( + [[maybe_unused]] const nlohmann::json &config) const { + return {}; + } + + void set_name(const std::string &name) { name_ = name; } + + ModelOpType type() const { return type_; } + + const std::string &name() const { return name_; } + + bool is_virtual() const { return is_virtual_; } + + const std::vector &read_tensors() const { + return read_tensors_; + } + + const std::vector &write_tensors() const { + return write_tensors_; + } + + const std::vector &result_tensors() const { + return result_tensors_; + } + + const std::map &args() const { return args_; } + + std::vector input_tensors() const; + + void verify() const; + + nlohmann::ordered_json serialize() const; + + static std::shared_ptr deserialize( + const nlohmann::json &serialized); + + protected: + friend class ModelGraph; + + static std::string vec_string(const Dims &dims); + + static std::string function_name_string( + const std::string &kernel_name, + const std::vector &template_args); + + ModelOpType type_; + std::string name_; + bool is_virtual_; + std::vector read_tensors_; + std::vector write_tensors_; + std::vector result_tensors_; + std::map args_; +}; + +class ModelOpFactory { + private: + std::unordered_map()>> + constructors_; + + public: + ModelOpFactory() = default; + + template + void register_op(const std::string &class_name) { + if (constructors_.find(class_name) != constructors_.end()) { + ERR(InvalidUsageError, "Class already registered: ", class_name); + } + constructors_[class_name] = []() { + return std::shared_ptr(new DerivedModelOp()); + }; + } + + std::shared_ptr construct(const std::string &class_name) const { + auto it = constructors_.find(class_name); + if (it == constructors_.end()) { + ERR(InvalidUsageError, + "Tried to construct an unknown class: ", class_name); + } + return it->second(); + } + + bool empty() const { return constructors_.empty(); } +}; + +std::shared_ptr model_op_factory(); + +} // namespace ark + +#endif // ARK_MODEL_OP_HPP_ diff --git a/ark/model/model_op_arg.cpp b/ark/model/model_op_arg.cpp new file mode 100644 index 000000000..29b76d07c --- /dev/null +++ b/ark/model/model_op_arg.cpp @@ -0,0 +1,60 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
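+//
+// Each argument serializes to a single-entry JSON object, { "<TYPE>": value },
+// where <TYPE> is the name registered via REGISTER_MODEL_OP_ARG_TYPE.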
+ +#include "model_op_arg.hpp" + +#include "logging.h" +#include "model_tensor.hpp" + +namespace ark { + +ModelOpArg::ModelOpArg() : NamedT("") {} + +nlohmann::ordered_json ModelOpArg::serialize() const { + const std::string &type_name = this->type_name(); + nlohmann::ordered_json j; + if (type_name == "TENSOR") { + j[type_name] = this->value()->serialize(); + } else if (type_name == "DIMS") { + j[type_name] = this->value().vector(); + } else if (type_name == "INT") { + j[type_name] = this->value(); + } else if (type_name == "INT64") { + j[type_name] = this->value(); + } else if (type_name == "UINT64") { + j[type_name] = this->value(); + } else if (type_name == "BOOL") { + j[type_name] = this->value(); + } else if (type_name == "FLOAT") { + j[type_name] = this->value(); + } else { + ERR(InvalidUsageError, + "Tried to serialize an unknown type of argument: ", type_name); + } + return j; +} + +ModelOpArg ModelOpArg::deserialize(const nlohmann::json &serialized) { + const std::string &type_name = serialized[0]; + auto &value = serialized[1]; + if (type_name == "TENSOR") { + return ModelOpArg(ModelTensor::deserialize(value)); + } else if (type_name == "DIMS") { + return ModelOpArg(Dims(value.get>())); + } else if (type_name == "INT") { + return ModelOpArg(value.get()); + } else if (type_name == "INT64") { + return ModelOpArg(value.get()); + } else if (type_name == "UINT64") { + return ModelOpArg(value.get()); + } else if (type_name == "BOOL") { + return ModelOpArg(value.get()); + } else if (type_name == "FLOAT") { + return ModelOpArg(value.get()); + } + ERR(InvalidUsageError, + "Tried to deserialize an unknown type of argument: ", type_name); + return ModelOpArg(); +} + +} // namespace ark diff --git a/ark/model/model_op_arg.hpp b/ark/model/model_op_arg.hpp new file mode 100644 index 000000000..faf628ceb --- /dev/null +++ b/ark/model/model_op_arg.hpp @@ -0,0 +1,66 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+
+#ifndef ARK_MODEL_OP_ARG_HPP_
+#define ARK_MODEL_OP_ARG_HPP_
+
+#include <any>
+#include <string>
+#include <vector>
+
+#include "ark/dims.hpp"
+#include "ark/model_ref.hpp"
+#include "named_type.hpp"
+#include "nlohmann/json.hpp"
+
+namespace ark {
+
+template <typename T>
+class ModelOpArgTName;
+
+#define REGISTER_MODEL_OP_ARG_TYPE(_name, _type)              \
+    template <>                                               \
+    class ModelOpArgTName<_type> {                            \
+       public:                                                \
+        ModelOpArgTName() : name(#_name), type_str(#_type){}; \
+        const std::string name;                               \
+        const std::string type_str;                           \
+    };
+
+class ModelOpArg : public NamedT {
+   public:
+    ModelOpArg();
+
+    template <typename T>
+    ModelOpArg(T val)
+        : NamedT(ModelOpArgTName<T>().name),
+          type_str_(ModelOpArgTName<T>().type_str),
+          val_(val) {}
+
+    template <typename T>
+    T value() const {
+        return std::any_cast<T>(val_);
+    }
+
+    const std::string &type_str() const { return type_str_; }
+
+    nlohmann::ordered_json serialize() const;
+
+    static ModelOpArg deserialize(const nlohmann::json &serialized);
+
+   private:
+    std::string type_str_;
+    std::any val_;
+};
+
+REGISTER_MODEL_OP_ARG_TYPE(INT, int)
+REGISTER_MODEL_OP_ARG_TYPE(INT64, int64_t)
+REGISTER_MODEL_OP_ARG_TYPE(UINT64, uint64_t)
+REGISTER_MODEL_OP_ARG_TYPE(BOOL, bool)
+REGISTER_MODEL_OP_ARG_TYPE(FLOAT, float)
+REGISTER_MODEL_OP_ARG_TYPE(DIMS, Dims)
+REGISTER_MODEL_OP_ARG_TYPE(TENSOR, ModelTensorRef)
+
+} // namespace ark
+
+#endif // ARK_MODEL_OP_ARG_HPP_
diff --git a/ark/model/model_op_config.cpp b/ark/model/model_op_config.cpp
new file mode 100644
index 000000000..56cd7395f
--- /dev/null
+++ b/ark/model/model_op_config.cpp
@@ -0,0 +1,69 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "model_op_config.hpp"
+
+namespace ark {
+
+ModelOpConfigArchT::ModelOpConfigArchT() : NamedT("ANY"){};
+
+ModelOpConfigArchT::ModelOpConfigArchT(const std::string &c0)
+    : NamedT(c0), category_({c0}) {}
+
+ModelOpConfigArchT::ModelOpConfigArchT(const std::string &c0,
+                                       const std::string &c1)
+    : NamedT(c0 + "_" + c1), category_({c0, c1}) {}
+
+ModelOpConfigArchT::ModelOpConfigArchT(const std::string &c0,
+                                       const std::string &c1,
+                                       const std::string &c2)
+    : NamedT(c0 + "_" + c1 + "_" + c2), category_({c0, c1, c2}) {}
+
+bool ModelOpConfigArchT::belongs_to(
+    const std::shared_ptr<ModelOpConfigArchT> arch) const {
+    if (category_.size() <= arch->category().size()) {
+        return false;
+    }
+    size_t idx = 0;
+    for (const auto &name : arch->category()) {
+        if (category_[idx++] != name) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool ModelOpConfigArchT::later_than(
+    const std::shared_ptr<ModelOpConfigArchT> arch) const {
+    if (category_.size() != arch->category().size()) {
+        return false;
+    }
+    size_t idx = 0;
+    for (const auto &name : arch->category()) {
+        if (category_[idx] != name) {
+            return category_[idx] > name;
+        }
+        // Advance past matching category levels so the first differing
+        // level decides the comparison.
+        idx++;
+    }
+    return true;
+}
+
+extern const ModelOpConfigArchType ARCH_ANY =
+    std::make_shared<ModelOpConfigArchT>();
+
+extern const ModelOpConfigArchType ARCH_CUDA =
+    std::make_shared<ModelOpConfigArchT>("CUDA");
+extern const ModelOpConfigArchType ARCH_CUDA_70 =
+    std::make_shared<ModelOpConfigArchT>("CUDA", "70");
+extern const ModelOpConfigArchType ARCH_CUDA_80 =
+    std::make_shared<ModelOpConfigArchT>("CUDA", "80");
+extern const ModelOpConfigArchType ARCH_CUDA_90 =
+    std::make_shared<ModelOpConfigArchT>("CUDA", "90");
+
+extern const ModelOpConfigArchType ARCH_ROCM =
+    std::make_shared<ModelOpConfigArchT>("ROCM");
+extern const ModelOpConfigArchType ARCH_ROCM_90A =
+    std::make_shared<ModelOpConfigArchT>("ROCM", "90A");
+extern const ModelOpConfigArchType ARCH_ROCM_942 =
+    std::make_shared<ModelOpConfigArchT>("ROCM", "942");
+
+} // namespace ark
diff --git a/ark/model/model_op_config.hpp b/ark/model/model_op_config.hpp new file mode 100644 index
000000000..11e1a17d7 --- /dev/null +++ b/ark/model/model_op_config.hpp @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_MODEL_OP_CONFIG_HPP_ +#define ARK_MODEL_OP_CONFIG_HPP_ + +#include +#include +#include + +#include "named_type.hpp" + +namespace ark { + +class ModelOpConfigArchT : public NamedT { + public: + ModelOpConfigArchT(); + + ModelOpConfigArchT(const std::string &c0); + + ModelOpConfigArchT(const std::string &c0, const std::string &c1); + + ModelOpConfigArchT(const std::string &c0, const std::string &c1, + const std::string &c2); + + ModelOpConfigArchT(const ModelOpConfigArchT &) = default; + + const std::vector &category() const { return category_; } + + bool belongs_to(const std::shared_ptr arch) const; + + bool later_than(const std::shared_ptr arch) const; + + private: + std::vector category_; +}; + +using ModelOpConfigArchType = std::shared_ptr; + +extern const ModelOpConfigArchType ARCH_ANY; + +extern const ModelOpConfigArchType ARCH_CUDA; +extern const ModelOpConfigArchType ARCH_CUDA_70; +extern const ModelOpConfigArchType ARCH_CUDA_80; +extern const ModelOpConfigArchType ARCH_CUDA_90; + +extern const ModelOpConfigArchType ARCH_ROCM; +extern const ModelOpConfigArchType ARCH_ROCM_90A; +extern const ModelOpConfigArchType ARCH_ROCM_942; + +class ModelOpConfig { + public: + ModelOpConfig(const ModelOpConfigArchType arch, const std::string &name, + const std::string &impl_name) + : arch_(arch), name_(name), impl_name_(impl_name) {} + + ModelOpConfig(const ModelOpConfig &) = default; + + const ModelOpConfigArchType arch() const { return arch_; } + + const std::string &name() const { return name_; } + + const std::string &impl_name() const { return impl_name_; } + + private: + ModelOpConfigArchType arch_; + std::string name_; + std::string impl_name_; +}; + +} // namespace ark + +#endif // ARK_MODEL_OP_CONFIG_HPP_ diff --git a/ark/model/model_tensor.cpp b/ark/model/model_tensor.cpp new file mode 100644 index 000000000..04e575ffd --- /dev/null +++ b/ark/model/model_tensor.cpp @@ -0,0 +1,176 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "model_tensor.hpp" + +#include "logging.h" + +namespace ark { + +ModelBuffer::ModelBuffer() { + static size_t id = 0; + id_ = id++; +} + +ModelBuffer::ModelBuffer(size_t id) { id_ = id; } + +ModelTensor::ModelTensor(ModelDataType data_type, ModelBufferRef buffer, + const Dims &shape, const Dims &strides, + const Dims &offsets, const Dims &pads, bool exported, + int imported_rank) + : data_type_(data_type), + buffer_(buffer), + exported_(exported), + imported_rank_(imported_rank) { + if (shape.size() == 0) { + ERR(InvalidUsageError, + "Tensor shape should consist of positive numbers. Given: ", shape); + } else if (shape.is_no_dim()) { + // Assume a single-element constant + shape_ = {1}; + } else { + shape_ = shape; + } + int ndims = shape_.ndims(); + if (strides.is_no_dim()) { + strides_ = shape_; + } else { + if (ndims != strides.ndims()) { + ERR(InvalidUsageError, + "Tensor shapes and strides should have the same number of " + "dimensions. Given: shape ", + shape_, " strides ", strides); + } + strides_ = strides; + } + if (offsets.is_no_dim()) { + std::vector dims_vec; + for (int i = 0; i < ndims; ++i) { + dims_vec.push_back(0); + } + offsets_ = Dims{dims_vec}; + } else { + if (ndims != offsets.ndims()) { + ERR(InvalidUsageError, + "Tensor shape and offs should have the same number of " + "dimensions. 
Given: shape ", + shape_, " offs ", offsets); + } + offsets_ = offsets; + } + if (pads.is_no_dim()) { + std::vector dims_vec; + for (int i = 0; i < ndims; ++i) { + dims_vec.push_back(1); + } + pads_ = Dims{dims_vec}; + } else { + if (ndims != pads.ndims()) { + ERR(InvalidUsageError, + "Tensor shape and pads should have the same number of " + "dimensions. Given: shape ", + shape_, " pads ", pads); + } + pads_ = pads; + } + for (int i = 0; i < ndims; ++i) { + if (strides_[i] % pads_[i] != 0) { + ERR(InvalidUsageError, + "Tensor strides should be a multiple of pads. strides ", + strides_, " pads ", pads_); + } + } + for (int i = 0; i < ndims; ++i) { + if (offsets_[i] + shape_[i] > strides_[i]) { + ERR(InvalidUsageError, "Tensor exceeds the memory boundary. offs ", + offsets_, " shape ", shape_, " strides ", strides_); + } + } + id_ = next_id(); +} + +ModelTensor::ModelTensor(const ModelTensor &other) { + id_ = next_id(); + data_type_ = other.data_type_; + buffer_ = other.buffer_; + shape_ = other.shape_; + strides_ = other.strides_; + offsets_ = other.offsets_; + pads_ = other.pads_; + exported_ = other.exported_; + imported_rank_ = other.imported_rank_; +} + +bool ModelTensor::is_sequential() const { + // Shape and strides should be the same except for the first dimension. + for (int i = 1; i < shape_.ndims(); ++i) { + if (shape_[i] != strides_[i]) { + return false; + } + } + return true; +} + +nlohmann::ordered_json ModelTensor::serialize() const { + nlohmann::ordered_json j; + j["Id"] = id_; + j["DataType"] = data_type_->type_name(); + j["BufferId"] = buffer_->id(); + j["Shape"] = shape_.vector(); + j["Strides"] = strides_.vector(); + j["Offsets"] = offsets_.vector(); + j["Pads"] = pads_.vector(); + j["Exported"] = exported_; + j["ImportedRank"] = imported_rank_; + return j; +} + +std::shared_ptr ModelTensor::deserialize( + const nlohmann::json &serialized) { + if (!serialized.contains("DataType")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing DataType"); + } else if (!serialized.contains("BufferId")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing BufferId"); + } else if (!serialized.contains("Shape")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing Shape"); + } else if (!serialized.contains("Strides")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing Strides"); + } else if (!serialized.contains("Offsets")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing Offsets"); + } else if (!serialized.contains("Pads")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing Pads"); + } else if (!serialized.contains("Exported")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing Exported"); + } else if (!serialized.contains("ImportedRank")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing ImportedRank"); + } else if (!serialized.contains("Id")) { + ERR(InvalidUsageError, + "ModelTensor deserialization failed: missing Id"); + } + auto ret = std::make_shared( + ModelDataT::from_name(serialized["DataType"]), + std::make_shared(serialized["BufferId"]), + serialized["Shape"].get>(), + serialized["Strides"].get>(), + serialized["Offsets"].get>(), + serialized["Pads"].get>(), + serialized["Exported"].get(), + serialized["ImportedRank"].get()); + ret->id_ = serialized["Id"]; + return ret; +} + +size_t ModelTensor::next_id() { + static size_t id = 0; + return id++; +} + +} // namespace ark diff --git 
a/ark/model/model_tensor.hpp b/ark/model/model_tensor.hpp new file mode 100644 index 000000000..e0db15165 --- /dev/null +++ b/ark/model/model_tensor.hpp @@ -0,0 +1,92 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_MODEL_TENSOR_HPP_ +#define ARK_MODEL_TENSOR_HPP_ + +#include "ark/dims.hpp" +#include "ark/model_ref.hpp" +#include "model_data_type.hpp" +#include "nlohmann/json.hpp" + +namespace ark { + +class ModelBuffer { + public: + ModelBuffer(); + + ModelBuffer(size_t id); + + size_t id() const { return id_; } + + private: + size_t id_; +}; + +/// Tensor is a view of a TensorBuf. +/// +/// Illustration of a single axis of a tensor: +/// +/// 0 off stride +/// |------------|-------------shape-------------|---------------------------| +/// ^ <-----------------------------> ^ +/// | data range of this tensor | +/// +------------------------------------------+-----------+ +/// | +/// We call these "padding". +/// +class ModelTensor { + public: + ModelTensor(ModelDataType data_type, ModelBufferRef buffer, + const Dims &shape, const Dims &strides = {}, + const Dims &offsets = {}, const Dims &pads = {}, + bool exported = false, int imported_rank = -1); + + ModelTensor(const ModelTensor &other); + + size_t id() const { return id_; } + + ModelDataType data_type() const { return data_type_; } + + const ModelBufferRef buffer() const { return buffer_; } + + const Dims &shape() const { return shape_; } + + const Dims &strides() const { return strides_; } + + const Dims &offsets() const { return offsets_; } + + const Dims &pads() const { return pads_; } + + bool exported() const { return exported_; } + + int imported_rank() const { return imported_rank_; } + + bool is_sequential() const; + + void set_exported() { exported_ = true; } + + void set_imported_rank(int rank) { imported_rank_ = rank; } + + nlohmann::ordered_json serialize() const; + + static std::shared_ptr deserialize( + const nlohmann::json &serialized); + + private: + static size_t next_id(); + + size_t id_; + ModelDataType data_type_; + ModelBufferRef buffer_; + Dims shape_; + Dims strides_; + Dims offsets_; + Dims pads_; + bool exported_; + int imported_rank_; +}; + +} // namespace ark + +#endif // ARK_MODEL_TENSOR_HPP_ diff --git a/ark/model/model_test.cpp b/ark/model/model_test.cpp new file mode 100644 index 000000000..bebdd5cec --- /dev/null +++ b/ark/model/model_test.cpp @@ -0,0 +1,443 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/model.hpp" + +#include + +#include "logging.h" +#include "model_node.hpp" +#include "model_op.hpp" +#include "unittest/unittest_utils.h" + +ark::unittest::State test_model_basics() { + ark::Model model; + ark::Model compressed; + + // Basic Test. 
+ // Model graph: + // + // TensorOp --> t0 --+--> AddOp --> t2 + // | + // TensorOp --> t1 --+ + // | + // TensorOp --> tx --+ (tx is the output reference, hidden from the code) + // + + ark::ModelTensorRef t0 = model.tensor({1}, ark::FP32); + ark::ModelTensorRef t1 = model.tensor({1}, ark::FP32); + ark::ModelTensorRef t2 = model.add(t0, t1); + + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + UNITTEST_EQ(compressed.nodes().size(), 1); + + auto node = compressed.nodes().front(); + UNITTEST_EQ(node->ops.size(), 1); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[0]->read_tensors()[0], t0); + UNITTEST_EQ(node->ops[0]->read_tensors()[1], t1); + UNITTEST_EQ(node->consumers.size(), 0); + UNITTEST_EQ(node->producers.size(), 0); + + // Test a chain of Ops that share an input tensor. + // Model graph: + // + // TensorOp --> t0 --+--> AddOp --> t2 ------+--> AddOp --> t3 + // | | + // TensorOp --> t1 --+-----------------------+ + // | | + // TensorOp --> tx --+ TensorOp --> ty --+ + // + // (tx and ty are output references, hidden from the code) + // + + ark::ModelTensorRef t3 = model.add(t2, t1); + + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,AddOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + UNITTEST_EQ(compressed.nodes().size(), 1); + + node = compressed.nodes().front(); + + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[0]->read_tensors()[0], t0); + UNITTEST_EQ(node->ops[0]->read_tensors()[1], t1); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], t3); + UNITTEST_EQ(node->ops[1]->read_tensors()[0], t2); + UNITTEST_EQ(node->ops[1]->read_tensors()[1], t1); + UNITTEST_EQ(node->consumers.size(), 0); + UNITTEST_EQ(node->producers.size(), 0); + + // Test a chain of Ops without shared input tensors. + // Model graph (omit leftmost part): + // + // ... ----+--> AddOp --> t3 ----+-> ReluOp --> t4 + // ... | | + // ... ----+ TensorOp --> tz --+ + // ... | + // ... --+ (tz is the output reference, hidden from the code) + // + + ark::ModelTensorRef t4 = model.relu(t3); + + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,AddOp,ReluOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + UNITTEST_EQ(compressed.nodes().size(), 1); + + node = compressed.nodes().front(); + + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[0]->read_tensors()[0], t0); + UNITTEST_EQ(node->ops[0]->read_tensors()[1], t1); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], t3); + UNITTEST_EQ(node->ops[1]->read_tensors()[0], t2); + UNITTEST_EQ(node->ops[1]->read_tensors()[1], t1); + UNITTEST_EQ(node->ops[2]->result_tensors()[0], t4); + UNITTEST_EQ(node->ops[2]->read_tensors()[0], t3); + UNITTEST_EQ(node->consumers.size(), 0); + UNITTEST_EQ(node->producers.size(), 0); + + // Test a chain of Ops that use the output from the same previous Op. + // Model graph (omit leftmost part): + // + // ... +---- (this is t2) -------------------------+--> AddOp --> t5 + // ... | | + // ... --+-+--> AddOp --> t3 ----+-> ReluOp --> t4 --+ + // ... | | | + // ... ----+ TensorOp --> tz --+ | + // ... | TensorOp --> tw --+ + // ... 
--+ + // + // (tz and tw are output references, hidden from the code) + // + + ark::ModelTensorRef t5 = model.add(t2, t4); + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,AddOp,ReluOp,AddOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + + auto nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 1); + + auto nodes_iter = nodes.begin(); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add;add_1;relu;add_2;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], t3); + UNITTEST_EQ(node->ops[2]->result_tensors()[0], t4); + UNITTEST_EQ(node->ops[3]->result_tensors()[0], t5); + + // Test an Op that uses outputs from multiple previous Ops. + // Model graph (omit leftmost part): + // + // ... ----- (this is t2) --+--> AddOp --> t5 + // ... | | + // ... -+-> ReluOp --> t4 --+ | + // ... | | | + // ... -+ | | + // ... TensorOp --> tw --+ | + // ... | + // | + // TensorOp --> t6 --+--> AddOp --> t8 --+--> AddOp --> t9 + // | + // TensorOp --> t7 --+ + // | + // TensorOp --> tu --+ + // + // (tw and tu are output references, hidden from the code) + // + + ark::ModelTensorRef t6 = model.tensor({1}, ark::FP32); + ark::ModelTensorRef t7 = model.tensor({1}, ark::FP32); + ark::ModelTensorRef t8 = model.add(t6, t7); + ark::ModelTensorRef t9 = model.add(t5, t8); + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,AddOp,ReluOp,AddOp,) --+ + // | + // (AddOp,) --+--> (AddOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + + nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 3); + + nodes_iter = nodes.begin(); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add;add_1;relu;add_2;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], t3); + UNITTEST_EQ(node->ops[2]->result_tensors()[0], t4); + UNITTEST_EQ(node->ops[3]->result_tensors()[0], t5); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_3;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t8); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_4;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t9); + + // Test an Op that uses a single input tensor for multiple inputs. + // Model graph (omit leftmost part): + // + // ... ----- (this is t2) --+--> AddOp --> t5 + // ... | | + // ... -+-> ReluOp --> t4 --+ | + // ... | | | + // ... -+ | | + // ... TensorOp --> tw --+ | + // ... 
| + // | + // TensorOp --> t6 --+--> AddOp --> t8 --+--> AddOp --> t9 + // | + // TensorOp --> t7 --+ + // | + // TensorOp --> tu --+ + // + // TensorOp --> t10 --+--> AddOp --> t11 + // | ^ ^ + // | | | + // +----+ | + // | + // TensorOp --> tv -----------+ + // + // (tw, tu, and tv are output references, hidden from the code) + // + + ark::ModelTensorRef t10 = model.tensor({1}, ark::FP32); + ark::ModelTensorRef t11 = model.add(t10, t10); + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,AddOp,ReluOp,AddOp,) --+ + // | + // (AddOp,) --+--> (AddOp,) + // + // (AddOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + + nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 4); + + nodes_iter = nodes.begin(); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add;add_1;relu;add_2;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], t3); + UNITTEST_EQ(node->ops[2]->result_tensors()[0], t4); + UNITTEST_EQ(node->ops[3]->result_tensors()[0], t5); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_3;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t8); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_4;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t9); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_5;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t11); + + // Test using previous Ops' outputs from multiple different Ops. + // Model graph (omit leftmost part): + // + // ... ----- (this is t2) --+--> AddOp --> t5 + // ... | | + // ... -+-> ReluOp --> t4 --+ | + // ... | | | + // ... -+ | | + // ... TensorOp --> tw --+ | + // ... | + // | + // TensorOp --> t6 --+--> AddOp --> t8 --+--> AddOp --> t9 + // | | + // TensorOp --> t7 --+ +--> AddOp --> t12 + // | + // TensorOp --> tu --+ + // + // TensorOp --> t10 --+--> AddOp --> t11 + // | ^ ^ + // | | | + // +----+ | + // | + // TensorOp --> tv -----------+ + // + // (tw, tu, and tv are output references, hidden from the code) + // + + ark::ModelTensorRef t12 = model.add(t5, t8); + UNITTEST_TRUE(model.verify()); + + // OpNode graph (parentheses indicate a OpNode): + // + // (AddOp,AddOp,ReluOp,AddOp,) --+--> (AddOp,) + // | + // (AddOp,) --+--> (AddOp,) + // + // (AddOp,) + // + + compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + + nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 5); + + nodes_iter = nodes.begin(); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add;add_1;relu;add_2;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t2); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], t3); + UNITTEST_EQ(node->ops[2]->result_tensors()[0], t4); + UNITTEST_EQ(node->ops[3]->result_tensors()[0], t5); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_3;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t8); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_4;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t9); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_5;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t11); + node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "add_6;"); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], t12); + + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_model_dependent_inputs() { + ark::Model m; + + ark::ModelTensorRef ones = m.tensor({256, 256}, ark::FP16); + ark::ModelTensorRef x0 = 
m.scale(m.scale(ones, 2), 2); + ark::ModelTensorRef x1 = m.scale(m.scale(x0, 2), 2); + + ark::ModelTensorRef x2 = m.mul(ones, x1); + ark::ModelTensorRef x3 = m.mul(ones, x1); + ark::ModelTensorRef x4 = m.mul(x2, x3); + ark::ModelTensorRef y = m.add(x0, x4); + + auto compressed = m.compress(); + auto nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 4); + auto nodes_iter = nodes.begin(); + auto node = (nodes_iter++)->get(); + UNITTEST_EQ(node->ops.size(), 4); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], x0); + UNITTEST_EQ(node->ops[3]->result_tensors()[0], x1); + UNITTEST_EQ(node->consumers.size(), 3); + UNITTEST_EQ(node->producers.size(), 0); + node = (nodes_iter++)->get(); + UNITTEST_EQ(node->ops.size(), 1); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], x2); + UNITTEST_EQ(node->ops[0]->read_tensors()[0], ones); + UNITTEST_EQ(node->ops[0]->read_tensors()[1], x1); + UNITTEST_EQ(node->consumers.size(), 1); + UNITTEST_EQ(node->producers.size(), 1); + node = (nodes_iter++)->get(); + UNITTEST_EQ(node->ops.size(), 1); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], x3); + UNITTEST_EQ(node->ops[0]->read_tensors()[0], ones); + UNITTEST_EQ(node->ops[0]->read_tensors()[1], x1); + UNITTEST_EQ(node->consumers.size(), 1); + UNITTEST_EQ(node->producers.size(), 1); + node = (nodes_iter++)->get(); + UNITTEST_EQ(node->ops.size(), 2); + UNITTEST_EQ(node->ops[0]->result_tensors()[0], x4); + UNITTEST_EQ(node->ops[0]->read_tensors()[0], x2); + UNITTEST_EQ(node->ops[0]->read_tensors()[1], x3); + UNITTEST_EQ(node->ops[1]->result_tensors()[0], y); + UNITTEST_EQ(node->ops[1]->read_tensors()[0], x0); + UNITTEST_EQ(node->ops[1]->read_tensors()[1], x4); + UNITTEST_EQ(node->consumers.size(), 0); + UNITTEST_EQ(node->producers.size(), 3); + + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_model_noop() { + ark::Model model; + model.tensor({1}, ark::FP32); + model.tensor({1}, ark::FP32); + model.tensor({1}, ark::FP32); + + UNITTEST_TRUE(model.verify()); + + auto compressed = model.compress(); + UNITTEST_TRUE(compressed.verify()); + UNITTEST_EQ(compressed.nodes().size(), 0); + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_model_cumulate() { + // OpNode graph (parentheses indicate a OpNode): + // + // (Relu,) --+ (Relu,) --+ + // | | + // (Relu,Add,) --+--> (Add,) --+--> (Add,) + // + + ark::Model model; + ark::ModelTensorRef cumulate = model.tensor({1}, ark::FP32); + + for (int i = 0; i < 3; ++i) { + ark::ModelTensorRef t = model.tensor({1}, ark::FP32); + ark::ModelTensorRef r = model.relu(t); + cumulate = model.add(cumulate, r); + } + + UNITTEST_TRUE(model.verify()); + + auto compressed = model.compress(); + auto nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 5); + + auto last_node = nodes.back().get(); + UNITTEST_EQ(last_node->ops[0]->result_tensors()[0], cumulate); + UNITTEST_EQ(last_node->producers.size(), 2); + UNITTEST_EQ(last_node->consumers.size(), 0); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_model_basics); + UNITTEST(test_model_dependent_inputs); + UNITTEST(test_model_noop); + UNITTEST(test_model_cumulate); + return 0; +} diff --git a/ark/model/named_type.cpp b/ark/model/named_type.cpp new file mode 100644 index 000000000..2f6bdb31a --- /dev/null +++ b/ark/model/named_type.cpp @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include "named_type.hpp" + +namespace ark { + +bool operator==(const NamedT &lhs, const NamedT &rhs) { + return lhs.type_name() == rhs.type_name(); +} + +} // namespace ark diff --git a/ark/model/named_type.hpp b/ark/model/named_type.hpp new file mode 100644 index 000000000..344cc6980 --- /dev/null +++ b/ark/model/named_type.hpp @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_NAMED_TYPE_HPP_ +#define ARK_NAMED_TYPE_HPP_ + +#include + +namespace ark { + +class NamedT { + public: + NamedT(const std::string &type_name) : type_name_(type_name) {} + NamedT &operator=(const NamedT &) = default; + + const std::string &type_name() const { return type_name_; } + + private: + std::string type_name_; +}; + +bool operator==(const NamedT &lhs, const NamedT &rhs); + +} // namespace ark + +#endif // ARK_NAMED_TYPE_HPP_ diff --git a/ark/ops/ops_all_reduce.cpp b/ark/ops/ops_all_reduce.cpp new file mode 100644 index 000000000..647c50d1e --- /dev/null +++ b/ark/ops/ops_all_reduce.cpp @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "math_utils.h" +#include "ops_common.hpp" + +namespace ark { + +ModelTensorRef Model::all_reduce(ModelTensorRef input, int gpu_id, int gpu_num, + ModelTensorRef output, const std::string &) { + if (!input->is_sequential()) { + LOG(WARN, + "all_reduce may not work correctly if the input tensor is " + "not contiguous"); + } + ModelTensorRef prev_recv; + ModelTensorRef cumulate = input; + for (int i = 1; i < gpu_num; i++) { + int gpu_dst = (gpu_id + i) % gpu_num; + int gpu_src = (gpu_id + gpu_num - i) % gpu_num; + ModelTensorRef send_data; + if (prev_recv) { + send_data = this->identity(input, {prev_recv}); + } else { + send_data = input; + } + send_data = this->send(send_data, gpu_id, gpu_dst); + ModelTensorRef send_done_tensor = + this->send_done(send_data, gpu_id, gpu_dst); + ModelTensorRef recv_buf = + this->tensor(input->shape(), input->data_type()); + recv_buf = this->identity(recv_buf, {send_done_tensor}); + ModelTensorRef recv = this->recv(gpu_src, gpu_src, 0, recv_buf); + prev_recv = recv; + cumulate = this->add(cumulate, recv); + } + return cumulate; +} + +} // namespace ark diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp new file mode 100644 index 000000000..26420d5ee --- /dev/null +++ b/ark/ops/ops_all_reduce_test.cpp @@ -0,0 +1,70 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include "ark/model.hpp" +#include "logging.h" +#include "model/model_node.hpp" +#include "model/model_op.hpp" +#include "unittest/unittest_utils.h" + +ark::unittest::State test_model_op_all_reduce() { + // OpNode graph (parentheses indicate a OpNode): + // + // +--> (S,SD,R,) --+--> (S,SD,R,) --+ + // | | | + // (S,SD,R,) --+--> (Add,) +--> (Add,) +--> (Add,) + // | ^ | ^ + // | | | | + // +---------------+ +--------------+ + + ark::Model model; + ark::ModelTensorRef input = model.tensor({1}, ark::FP32); + ark::ModelTensorRef output = model.all_reduce(input, 0, 4); + + UNITTEST_TRUE(model.verify()); + + auto compressed = model.compress(); + auto nodes = compressed.nodes(); + UNITTEST_EQ(nodes.size(), 6); + + auto nodes_iter = nodes.begin(); + auto node = *(nodes_iter++); + // UNITTEST_EQ(node->get_name(), "send;send_done;recv;"); + UNITTEST_EQ(node->producers.size(), 0); + UNITTEST_EQ(node->consumers.size(), 2); + + // UNITTEST_EQ(node->consumers[0]->get_name(), "add;"); + UNITTEST_EQ(node->consumers[0]->consumers.size(), 1); + // UNITTEST_EQ((*(node->consumers[0]->consumers.begin()))->get_name(), + // "add_1;"); + + // UNITTEST_EQ(node->consumers[1]->get_name(), + // "send_1;send_done_1;recv_1;"); + UNITTEST_EQ(node->consumers[1]->producers.size(), 1); + UNITTEST_EQ(node->consumers[1]->consumers.size(), 2); + + node = node->consumers[1]; + + // UNITTEST_EQ(node->consumers[0]->get_name(), "add_1;"); + UNITTEST_EQ(node->consumers[0]->producers.size(), 2); + UNITTEST_EQ(node->consumers[0]->consumers.size(), 1); + // UNITTEST_EQ((*(node->consumers[0]->consumers.begin()))->get_name(), + // "add_2;"); + + // UNITTEST_EQ(node->consumers[1]->get_name(), + // "send_2;send_done_2;recv_2;"); + UNITTEST_EQ(node->consumers[1]->producers.size(), 1); + UNITTEST_EQ(node->consumers[1]->consumers.size(), 1); + // UNITTEST_EQ((*(node->consumers[1]->consumers.begin()))->get_name(), + // "add_2;"); + UNITTEST_EQ( + (*(node->consumers[1]->consumers.begin()))->ops[0]->result_tensors()[0], + output); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_model_op_all_reduce); + return 0; +} diff --git a/ark/ops/ops_arithmetic.cpp b/ark/ops/ops_arithmetic.cpp new file mode 100644 index 000000000..992b60128 --- /dev/null +++ b/ark/ops/ops_arithmetic.cpp @@ -0,0 +1,74 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include "ops_arithmetic.hpp" + +#include "ops_common.hpp" + +namespace ark { + +ModelOpArithmetic::ModelOpArithmetic(const std::string &type_name, + ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output) + : ModelOp(type_name) { + check_match_data_type(input, other); + if (output) { + check_match_data_type(input, output); + } + Dims output_shape = broadcast_shape(input->shape(), other->shape()); + if (output) { + check_shape(output, output_shape); + } else { + output = std::make_shared( + input->data_type(), std::make_shared(), output_shape); + } + ModelTensorRef result = std::make_shared(*output); + + read_tensors_ = {input, other}; + write_tensors_ = {output}; + result_tensors_ = {result}; + + verify(); +} + +ModelOpAdd::ModelOpAdd(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output) + : ModelOpArithmetic("Add", input, other, output) {} + +ModelTensorRef Model::add(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output, const std::string &name) { + return impl_->create_op(name, input, other, output) + ->result_tensors()[0]; +} + +ModelOpMul::ModelOpMul(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output) + : ModelOpArithmetic("Mul", input, other, output) {} + +ModelTensorRef Model::mul(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output, const std::string &name) { + return impl_->create_op(name, input, other, output) + ->result_tensors()[0]; +} + +ModelOpSub::ModelOpSub(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output) + : ModelOpArithmetic("Sub", input, other, output) {} + +ModelTensorRef Model::sub(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output, const std::string &name) { + return impl_->create_op(name, input, other, output) + ->result_tensors()[0]; +} + +ModelOpDiv::ModelOpDiv(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output) + : ModelOpArithmetic("Div", input, other, output) {} + +ModelTensorRef Model::div(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output, const std::string &name) { + return impl_->create_op(name, input, other, output) + ->result_tensors()[0]; +} + +} // namespace ark diff --git a/ark/ops/ops_arithmetic.hpp b/ark/ops/ops_arithmetic.hpp new file mode 100644 index 000000000..ea5886381 --- /dev/null +++ b/ark/ops/ops_arithmetic.hpp @@ -0,0 +1,50 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#ifndef ARK_OPS_ARITHMETIC_HPP_ +#define ARK_OPS_ARITHMETIC_HPP_ + +#include "ark/dims.hpp" +#include "ark/model.hpp" +#include "model/model_op.hpp" + +namespace ark { + +class ModelOpArithmetic : public ModelOp { + public: + ModelOpArithmetic() = default; + ModelOpArithmetic(const std::string &type_name, ModelTensorRef input, + ModelTensorRef other, ModelTensorRef output); +}; + +class ModelOpAdd : public ModelOpArithmetic { + public: + ModelOpAdd() = default; + ModelOpAdd(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output); +}; + +class ModelOpMul : public ModelOpArithmetic { + public: + ModelOpMul() = default; + ModelOpMul(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output); +}; + +class ModelOpSub : public ModelOpArithmetic { + public: + ModelOpSub() = default; + ModelOpSub(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output); +}; + +class ModelOpDiv : public ModelOpArithmetic { + public: + ModelOpDiv() = default; + ModelOpDiv(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output); +}; + +} // namespace ark + +#endif // ARK_OPS_ARITHMETIC_HPP_ diff --git a/ark/ops/ops_common.cpp b/ark/ops/ops_common.cpp new file mode 100644 index 000000000..6cc269aae --- /dev/null +++ b/ark/ops/ops_common.cpp @@ -0,0 +1,57 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ops_common.hpp" + +#include +#include +#include + +#include "logging.h" + +namespace ark { + +void check_match_data_type(ModelTensorRef a, ModelTensorRef b) { + if (a->data_type() != b->data_type()) { + ERR(InvalidUsageError, + "data types mismatch: ", a->data_type()->type_name(), ", ", + b->data_type()->type_name()); + } +} + +void check_match_shape(ModelTensorRef a, ModelTensorRef b) { + if (a->shape() != b->shape()) { + ERR(InvalidUsageError, "shapes mismatch: ", a->shape(), ", ", + b->shape()); + } +} + +void check_shape(ModelTensorRef tensor, const Dims &shape) { + if (tensor->shape() != shape) { + ERR(InvalidUsageError, "shape mismatch: ", tensor->shape(), " and ", + shape); + } +} + +Dims broadcast_shape(const Dims &dims1, const Dims &dims2) { + std::vector output_dims_reversed; + int ndims = std::max(dims1.ndims(), dims2.ndims()); + for (int i = 1; i < ndims + 1; ++i) { + int d1 = (i - 1 < dims1.ndims()) ? dims1[-i] : 1; + int d2 = (i - 1 < dims2.ndims()) ? dims2[-i] : 1; + if (d1 == d2) { + output_dims_reversed.push_back(d1); + } else if (d1 == 1) { + output_dims_reversed.push_back(d2); + } else if (d2 == 1) { + output_dims_reversed.push_back(d1); + } else { + ERR(InvalidUsageError, + "input and other cannot be broadcasted: ", dims1, ", ", dims2); + } + } + std::reverse(output_dims_reversed.begin(), output_dims_reversed.end()); + return Dims{output_dims_reversed}; +} + +} // namespace ark diff --git a/ark/ops/ops_common.hpp b/ark/ops/ops_common.hpp new file mode 100644 index 000000000..b6b9c6966 --- /dev/null +++ b/ark/ops/ops_common.hpp @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
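+//
+// Broadcasting examples for broadcast_shape() (illustrative):
+//   broadcast_shape({256, 1}, {64}) -> {256, 64}
+//   broadcast_shape({256, 2}, {3})  -> error: 2 vs 3 cannot broadcast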
+
+#ifndef ARK_OPS_COMMON_HPP_
+#define ARK_OPS_COMMON_HPP_
+
+#include
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "logging.h"
+#include "model/model_graph_impl.hpp"
+#include "model/model_op.hpp"
+#include "model/model_tensor.hpp"
+
+namespace ark {
+
+void check_match_data_type(ModelTensorRef a, ModelTensorRef b);
+
+void check_match_shape(ModelTensorRef a, ModelTensorRef b);
+
+void check_shape(ModelTensorRef tensor, const Dims &shape);
+
+/// Return the output shape of broadcasting between two shapes.
+/// Follows NumPy rules.
+/// https://numpy.org/doc/stable/user/basics.broadcasting.html
+/// @param dims1 The first shape.
+/// @param dims2 The second shape.
+Dims broadcast_shape(const Dims &dims1, const Dims &dims2);
+
+} // namespace ark
+
+#endif // ARK_OPS_COMMON_HPP_
diff --git a/ark/ops/ops_identity.cpp b/ark/ops/ops_identity.cpp
new file mode 100644
index 000000000..8871316b3
--- /dev/null
+++ b/ark/ops/ops_identity.cpp
@@ -0,0 +1,36 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ops_identity.hpp"
+
+#include <set>
+
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpIdentity::ModelOpIdentity(ModelTensorRef input,
+                                 const std::vector<ModelTensorRef> &deps)
+    : ModelOpTensor(input->buffer(), input->shape(), input->data_type(),
+                    input->strides(), input->offsets(), input->pads(),
+                    input->exported(), input->imported_rank()) {
+    std::set<ModelTensorRef> dep_set;
+    dep_set.emplace(input);
+    read_tensors_.emplace_back(input);
+    for (auto &dep : deps) {
+        if (dep_set.emplace(dep).second) {
+            read_tensors_.emplace_back(dep);
+        }
+    }
+
+    verify();
+}
+
+ModelTensorRef Model::identity(ModelTensorRef input,
+                               const std::vector<ModelTensorRef> &deps,
+                               const std::string &name) {
+    return impl_->create_op<ModelOpIdentity>(name, input, deps)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_identity.hpp b/ark/ops/ops_identity.hpp
new file mode 100644
index 000000000..3bef04623
--- /dev/null
+++ b/ark/ops/ops_identity.hpp
@@ -0,0 +1,23 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_OPS_IDENTITY_HPP_
+#define ARK_OPS_IDENTITY_HPP_
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+#include "ops_tensor.hpp"
+
+namespace ark {
+
+class ModelOpIdentity : public ModelOpTensor {
+   public:
+    ModelOpIdentity() = default;
+    ModelOpIdentity(ModelTensorRef input,
+                    const std::vector<ModelTensorRef> &deps);
+};
+
+} // namespace ark
+
+#endif // ARK_OPS_IDENTITY_HPP_
diff --git a/ark/ops/ops_identity_test.cpp b/ark/ops/ops_identity_test.cpp
new file mode 100644
index 000000000..033092db8
--- /dev/null
+++ b/ark/ops/ops_identity_test.cpp
@@ -0,0 +1,57 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
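+
+// What this test exercises, in short (a sketch; tensor names are local to
+// the test below): identity() aliases its input's buffer and records extra
+// read dependencies, so the final Relu node gains two producer edges even
+// though it only reads t2's data:
+//
+//   auto t3 = model.identity(t2, {r0, r1});  // t3 aliases t2's buffer
+//   auto t4 = model.relu(t3);                // ordered after r0 and r1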
+
+#include "ark/model.hpp"
+#include "logging.h"
+#include "model/model_node.hpp"
+#include "model/model_op.hpp"
+#include "model/model_tensor.hpp"
+#include "unittest/unittest_utils.h"
+
+ark::unittest::State test_model_op_identity() {
+    // OpNode graph (parentheses indicate an OpNode):
+    //
+    //   (Relu,) --+
+    //             |
+    //   (Relu,) --+--> (Relu,)
+    //
+
+    ark::Model model;
+    ark::ModelTensorRef t0 = model.tensor({1}, ark::FP32);
+    ark::ModelTensorRef t1 = model.tensor({1}, ark::FP32);
+    ark::ModelTensorRef t2 = model.tensor({1}, ark::FP32);
+
+    ark::ModelTensorRef r0 = model.relu(t0);
+    ark::ModelTensorRef r1 = model.relu(t1);
+    ark::ModelTensorRef t3 = model.identity(t2, {r0, r1});
+
+    ark::ModelTensorRef t4 = model.relu(t3);
+    UNITTEST_TRUE(model.verify());
+
+    auto compressed = model.compress();
+    auto nodes = compressed.nodes();
+    UNITTEST_EQ(nodes.size(), 3);
+
+    auto nodes_iter = nodes.begin();
+    auto node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], r0);
+    UNITTEST_EQ(node->producers.size(), 0);
+    UNITTEST_EQ(node->consumers.size(), 1);
+
+    node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], r1);
+    UNITTEST_EQ(node->producers.size(), 0);
+    UNITTEST_EQ(node->consumers.size(), 1);
+
+    node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], t4);
+    UNITTEST_EQ(node->producers.size(), 2);
+    UNITTEST_EQ(node->consumers.size(), 0);
+
+    return ark::unittest::SUCCESS;
+}
+
+int main() {
+    UNITTEST(test_model_op_identity);
+    return 0;
+}
diff --git a/ark/ops/ops_math.cpp b/ark/ops/ops_math.cpp
new file mode 100644
index 000000000..7340a8308
--- /dev/null
+++ b/ark/ops/ops_math.cpp
@@ -0,0 +1,48 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ops_math.hpp"
+
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpMath::ModelOpMath(const std::string &type_name, ModelTensorRef input,
+                         ModelTensorRef output)
+    : ModelOp(type_name) {
+    if (output) {
+        check_match_data_type(input, output);
+        check_match_shape(input, output);
+    } else {
+        output = std::make_shared<ModelTensor>(input->data_type(),
+                                               std::make_shared<ModelBuffer>(),
+                                               input->shape());
+    }
+    ModelTensorRef result = std::make_shared<ModelTensor>(*output);
+
+    read_tensors_ = {input};
+    write_tensors_ = {output};
+    result_tensors_ = {result};
+
+    verify();
+}
+
+ModelOpExp::ModelOpExp(ModelTensorRef input, ModelTensorRef output)
+    : ModelOpMath("Exp", input, output) {}
+
+ModelTensorRef Model::exp(ModelTensorRef input, ModelTensorRef output,
+                          const std::string &name) {
+    return impl_->create_op<ModelOpExp>(name, input, output)
+        ->result_tensors()[0];
+}
+
+ModelOpRelu::ModelOpRelu(ModelTensorRef input, ModelTensorRef output)
+    : ModelOpMath("Relu", input, output) {}
+
+ModelTensorRef Model::relu(ModelTensorRef input, ModelTensorRef output,
+                           const std::string &name) {
+    return impl_->create_op<ModelOpRelu>(name, input, output)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_math.hpp b/ark/ops/ops_math.hpp
new file mode 100644
index 000000000..7f80b6e53
--- /dev/null
+++ b/ark/ops/ops_math.hpp
@@ -0,0 +1,34 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
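+
+// Usage sketch (illustrative shapes): unlike the arithmetic ops, the unary
+// math ops do not broadcast; a caller-supplied output must match the input
+// in both data type and shape:
+//
+//   auto t = m.tensor({64, 64}, ark::FP32);
+//   auto u = m.exp(t);       // allocates a new {64, 64} FP32 output
+//   auto o = m.tensor({64, 64}, ark::FP32);
+//   auto v = m.relu(t, o);   // writes into o; shape/dtype are checked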
+ +#ifndef ARK_OPS_MATH_HPP_ +#define ARK_OPS_MATH_HPP_ + +#include "ark/dims.hpp" +#include "ark/model.hpp" +#include "model/model_op.hpp" + +namespace ark { + +class ModelOpMath : public ModelOp { + public: + ModelOpMath() = default; + ModelOpMath(const std::string &type_name, ModelTensorRef input, + ModelTensorRef output); +}; + +class ModelOpExp : public ModelOpMath { + public: + ModelOpExp() = default; + ModelOpExp(ModelTensorRef input, ModelTensorRef output); +}; + +class ModelOpRelu : public ModelOpMath { + public: + ModelOpRelu() = default; + ModelOpRelu(ModelTensorRef input, ModelTensorRef output); +}; + +} // namespace ark + +#endif // ARK_OPS_MATH_HPP_ diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp new file mode 100644 index 000000000..baeed3fac --- /dev/null +++ b/ark/ops/ops_matmul.cpp @@ -0,0 +1,143 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ops_matmul.hpp" + +#include "ops_common.hpp" + +namespace ark { + +ModelOpMatmul::ModelOpMatmul(ModelTensorRef input, ModelTensorRef other, + ModelTensorRef output, bool trans_input, + bool trans_other) + : ModelOp("Matmul") { + // Shape verification. + const Dims &shp_a = input->shape(); + const Dims &shp_b = other->shape(); + int ndims_a = shp_a.ndims(); + int ndims_b = shp_b.ndims(); + + if (ndims_a < 1) { + ERR(InvalidUsageError, "input has an empty shape: ", shp_a); + } + if (ndims_b < 1) { + ERR(InvalidUsageError, "other has an empty shape: ", shp_b); + } + + // m: the number of rows of output matrix (row-major) + // n: the number of columns of output matrix (row-major) + // k: the inner dimension of matrix multiplication + DimType m; + DimType n; + DimType k; + DimType k2; + + m = (ndims_a == 1) ? 1 : shp_a[ndims_a - 2]; + k = shp_a[ndims_a - 1]; + if (trans_input) { + DimType tmp = m; + m = k; + k = tmp; + } + n = (ndims_b == 1) ? 1 : shp_b[ndims_b - 1]; + k2 = (ndims_b == 1) ? shp_b[0] : shp_b[ndims_b - 2]; + if (trans_other) { + DimType tmp = n; + n = k2; + k2 = tmp; + } + if (k != k2) { + ERR(InvalidUsageError, "inner dimensions mismatch: ", k, " and ", k2); + } + + check_match_data_type(input, other); + if (output) { + check_match_data_type(input, output); + } + + // N and C dimensions of matrix A + Dims nca{1, 1}; + if (ndims_a == 4) { + nca[0] = shp_a[0]; + nca[1] = shp_a[1]; + } else if (ndims_a == 3) { + nca[1] = shp_a[0]; + } + + // N and C dimensions of matrix B + Dims ncb{1, 1}; + if (ndims_b == 4) { + ncb[0] = shp_b[0]; + ncb[1] = shp_b[1]; + } else if (ndims_b == 3) { + ncb[1] = shp_b[0]; + } + + // Verify broadcasting + if (nca[0] != ncb[0] && nca[0] != 1 && ncb[0] != 1) { + ERR(InvalidUsageError, "N dimension mismatch: ", nca[0], " and ", + ncb[0]); + } + if (nca[1] != ncb[1] && nca[1] != 1 && ncb[1] != 1) { + ERR(InvalidUsageError, "C dimension mismatch: ", nca[1], " and ", + ncb[1]); + } + + // N and C dimension of output matrix + Dims ncc{std::max(nca[0], ncb[0]), std::max(nca[1], ncb[1])}; + + Dims output_shape; + if (std::max(ndims_a, ndims_b) == 4) { + output_shape = Dims{ncc[0], ncc[1], m, n}; + } else if (std::max(ndims_a, ndims_b) == 3) { + output_shape = Dims{ncc[1], m, n}; + } else { + output_shape = Dims{m, n}; + } + + // Create an output Tensor. 
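+    // Worked example (values are illustrative): input {2, 8, 64, 32} times
+    // other {32, 16} without transposes gives m = 64, n = 16, k = 32,
+    // nca = {2, 8}, ncb = {1, 1}, hence ncc = {2, 8} and an output shape
+    // of {2, 8, 64, 16}.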
+    if (output) {
+        check_shape(output, output_shape);
+    } else {
+        output = std::make_shared<ModelTensor>(
+            input->data_type(), std::make_shared<ModelBuffer>(), output_shape);
+    }
+    ModelTensorRef result = std::make_shared<ModelTensor>(*output);
+
+    const Dims &strides_a = input->strides();
+    const Dims &strides_b = other->strides();
+    const Dims &strides_y = output->strides();
+    // NOTE: `strides_mnk` here is just an expected value. We can
+    // calculate the exact value only after a specific implementation is
+    // determined.
+    Dims strides_mnk{
+        trans_input ? strides_a[ndims_a - 2] : strides_a[ndims_a - 1],
+        strides_y[strides_y.ndims() - 1], strides_y[strides_y.ndims() - 1],
+        trans_other ? strides_b[ndims_b - 2] : strides_b[ndims_b - 1]};
+
+    // a.k.a. problem size
+    Dims shapes_mnk{m, n, k};
+
+    read_tensors_ = {input, other};
+    write_tensors_ = {output};
+    result_tensors_ = {result};
+    args_["InputDimNC"] = nca;
+    args_["OtherDimNC"] = ncb;
+    args_["ShapesMNK"] = shapes_mnk;
+    args_["StridesMNK"] = strides_mnk;
+    args_["IsInputColumnMajor"] = trans_input;
+    args_["IsOtherColumnMajor"] = trans_other;
+
+    verify();
+}
+
+ModelTensorRef Model::matmul(ModelTensorRef input, ModelTensorRef other,
+                             ModelTensorRef output, bool trans_input,
+                             bool trans_other, const std::string &name) {
+    return impl_
+        ->create_op<ModelOpMatmul>(name, input, other, output, trans_input,
+                                   trans_other)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_matmul.hpp b/ark/ops/ops_matmul.hpp
new file mode 100644
index 000000000..f8b98ae56
--- /dev/null
+++ b/ark/ops/ops_matmul.hpp
@@ -0,0 +1,22 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_OPS_MATMUL_HPP_
+#define ARK_OPS_MATMUL_HPP_
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+
+namespace ark {
+
+class ModelOpMatmul : public ModelOp {
+   public:
+    ModelOpMatmul() = default;
+    ModelOpMatmul(ModelTensorRef input, ModelTensorRef other,
+                  ModelTensorRef output, bool trans_input, bool trans_other);
+};
+
+} // namespace ark
+
+#endif // ARK_OPS_MATMUL_HPP_
diff --git a/ark/ops/ops_matmul_test.cpp b/ark/ops/ops_matmul_test.cpp
new file mode 100644
index 000000000..c78e1ed63
--- /dev/null
+++ b/ark/ops/ops_matmul_test.cpp
@@ -0,0 +1,76 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include
+
+#include "ark/model.hpp"
+#include "logging.h"
+#include "model/model_node.hpp"
+#include "model/model_op.hpp"
+#include "unittest/unittest_utils.h"
+
+ark::unittest::State test_model_op_matmul() {
+    // Hidden dimension of the dense layer.
+    unsigned int units = 1024;
+    // Input dimension of the dense layer.
+    unsigned int in_dim = 1024;
+    // Extra dimension of the input. CHANNEL=1 for 2D inputs.
+    unsigned int channel = 128;
+    // Batch size of the input.
+    unsigned int batch_size = 1;
+
+    ark::Model m;
+    ark::ModelTensorRef input =
+        m.tensor({batch_size, channel, in_dim}, ark::FP16);
+    ark::ModelTensorRef weight = m.tensor({in_dim, units}, ark::FP16);
+    m.matmul(input, weight);
+
+    UNITTEST_TRUE(m.verify());
+    auto compressed = m.compress();
+    UNITTEST_TRUE(compressed.verify());
+
+    return ark::unittest::SUCCESS;
+}
+
+// ark::unittest::State test_model_op_split_matmul() {
+//     // OpNode graph (parentheses indicate an OpNode):
+//     //
+//     //   (Matmul,) --+
+//     //               |
+//     //   (Matmul,) --+--> (Reduce,)
+//     //
+
+//     ark::Model model;
+//     ark::ModelTensorRef t0 = model.tensor({64, 128}, ark::FP16);
+//     ark::ModelTensorRef t1 = model.tensor({128, 64}, ark::FP16);
+//     model.matmul(t0, t1, nullptr, 2, false, false, "matmul", 3);
+//     UNITTEST_TRUE(model.verify());
+
+//     auto compressed = model.compress();
+//     auto nodes = compressed.nodes();
+//     UNITTEST_EQ(nodes.size(), 3);
+
+//     auto nodes_iter = nodes.begin();
+//     auto node = (nodes_iter++)->get();
+//     // UNITTEST_EQ(node->ops[0]->name, "matmul/matmul_shard_0");
+//     UNITTEST_EQ(node->producers.size(), 0);
+//     UNITTEST_EQ(node->consumers.size(), 1);
+
+//     node = (nodes_iter++)->get();
+//     // UNITTEST_EQ(node->ops[0]->name, "matmul/matmul_shard_1");
+//     UNITTEST_EQ(node->producers.size(), 0);
+//     UNITTEST_EQ(node->consumers.size(), 1);
+
+//     node = (nodes_iter++)->get();
+//     // UNITTEST_EQ(node->ops[0]->name, "matmul/reduce_sum");
+//     UNITTEST_EQ(node->producers.size(), 2);
+//     UNITTEST_EQ(node->consumers.size(), 0);
+
+//     return ark::unittest::SUCCESS;
+// }
+
+int main() {
+    UNITTEST(test_model_op_matmul);
+    // UNITTEST(test_model_op_split_matmul);
+    return 0;
+}
diff --git a/ark/ops/ops_refer.cpp b/ark/ops/ops_refer.cpp
new file mode 100644
index 000000000..fca1f8566
--- /dev/null
+++ b/ark/ops/ops_refer.cpp
@@ -0,0 +1,29 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ops_refer.hpp"
+
+#include
+
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpRefer::ModelOpRefer(ModelTensorRef input, const Dims &shape,
+                           const Dims &strides, const Dims &offsets,
+                           const Dims &pads)
+    : ModelOpTensor(input->buffer(), shape, input->data_type(), strides,
+                    offsets, pads, input->exported(), input->imported_rank()) {
+    read_tensors_ = {input};
+    verify();
+}
+
+ModelTensorRef Model::refer(ModelTensorRef input, const Dims &shape,
+                            const Dims &strides, const Dims &offsets,
+                            const Dims &pads, const std::string &name) {
+    return impl_
+        ->create_op<ModelOpRefer>(name, input, shape, strides, offsets, pads)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_refer.hpp b/ark/ops/ops_refer.hpp
new file mode 100644
index 000000000..84d6ae362
--- /dev/null
+++ b/ark/ops/ops_refer.hpp
@@ -0,0 +1,23 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_OPS_REFER_HPP_
+#define ARK_OPS_REFER_HPP_
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+#include "ops_tensor.hpp"
+
+namespace ark {
+
+class ModelOpRefer : public ModelOpTensor {
+   public:
+    ModelOpRefer() = default;
+    ModelOpRefer(ModelTensorRef input, const Dims &shape, const Dims &strides,
+                 const Dims &offsets, const Dims &pads);
+};
+
+} // namespace ark
+
+#endif // ARK_OPS_REFER_HPP_
diff --git a/ark/ops/ops_scale.cpp b/ark/ops/ops_scale.cpp
new file mode 100644
index 000000000..cf4d29566
--- /dev/null
+++ b/ark/ops/ops_scale.cpp
@@ -0,0 +1,69 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
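+
+// Sketch of the generated kernel name (config values are hypothetical):
+// with config = {"NumWarps": 4, "Tile": [64, 64]}, impl_name() below emits
+// roughly
+//   scale<InStrides4D, InShape4D, OutStrides4D, OutShape4D, {1,1,64,64}, 4, 0>
+// where the Dims arguments are the 4-D forms of the tensors' strides and
+// shapes, and the trailing 0 is assumed to be the static SMEM byte count.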
+
+#include "ops_scale.hpp"
+
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpScale::ModelOpScale(ModelTensorRef input, float val,
+                           ModelTensorRef output)
+    : ModelOp("Scale") {
+    if (output) {
+        check_match_data_type(input, output);
+        check_match_shape(input, output);
+    } else {
+        output = std::make_shared<ModelTensor>(input->data_type(),
+                                               std::make_shared<ModelBuffer>(),
+                                               input->shape());
+    }
+    ModelTensorRef result = std::make_shared<ModelTensor>(*output);
+
+    read_tensors_ = {input};
+    write_tensors_ = {output};
+    result_tensors_ = {result};
+    args_ = {{"Factor", val}};
+
+    verify();
+}
+
+std::string ModelOpScale::impl_name(const nlohmann::json &config) const {
+    if (!config.contains("NumWarps")) {
+        ERR(InvalidUsageError, "NumWarps is required for Scale");
+    } else if (!config.contains("Tile")) {
+        ERR(InvalidUsageError, "Tile is required for Scale");
+    }
+    int num_warps = config["NumWarps"];
+    auto &tile_shape = config["Tile"];
+    Dims unit_out_dims{tile_shape[0], tile_shape[1]};
+
+    std::vector<std::string> template_args;
+    template_args.emplace_back(vec_string(read_tensors_[0]->strides().dims4()));
+    template_args.emplace_back(vec_string(read_tensors_[0]->shape().dims4()));
+    template_args.emplace_back(
+        vec_string(write_tensors_[0]->strides().dims4()));
+    template_args.emplace_back(vec_string(write_tensors_[0]->shape().dims4()));
+    template_args.emplace_back(vec_string(unit_out_dims.dims4()));
+    template_args.emplace_back(std::to_string(num_warps));
+    template_args.emplace_back(std::to_string(0));
+    return function_name_string("scale", template_args);
+}
+
+std::vector<ModelOpArg> ModelOpScale::impl_args(
+    [[maybe_unused]] const nlohmann::json &config) const {
+    float factor = args_.at("Factor").value<float>();
+    std::vector<ModelOpArg> args;
+    args.emplace_back(result_tensors_[0]);
+    args.emplace_back(read_tensors_[0]);
+    args.emplace_back(factor);
+    return args;
+}
+
+ModelTensorRef Model::scale(ModelTensorRef input, float val,
+                            ModelTensorRef output, const std::string &name) {
+    return impl_->create_op<ModelOpScale>(name, input, val, output)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_scale.hpp b/ark/ops/ops_scale.hpp
new file mode 100644
index 000000000..937028b14
--- /dev/null
+++ b/ark/ops/ops_scale.hpp
@@ -0,0 +1,26 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_OPS_SCALE_HPP_
+#define ARK_OPS_SCALE_HPP_
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+
+namespace ark {
+
+class ModelOpScale : public ModelOp {
+   public:
+    ModelOpScale() = default;
+    ModelOpScale(ModelTensorRef input, float val, ModelTensorRef output);
+
+    std::string impl_name(const nlohmann::json &config) const override;
+
+    std::vector<ModelOpArg> impl_args(
+        [[maybe_unused]] const nlohmann::json &config) const override;
+};
+
+} // namespace ark
+
+#endif // ARK_OPS_SCALE_HPP_
diff --git a/ark/ops/ops_sendrecv.cpp b/ark/ops/ops_sendrecv.cpp
new file mode 100644
index 000000000..4cff2d818
--- /dev/null
+++ b/ark/ops/ops_sendrecv.cpp
@@ -0,0 +1,100 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
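+
+// Usage sketch (ranks, sid, and sizes are illustrative; the defaulted
+// `bytes`/`name` arguments are assumptions): a one-directional transfer
+// pairs send()/send_done() on the sender with a recv() of the same sid on
+// the receiver; send() marks its input exported, and recv() with no output
+// tensor allocates a BYTE buffer of `bytes`:
+//
+//   // on rank 0:
+//   auto t = m.tensor({1024}, ark::FP32);
+//   m.send(t, /*sid=*/0, /*dst_rank=*/1);
+//   m.send_done(t, /*sid=*/0, /*dst_rank=*/1);
+//   // on rank 1:
+//   auto r = m.recv(/*sid=*/0, /*src_rank=*/0, /*bytes=*/4096);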
+
+#include "ops_sendrecv.hpp"
+
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpSend::ModelOpSend(ModelTensorRef input, int sid, int rank, int dst_rank,
+                         DimType bytes)
+    : ModelOp("Send") {
+    DimType max_bytes = input->strides().size() * input->data_type()->bytes();
+    if (max_bytes < bytes) {
+        LOG(ERROR, "invalid bytes: ", bytes, ", max: ", max_bytes);
+    }
+    if (bytes == 0) {
+        bytes = max_bytes;
+    }
+    input->set_exported();
+
+    ModelTensorRef recvbuf = std::make_shared<ModelTensor>(
+        input->data_type(), std::make_shared<ModelBuffer>(), input->shape());
+    recvbuf->set_imported_rank(dst_rank);
+
+    ModelTensorRef result = std::make_shared<ModelTensor>(*recvbuf);
+
+    read_tensors_ = {input};
+    write_tensors_ = {recvbuf};
+    result_tensors_ = {result};
+    args_["Rank"] = rank;
+    args_["DstRank"] = dst_rank;
+    args_["Bytes"] = bytes;
+    args_["Sid"] = sid;
+
+    verify();
+}
+
+ModelTensorRef Model::send(ModelTensorRef input, int sid, int dst_rank,
+                           DimType bytes, const std::string &name) {
+    return impl_
+        ->create_op<ModelOpSend>(name, input, sid, rank_, dst_rank, bytes)
+        ->result_tensors()[0];
+}
+
+ModelOpSendDone::ModelOpSendDone(ModelTensorRef input, int rank, int dst_rank)
+    : ModelOp("SendDone") {
+    ModelTensorRef result = std::make_shared<ModelTensor>(*input);
+    read_tensors_ = {};
+    write_tensors_ = {input};
+    result_tensors_ = {result};
+    args_["Rank"] = rank;
+    args_["DstRank"] = dst_rank;
+
+    verify();
+}
+
+ModelTensorRef Model::send_done(ModelTensorRef input, int, int dst_rank,
+                                const std::string &name) {
+    return impl_->create_op<ModelOpSendDone>(name, input, rank_, dst_rank)
+        ->result_tensors()[0];
+}
+
+ModelOpRecv::ModelOpRecv(ModelTensorRef output, int, int rank, int src_rank,
+                         DimType bytes)
+    : ModelOp("Recv") {
+    if (output == nullptr) {
+        if (bytes == 0) {
+            LOG(ERROR, "receive bytes cannot be 0");
+        }
+        output = std::make_shared<ModelTensor>(
+            BYTE, std::make_shared<ModelBuffer>(), Dims{bytes});
+    }
+    output->set_exported();
+    DimType max_bytes = output->shape().size() * output->data_type()->bytes();
+    if (max_bytes < bytes) {
+        LOG(ERROR, "invalid bytes: ", bytes, ", max: ", max_bytes);
+    }
+    if (bytes == 0) {
+        bytes = max_bytes;
+    }
+    ModelTensorRef result = std::make_shared<ModelTensor>(*output);
+
+    read_tensors_ = {};
+    write_tensors_ = {output};
+    result_tensors_ = {result};
+    args_["Rank"] = rank;
+    args_["SrcRank"] = src_rank;
+
+    verify();
+}
+
+ModelTensorRef Model::recv(int sid, int src_rank, DimType bytes,
+                           ModelTensorRef output, const std::string &name) {
+    return impl_
+        ->create_op<ModelOpRecv>(name, output, sid, rank_, src_rank, bytes)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_sendrecv.hpp b/ark/ops/ops_sendrecv.hpp
new file mode 100644
index 000000000..5874f99fc
--- /dev/null
+++ b/ark/ops/ops_sendrecv.hpp
@@ -0,0 +1,35 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_OPS_SENDRECV_HPP_
+#define ARK_OPS_SENDRECV_HPP_
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+
+namespace ark {
+
+class ModelOpSend : public ModelOp {
+   public:
+    ModelOpSend() = default;
+    ModelOpSend(ModelTensorRef input, int sid, int rank, int dst_rank,
+                DimType bytes);
+};
+
+class ModelOpSendDone : public ModelOp {
+   public:
+    ModelOpSendDone() = default;
+    ModelOpSendDone(ModelTensorRef input, int rank, int dst_rank);
+};
+
+class ModelOpRecv : public ModelOp {
+   public:
+    ModelOpRecv() = default;
+    ModelOpRecv(ModelTensorRef output, int, int rank, int src_rank,
+                DimType bytes);
+};
+
+} // namespace ark
+
+#endif // ARK_OPS_SENDRECV_HPP_
diff --git a/ark/ops/ops_sharding.cpp b/ark/ops/ops_sharding.cpp
new file mode 100644
index 000000000..83f184555
--- /dev/null
+++ b/ark/ops/ops_sharding.cpp
@@ -0,0 +1,55 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "math_utils.h"
+#include "ops_common.hpp"
+
+namespace ark {
+
+// Shard `input` along `axis` into `dim_per_shard`-dimensional shards.
+std::vector<ModelTensorRef> Model::sharding(ModelTensorRef input, DimType axis,
+                                            DimType dim_per_shard,
+                                            const std::string &name) {
+    if (axis >= DIMS_LEN) {
+        ERR(InvalidUsageError, "invalid axis value: ", axis);
+    }
+    if ((input->shape()[axis] % dim_per_shard) != 0) {
+        // If the total dimension is not divisible by the per-shard size,
+        // we need to check whether we can put a padding here.
+        // If the padded dimension of the input tensor is smaller than
+        // the leading dimension size, it means that the input tensor refers to
+        // a part of a buffer -- in this case, we cannot put a padding because
+        // the tensor has adjacent data.
+        DimType pdim = math::pad(input->shape()[axis], input->pads()[axis]);
+        if (pdim < input->strides()[axis]) {
+            ERR(InvalidUsageError, "the dimension of axis ", axis, " (",
+                input->shape()[axis],
+                ") is not divisible by the dimension per shard (",
+                dim_per_shard, ") and this tensor cannot be padded.");
+        }
+    }
+    std::vector<ModelTensorRef> shards;
+    DimType num_shard = math::div_up(input->shape()[axis], dim_per_shard);
+    Dims shard_shape = input->shape();
+    Dims shard_offs = input->offsets();
+    Dims shard_pads = input->pads();
+    for (DimType i = 0; i < num_shard; ++i) {
+        DimType dim;
+        if (i == (num_shard - 1)) {
+            dim = input->shape()[axis] - (i * dim_per_shard);
+            shard_pads[axis] = input->pads()[axis];
+        } else {
+            dim = dim_per_shard;
+            shard_pads[axis] = 1;
+        }
+        shard_shape[axis] = dim;
+        ModelTensorRef shard =
+            this->refer(input, shard_shape, input->strides(), shard_offs,
+                        shard_pads, name + "/shard_" + std::to_string(i));
+        shards.emplace_back(shard);
+        shard_offs[axis] += dim;
+    }
+    return shards;
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_sharding_test.cpp b/ark/ops/ops_sharding_test.cpp
new file mode 100644
index 000000000..12ea6eca8
--- /dev/null
+++ b/ark/ops/ops_sharding_test.cpp
@@ -0,0 +1,70 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
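+
+// Worked example for sharding() (sizes are illustrative): a {10} tensor
+// split along axis 0 with dim_per_shard = 4 yields three views over the
+// same buffer, with the last shard taking the remainder:
+//
+//   shard 0: shape {4}, offset {0}
+//   shard 1: shape {4}, offset {4}
+//   shard 2: shape {2}, offset {8}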
+
+#include "ark/model.hpp"
+#include "logging.h"
+#include "model/model_node.hpp"
+#include "model/model_op.hpp"
+#include "unittest/unittest_utils.h"
+
+ark::unittest::State test_model_op_sharding() {
+    // OpNode graph (parentheses indicate an OpNode):
+    //
+    //   (Relu,) --+
+    //             |
+    //   (Relu,) --+
+    //             |
+    //   (Relu,) --+--> (Relu,)
+    //
+
+    ark::Model model;
+    ark::ModelTensorRef t0 = model.tensor({3}, ark::FP32);
+
+    std::vector<ark::ModelTensorRef> vec = model.sharding(t0, 0, 1);
+    UNITTEST_EQ(vec.size(), 3);
+
+    ark::ModelTensorRef t1 = vec[0];
+    ark::ModelTensorRef t2 = vec[1];
+    ark::ModelTensorRef t3 = vec[2];
+
+    ark::ModelTensorRef r0 = model.relu(t1);
+    ark::ModelTensorRef r1 = model.relu(t2);
+    ark::ModelTensorRef r2 = model.relu(t3);
+
+    ark::ModelTensorRef t4 = model.identity(t0, {r0, r1, r2});
+
+    ark::ModelTensorRef t5 = model.relu(t4);
+    UNITTEST_TRUE(model.verify());
+
+    auto compressed = model.compress();
+    auto nodes = compressed.nodes();
+    UNITTEST_EQ(nodes.size(), 4);
+
+    auto nodes_iter = nodes.begin();
+    auto node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], r0);
+    UNITTEST_EQ(node->producers.size(), 0);
+    UNITTEST_EQ(node->consumers.size(), 1);
+
+    node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], r1);
+    UNITTEST_EQ(node->producers.size(), 0);
+    UNITTEST_EQ(node->consumers.size(), 1);
+
+    node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], r2);
+    UNITTEST_EQ(node->producers.size(), 0);
+    UNITTEST_EQ(node->consumers.size(), 1);
+
+    node = *(nodes_iter++);
+    UNITTEST_EQ(node->ops[0]->result_tensors()[0], t5);
+    UNITTEST_EQ(node->producers.size(), 3);
+    UNITTEST_EQ(node->consumers.size(), 0);
+
+    return ark::unittest::SUCCESS;
+}
+
+int main() {
+    UNITTEST(test_model_op_sharding);
+    return 0;
+}
diff --git a/ark/ops/ops_tensor.cpp b/ark/ops/ops_tensor.cpp
new file mode 100644
index 000000000..6d9abce82
--- /dev/null
+++ b/ark/ops/ops_tensor.cpp
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ops_tensor.hpp"
+
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpTensor::ModelOpTensor(ModelBufferRef buffer, const Dims &shape,
+                             ModelDataType data_type, const Dims &strides,
+                             const Dims &offsets, const Dims &pads,
+                             bool exported, int imported_rank)
+    : ModelOp("Tensor", true) {
+    if (!buffer) {
+        buffer = std::make_shared<ModelBuffer>();
+    }
+
+    ModelTensorRef tensor =
+        std::make_shared<ModelTensor>(data_type, buffer, shape, strides,
+                                      offsets, pads, exported, imported_rank);
+
+    result_tensors_.emplace_back(tensor);
+
+    verify();
+}
+
+ModelTensorRef Model::tensor(const Dims &shape, ModelDataType data_type,
+                             const Dims &strides, const Dims &offsets,
+                             const Dims &pads, bool exported, int imported_rank,
+                             const std::string &name) {
+    return impl_
+        ->create_op<ModelOpTensor>(name, nullptr, shape, data_type, strides,
+                                   offsets, pads, exported, imported_rank)
+        ->result_tensors()[0];
+}
+
+} // namespace ark
diff --git a/ark/ops/ops_tensor.hpp b/ark/ops/ops_tensor.hpp
new file mode 100644
index 000000000..d575430f6
--- /dev/null
+++ b/ark/ops/ops_tensor.hpp
@@ -0,0 +1,24 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
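+
+// Usage sketch (shapes/strides are illustrative; the defaulted trailing
+// arguments are assumptions): ModelOpTensor backs both fresh allocations and
+// views. Model::tensor() with the defaults allocates a new buffer, while
+// Model::refer() reuses an existing tensor's buffer to expose a strided view:
+//
+//   auto whole = m.tensor({4, 8}, ark::FP32);            // new buffer
+//   auto view = m.refer(whole, {4, 4}, {4, 8}, {0, 4});  // right half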
+
+#ifndef ARK_OPS_TENSOR_HPP_
+#define ARK_OPS_TENSOR_HPP_
+
+#include "ark/dims.hpp"
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+
+namespace ark {
+
+class ModelOpTensor : public ModelOp {
+   public:
+    ModelOpTensor() = default;
+    ModelOpTensor(ModelBufferRef buffer, const Dims &shape,
+                  ModelDataType data_type, const Dims &strides,
+                  const Dims &offsets, const Dims &pads, bool exported,
+                  int imported_rank);
+};
+
+} // namespace ark
+
+#endif // ARK_OPS_TENSOR_HPP_
diff --git a/ark/ops/ops_all_gather.cc b/ark/ops_old/ops_all_gather.cc
similarity index 89%
rename from ark/ops/ops_all_gather.cc
rename to ark/ops_old/ops_all_gather.cc
index dc5202258..ee49123e6 100644
--- a/ark/ops/ops_all_gather.cc
+++ b/ark/ops_old/ops_all_gather.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT license.
 
 #include
+#include
 
 #include "logging.h"
 #include "math_utils.h"
@@ -70,8 +71,13 @@ GatherFromPeersOp::GatherFromPeersOp(const std::string &prec_type,
                                      int rank, int npeers, size_t stride,
                                      const std::string &name)
     : Op(OP_GATHER_FROM_PEERS, prec_type, remote_bufs,
-         {trans_region_local, local_buf}, {{rank, npeers, sid, stride}}, name,
-         &GatherFromPeersConfigMap, -1, true) {}
+         {trans_region_local, local_buf},
+         std::map<std::string, ModelOpArg>(
+             {{"rank", ModelOpArg(rank)},
+              {"npeers", ModelOpArg(npeers)},
+              {"sid", ModelOpArg(sid)},
+              {"stride", ModelOpArg(stride)}}),
+         name, &GatherFromPeersConfigMap, -1, true) {}
 
 std::string GatherFromPeersOp::function_name(const OpConfig &cfg) const {
     Tensor *dst_buff = this->outputs[0];
@@ -80,7 +86,7 @@ std::string GatherFromPeersOp::function_name(const OpConfig &cfg) const {
     int rank;
     int npeers;
     size_t stride;
-    this->args.get(&rank, 0);
+    this->args.find("rank")->second.get_value(&rank);
     this->args.get(&npeers, 1);
     this->args.get(&stride, 3);
 
@@ -100,7 +106,8 @@ std::string GatherFromPeersOp::function_name(const OpConfig &cfg) const {
                               npeers, rank, stride}});
 }
 
-OpArgs GatherFromPeersOp::function_call_args(const OpConfig &) const {
+std::vector<ModelOpArg> GatherFromPeersOp::function_call_args(
+    const OpConfig &) const {
     int rank;
     int npeers;
     this->args.get(&rank, 0);
@@ -112,20 +119,20 @@ OpArgs GatherFromPeersOp::function_call_args(const OpConfig &) const {
 
     CHECK(local_buff->buf != nullptr);
 
-    OpArgs opargs;
+    std::vector<ModelOpArg> opargs;
     // gather_from_peers(dst_offset, src_offset...)
-    opargs.put((size_t)(local_buff->buf->get_buf_offset() +
-                        local_buff->offset_bytes()));
+    opargs.emplace_back((size_t)(local_buff->buf->get_buf_offset() +
+                                 local_buff->offset_bytes()));
     for (int i = 0; i < MAX_PEER_NUM; i++) {
         if (i < npeers) {
             CHECK(remote_bufs[i]->buf != nullptr);
-            opargs.put((size_t)(remote_bufs[i]->buf->get_buf_offset() +
-                                remote_bufs[i]->offset_bytes()));
+            opargs.emplace_back((size_t)(remote_bufs[i]->buf->get_buf_offset() +
+                                         remote_bufs[i]->offset_bytes()));
         } else {
-            opargs.put((size_t)0);
+            opargs.emplace_back((size_t)0);
         }
     }
-    opargs.put(local_buff);
+    opargs.emplace_back(local_buff);
 
     return opargs;
 }
diff --git a/ark/ops/ops_all_gather_test.cc b/ark/ops_old/ops_all_gather_test.cc
similarity index 100%
rename from ark/ops/ops_all_gather_test.cc
rename to ark/ops_old/ops_all_gather_test.cc
diff --git a/ark/ops/ops_all_reduce.cc b/ark/ops_old/ops_all_reduce.cc
similarity index 100%
rename from ark/ops/ops_all_reduce.cc
rename to ark/ops_old/ops_all_reduce.cc
diff --git a/ark/ops/ops_all_reduce_test.cc b/ark/ops_old/ops_all_reduce_test.cc
similarity index 100%
rename from ark/ops/ops_all_reduce_test.cc
rename to ark/ops_old/ops_all_reduce_test.cc
diff --git a/ark/ops/ops_arithmetic.cc b/ark/ops_old/ops_arithmetic.cc
similarity index 100%
rename from ark/ops/ops_arithmetic.cc
rename to ark/ops_old/ops_arithmetic.cc
diff --git a/ark/ops/ops_arithmetic_test.cc b/ark/ops_old/ops_arithmetic_test.cc
similarity index 100%
rename from ark/ops/ops_arithmetic_test.cc
rename to ark/ops_old/ops_arithmetic_test.cc
diff --git a/ark/ops/ops_cast.cc b/ark/ops_old/ops_cast.cc
similarity index 100%
rename from ark/ops/ops_cast.cc
rename to ark/ops_old/ops_cast.cc
diff --git a/ark/ops/ops_cast_test.cc b/ark/ops_old/ops_cast_test.cc
similarity index 100%
rename from ark/ops/ops_cast_test.cc
rename to ark/ops_old/ops_cast_test.cc
diff --git a/ark/ops/ops_common.cc b/ark/ops_old/ops_common.cc
similarity index 100%
rename from ark/ops/ops_common.cc
rename to ark/ops_old/ops_common.cc
diff --git a/ark/ops/ops_common.h b/ark/ops_old/ops_common.h
similarity index 100%
rename from ark/ops/ops_common.h
rename to ark/ops_old/ops_common.h
diff --git a/ark/ops/ops_common_test.cc b/ark/ops_old/ops_common_test.cc
similarity index 100%
rename from ark/ops/ops_common_test.cc
rename to ark/ops_old/ops_common_test.cc
diff --git a/ark/ops/ops_copy.cc b/ark/ops_old/ops_copy.cc
similarity index 100%
rename from ark/ops/ops_copy.cc
rename to ark/ops_old/ops_copy.cc
diff --git a/ark/ops/ops_copy_test.cc b/ark/ops_old/ops_copy_test.cc
similarity index 100%
rename from ark/ops/ops_copy_test.cc
rename to ark/ops_old/ops_copy_test.cc
diff --git a/ark/ops/ops_device_sync.cc b/ark/ops_old/ops_device_sync.cc
similarity index 71%
rename from ark/ops/ops_device_sync.cc
rename to ark/ops_old/ops_device_sync.cc
index 71d808593..2c693199f 100644
--- a/ark/ops/ops_device_sync.cc
+++ b/ark/ops_old/ops_device_sync.cc
@@ -11,16 +11,27 @@ extern const OpConfigMap DeviceSyncConfigMap;
 
 DeviceSyncOp::DeviceSyncOp(const std::string &prec_type, Tensor *input,
                            Tensor *output, int nranks, const std::string &name)
-    : Op{OP_DEVICE_SYNC, prec_type, {input}, {output}, {{nranks}}, name,
-         &DeviceSyncConfigMap, -1, true} {}
+    : Op{OP_DEVICE_SYNC,
+         prec_type,
+         {input},
+         {output},
+         {{"nranks", ModelOpArg(nranks)}},
+         name,
+         &DeviceSyncConfigMap,
+         -1,
+         true} {}
 
 std::string DeviceSyncOp::function_name(const OpConfig &) const {
     int nranks;
-    this->args.get(&nranks, 0);
+    this->args.find("nranks")->second.get_value(&nranks);
     return Op::function_name("ark::comm::device_sync", {{nranks}});
 }
 
-OpArgs DeviceSyncOp::function_call_args(const OpConfig &) const { return {}; }
+std::vector<ModelOpArg> DeviceSyncOp::function_call_args(
+    const OpConfig &) const {
+    return {};
+}
 
 Tensor *Model::device_sync(Tensor *input, int nranks,
                            const std::string &name) {
     DeviceSyncOp op{"none", input, input, nranks, name};
diff --git a/ark/ops/ops_embedding.cc b/ark/ops_old/ops_embedding.cc
similarity index 100%
rename from ark/ops/ops_embedding.cc
rename to ark/ops_old/ops_embedding.cc
diff --git a/ark/ops/ops_embedding_test.cc b/ark/ops_old/ops_embedding_test.cc
similarity index 100%
rename from ark/ops/ops_embedding_test.cc
rename to ark/ops_old/ops_embedding_test.cc
diff --git a/ark/ops/ops_identity.cc b/ark/ops_old/ops_identity.cc
similarity index 100%
rename from ark/ops/ops_identity.cc
rename to ark/ops_old/ops_identity.cc
diff --git a/ark/ops/ops_identity_test.cc b/ark/ops_old/ops_identity_test.cc
similarity index 100%
rename from ark/ops/ops_identity_test.cc
rename to ark/ops_old/ops_identity_test.cc
diff --git a/ark/ops/ops_im2col.cc b/ark/ops_old/ops_im2col.cc
similarity index 100%
rename from ark/ops/ops_im2col.cc
rename to ark/ops_old/ops_im2col.cc
diff --git a/ark/ops/ops_im2col_test.cc b/ark/ops_old/ops_im2col_test.cc
similarity index 100%
rename from ark/ops/ops_im2col_test.cc
rename to ark/ops_old/ops_im2col_test.cc
diff --git a/ark/ops/ops_layernorm.cc b/ark/ops_old/ops_layernorm.cc
similarity index 100%
rename from ark/ops/ops_layernorm.cc
rename to ark/ops_old/ops_layernorm.cc
diff --git a/ark/ops/ops_layernorm_test.cc b/ark/ops_old/ops_layernorm_test.cc
similarity index 100%
rename from ark/ops/ops_layernorm_test.cc
rename to ark/ops_old/ops_layernorm_test.cc
diff --git a/ark/ops/ops_math.cc b/ark/ops_old/ops_math.cc
similarity index 100%
rename from ark/ops/ops_math.cc
rename to ark/ops_old/ops_math.cc
diff --git a/ark/ops/ops_math_test.cc b/ark/ops_old/ops_math_test.cc
similarity index 100%
rename from ark/ops/ops_math_test.cc
rename to ark/ops_old/ops_math_test.cc
diff --git a/ark/ops/ops_matmul.cc b/ark/ops_old/ops_matmul.cc
similarity index 100%
rename from ark/ops/ops_matmul.cc
rename to ark/ops_old/ops_matmul.cc
diff --git a/ark/ops/ops_matmul_test.cu b/ark/ops_old/ops_matmul_test.cu
similarity index 100%
rename from ark/ops/ops_matmul_test.cu
rename to ark/ops_old/ops_matmul_test.cu
diff --git a/ark/ops/ops_max_pool.cc b/ark/ops_old/ops_max_pool.cc
similarity index 100%
rename from ark/ops/ops_max_pool.cc
rename to ark/ops_old/ops_max_pool.cc
diff --git a/ark/ops/ops_packet.cc b/ark/ops_old/ops_packet.cc
similarity index 100%
rename from ark/ops/ops_packet.cc
rename to ark/ops_old/ops_packet.cc
diff --git a/ark/ops/ops_reduce.cc b/ark/ops_old/ops_reduce.cc
similarity index 84%
rename from ark/ops/ops_reduce.cc
rename to ark/ops_old/ops_reduce.cc
index f75cf29e3..5be2e2ce3 100644
--- a/ark/ops/ops_reduce.cc
+++ b/ark/ops_old/ops_reduce.cc
@@ -57,18 +57,18 @@ std::string ReduceOp::function_name(const OpConfig &cfg,
         outshape.insert(axis, 1);
     }
 
-    Dims unit_out_dims{1, 1, tile_out.x, tile_out.y};
-    return Op::function_name("ark::reduce_" + type,
-                             {{
-                                 input->ldims.dims4(),  // InDims
-                                 input->shape.dims4(),  // InShape
-                                 outdims.dims4(),       // OutDims
-                                 outshape.dims4(),      // OutShape
-                                 unit_out_dims,         // UnitOutDims
-                                 cfg.num_warps,         // NumWarps
-                                 cfg.smem_bytes,        // SmemBytes
-                                 axis,                  // Axis
-                             }});
+    return Op::function_name(
+        "ark::reduce_" + type,
+        {{
+            {"InDims", input->ldims.dims4()},
+            {"InShape", input->shape.dims4()},
+ {"OutDims", outdims.dims4()}, + {"OutShape", outshape.dims4()}, + {"UnitOutDims", {1, 1, tile_out.x, tile_out.y}}, + {"NumWarps", cfg.num_warps}, + {"SmemBytes", cfg.smem_bytes}, + {"Axis", axis}, + }}); } extern const OpConfigMap ReduceWConfigMap; @@ -77,8 +77,14 @@ extern const OpConfigMap Broadcast1ConfigMap; ReduceWSumOp::ReduceWSumOp(const std::string &prec_type, Tensor *input, Tensor *output, int axis, bool keepdims, const std::string &name) - : ReduceOp{OP_REDUCE_W_SUM, prec_type, {input}, {output}, - {{axis, keepdims}}, name, &ReduceWConfigMap, -1} {} + : ReduceOp{OP_REDUCE_W_SUM, + prec_type, + {input}, + {output}, + {{{"axis", axis}, {"keepdims", keepdims}}}, + name, + &ReduceWConfigMap, + -1} {} std::string ReduceWSumOp::function_name(const OpConfig &cfg) const { return ReduceOp::function_name(cfg, "w_sum"); @@ -91,7 +97,7 @@ ReduceESumOp::ReduceESumOp(const std::string &prec_type, Tensor *input, prec_type, {input}, {output}, - {{axis, keepdims}}, + {{{"axis", axis}, {"keepdims", keepdims}}}, name, &Broadcast1ConfigMap, -1} {} @@ -103,8 +109,14 @@ std::string ReduceESumOp::function_name(const OpConfig &cfg) const { ReduceWMaxOp::ReduceWMaxOp(const std::string &prec_type, Tensor *input, Tensor *output, int axis, bool keepdims, const std::string &name) - : ReduceOp{OP_REDUCE_W_MAX, prec_type, {input}, {output}, - {{axis, keepdims}}, name, &ReduceWConfigMap, -1} {} + : ReduceOp{OP_REDUCE_W_MAX, + prec_type, + {input}, + {output}, + {{{"axis", axis}, {"keepdims", keepdims}}}, + name, + &ReduceWConfigMap, + -1} {} std::string ReduceWMaxOp::function_name(const OpConfig &cfg) const { return ReduceOp::function_name(cfg, "w_max"); @@ -117,7 +129,7 @@ ReduceEMaxOp::ReduceEMaxOp(const std::string &prec_type, Tensor *input, prec_type, {input}, {output}, - {{axis, keepdims}}, + {{{"axis", axis}, {"keepdims", keepdims}}}, name, &Broadcast1ConfigMap, -1} {} @@ -129,8 +141,14 @@ std::string ReduceEMaxOp::function_name(const OpConfig &cfg) const { ReduceWMeanOp::ReduceWMeanOp(const std::string &prec_type, Tensor *input, Tensor *output, int axis, bool keepdims, const std::string &name) - : ReduceOp{OP_REDUCE_W_MEAN, prec_type, {input}, {output}, - {{axis, keepdims}}, name, &ReduceWConfigMap, -1} {} + : ReduceOp{OP_REDUCE_W_MEAN, + prec_type, + {input}, + {output}, + {{{"axis", axis}, {"keepdims", keepdims}}}, + name, + &ReduceWConfigMap, + -1} {} std::string ReduceWMeanOp::function_name(const OpConfig &cfg) const { return ReduceOp::function_name(cfg, "w_mean"); @@ -143,7 +161,7 @@ ReduceEMeanOp::ReduceEMeanOp(const std::string &prec_type, Tensor *input, prec_type, {input}, {output}, - {{axis, keepdims}}, + {{{"axis", axis}, {"keepdims", keepdims}}}, name, &Broadcast1ConfigMap, -1} {} diff --git a/ark/ops/ops_reduce_scatter.cc b/ark/ops_old/ops_reduce_scatter.cc similarity index 90% rename from ark/ops/ops_reduce_scatter.cc rename to ark/ops_old/ops_reduce_scatter.cc index 59fa10145..a4c32d88d 100644 --- a/ark/ops/ops_reduce_scatter.cc +++ b/ark/ops_old/ops_reduce_scatter.cc @@ -18,7 +18,12 @@ ReadAndReduceOp::ReadAndReduceOp(const std::string &prec_type, int rank, int npeers, size_t offset, size_t bytes, const std::string &name) : Op(OP_READ_AND_REDUCE, prec_type, {local_buf}, - {cal_region_local, local_buf}, {{rank, npeers, sid, offset, bytes}}, + {cal_region_local, local_buf}, + {{{"rank", rank}, + {"npeers", npeers}, + {"sid", sid}, + {"offset", offset}, + {"bytes", bytes}}}, name, &ReadAndReduceConfigMap, -1, true) { this->inputs.insert(this->inputs.end(), remote_bufs.begin(), 
                         remote_bufs.end());
 }
 
@@ -32,10 +37,10 @@ std::string ReadAndReduceOp::function_name(const OpConfig &cfg) const {
     int peer_rank;
     size_t offset;
     size_t bytes;
-    this->args.get(&rank, 0);
-    this->args.get(&peer_rank, 1);
-    this->args.get(&offset, 3);
-    this->args.get(&bytes, 4);
+    this->args.get("rank", &rank);
+    this->args.get("npeers", &peer_rank);
+    this->args.get("offset", &offset);
+    this->args.get("bytes", &bytes);
 
     const OpTile &tile_out = cfg.output_tiles[0];
     size_t neles_per_tile = tile_out.x * tile_out.y > dst_buff->shape.size() ?
@@ -69,15 +74,16 @@ OpArgs ReadAndReduceOp::function_call_args(const OpConfig &) const {
 
     OpArgs opargs;
     // read_and_reduce(src_offset...)
     for (int i = 0; i < get_env().num_ranks_per_host - 1; i++) {
+        auto name = "src_offset_" + std::to_string(i);
         if (i < npeers) {
             CHECK(remote_bufs[i]->buf != nullptr);
-            opargs.put((size_t)(remote_bufs[i]->buf->get_buf_offset() +
-                                remote_bufs[i]->offset_bytes()));
+            opargs.put(name, size_t{remote_bufs[i]->buf->get_buf_offset() +
+                                    remote_bufs[i]->offset_bytes()});
         } else {
-            opargs.put((size_t)0);
+            opargs.put(name, size_t{0});
         }
     }
-    opargs.put(local_buff);
+    opargs.put("src", local_buff);
 
     return opargs;
 }
diff --git a/ark/ops/ops_reduce_scatter_test.cc b/ark/ops_old/ops_reduce_scatter_test.cc
similarity index 100%
rename from ark/ops/ops_reduce_scatter_test.cc
rename to ark/ops_old/ops_reduce_scatter_test.cc
diff --git a/ark/ops/ops_reduce_test.cc b/ark/ops_old/ops_reduce_test.cc
similarity index 100%
rename from ark/ops/ops_reduce_test.cc
rename to ark/ops_old/ops_reduce_test.cc
diff --git a/ark/ops/ops_reshape.cc b/ark/ops_old/ops_reshape.cc
similarity index 100%
rename from ark/ops/ops_reshape.cc
rename to ark/ops_old/ops_reshape.cc
diff --git a/ark/ops/ops_reshape_test.cc b/ark/ops_old/ops_reshape_test.cc
similarity index 100%
rename from ark/ops/ops_reshape_test.cc
rename to ark/ops_old/ops_reshape_test.cc
diff --git a/ark/ops/ops_rope.cc b/ark/ops_old/ops_rope.cc
similarity index 100%
rename from ark/ops/ops_rope.cc
rename to ark/ops_old/ops_rope.cc
diff --git a/ark/ops/ops_rope_test.cc b/ark/ops_old/ops_rope_test.cc
similarity index 100%
rename from ark/ops/ops_rope_test.cc
rename to ark/ops_old/ops_rope_test.cc
diff --git a/ark/ops/ops_scale.cc b/ark/ops_old/ops_scale.cc
similarity index 98%
rename from ark/ops/ops_scale.cc
rename to ark/ops_old/ops_scale.cc
index 4c3a3a956..6d2693ee7 100644
--- a/ark/ops/ops_scale.cc
+++ b/ark/ops_old/ops_scale.cc
@@ -16,7 +16,7 @@ ScaleOp::ScaleOp(const std::string &prec_type, Tensor *input, Tensor *output,
           prec_type,
           {input},
           {output},
-          {{val}},
+          {{{"factor", val}}},
           name,
           &Broadcast1ConfigMap,
           -1,
diff --git a/ark/ops/ops_scale_test.cc b/ark/ops_old/ops_scale_test.cc
similarity index 100%
rename from ark/ops/ops_scale_test.cc
rename to ark/ops_old/ops_scale_test.cc
diff --git a/ark/ops/ops_sendrecv.cc b/ark/ops_old/ops_sendrecv.cc
similarity index 100%
rename from ark/ops/ops_sendrecv.cc
rename to ark/ops_old/ops_sendrecv.cc
diff --git a/ark/ops/ops_sendrecv_test.cc b/ark/ops_old/ops_sendrecv_test.cc
similarity index 100%
rename from ark/ops/ops_sendrecv_test.cc
rename to ark/ops_old/ops_sendrecv_test.cc
diff --git a/ark/ops/ops_sharding.cc b/ark/ops_old/ops_sharding.cc
similarity index 100%
rename from ark/ops/ops_sharding.cc
rename to ark/ops_old/ops_sharding.cc
diff --git a/ark/ops/ops_tensor.cc b/ark/ops_old/ops_tensor.cc
similarity index 100%
rename from ark/ops/ops_tensor.cc
rename to ark/ops_old/ops_tensor.cc
diff --git a/ark/ops/ops_tensor_test.cc
b/ark/ops_old/ops_tensor_test.cc
similarity index 100%
rename from ark/ops/ops_tensor_test.cc
rename to ark/ops_old/ops_tensor_test.cc
diff --git a/ark/ops/ops_test_common.cc b/ark/ops_old/ops_test_common.cc
similarity index 100%
rename from ark/ops/ops_test_common.cc
rename to ark/ops_old/ops_test_common.cc
diff --git a/ark/ops/ops_test_common.h b/ark/ops_old/ops_test_common.h
similarity index 100%
rename from ark/ops/ops_test_common.h
rename to ark/ops_old/ops_test_common.h
diff --git a/ark/ops/ops_transpose.cc b/ark/ops_old/ops_transpose.cc
similarity index 100%
rename from ark/ops/ops_transpose.cc
rename to ark/ops_old/ops_transpose.cc
diff --git a/ark/ops/ops_transpose_test.cc b/ark/ops_old/ops_transpose_test.cc
similarity index 100%
rename from ark/ops/ops_transpose_test.cc
rename to ark/ops_old/ops_transpose_test.cc
diff --git a/ark/random.cc b/ark/random.cpp
similarity index 97%
rename from ark/random.cc
rename to ark/random.cpp
index ffa9d2f4a..c4282377f 100644
--- a/ark/random.cc
+++ b/ark/random.cpp
@@ -9,7 +9,7 @@
 #include
 #define gettid() syscall(SYS_gettid)
 
-#include "include/ark.h"
+#include "ark/random.hpp"
 
 namespace ark {
 
diff --git a/ark/random.h b/ark/random.h
index 6f237d57c..bfa7525d3 100644
--- a/ark/random.h
+++ b/ark/random.h
@@ -6,13 +6,6 @@
 
 namespace ark {
 
-/// Generate a random value.
-template <typename T>
-T rand(float min_val, float max_val) {
-    int mid = RAND_MAX / 2;
-    return T((ark::rand() - mid) / (float)mid * (max_val - min_val) + min_val);
-}
-
 /// Generate a random alpha-numeric string.
 /// @param len Length of the string
 /// @return A random alpha-numeric string
diff --git a/ark/range.h b/ark/range.hpp
similarity index 80%
rename from ark/range.h
rename to ark/range.hpp
index 8788d720a..95f085e49 100644
--- a/ark/range.h
+++ b/ark/range.hpp
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_RANGE_H_
-#define ARK_RANGE_H_
+#ifndef ARK_RANGE_HPP_
+#define ARK_RANGE_HPP_
 
 namespace ark {
 
@@ -51,12 +51,24 @@ class Range {
         T step_;
     };
 
+    bool operator==(const Range &other) const {
+        return begin_ == other.begin_ && end_ == other.end_ &&
+               step_ == other.step_;
+    }
+
+    // Lexicographic comparison; the component-wise `||` form used previously
+    // is not a strict weak ordering and would misbehave in ordered containers.
+    bool operator<(const Range &other) const {
+        if (begin_ != other.begin_) return begin_ < other.begin_;
+        if (end_ != other.end_) return end_ < other.end_;
+        return step_ < other.step_;
+    }
+
     Iterator begin() const { return Iterator(begin_, begin_, end_, step_); }
 
     Iterator end() const { return Iterator(end_, begin_, end_, step_); }
 
     T step() const { return step_; }
 
+    T size() const { return (end_ - begin_) / step_; }
+
   private:
     T begin_;
     T end_;
@@ -80,4 +92,4 @@ Range range(T begin, T end, T step)
 
 } // namespace ark
 
-#endif // ARK_RANGE_H_
+#endif // ARK_RANGE_HPP_
diff --git a/ark/range_test.cc b/ark/range_test.cpp
similarity index 96%
rename from ark/range_test.cc
rename to ark/range_test.cpp
index a170cc699..32dcb73db 100644
--- a/ark/range_test.cc
+++ b/ark/range_test.cpp
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
-#include "range.h" +#include "range.hpp" #include "unittest/unittest_utils.h" @@ -50,7 +50,6 @@ ark::unittest::State test_range() { } int main() { - ark::init(); UNITTEST(test_range); return 0; } diff --git a/ark/sched/sched.h b/ark/sched/sched.h index eeab05ae3..f4048a8bf 100644 --- a/ark/sched/sched.h +++ b/ark/sched/sched.h @@ -4,10 +4,9 @@ #ifndef ARK_SCHED_H_ #define ARK_SCHED_H_ -#include "include/ark.h" +#include "ark/schedule.hpp" #include "sched/sched_codegen.h" #include "sched/sched_stream.h" -#include "schedule/schedule.h" namespace ark { diff --git a/ark/sched/sched_codegen.cc b/ark/sched/sched_codegen.cc deleted file mode 100644 index 734ab8991..000000000 --- a/ark/sched/sched_codegen.cc +++ /dev/null @@ -1,423 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include "sched/sched_codegen.h" - -#include - -#include -#include -#include -#include - -#include "env.h" -#include "logging.h" -#include "math_utils.h" - -#define OP_PREFIX "op" -#define UNIT_OP_PREFIX "uop" -#define ALLOW_FOR_LOOP 1 - -namespace ark { - -CodeGenerator::CodeGenerator(const GpuManager::Info &gpu_info_, - int num_warps_per_sm_) - : gpu_info{gpu_info_}, - sm_num{gpu_info_.num_sm}, - num_warps_per_sm{num_warps_per_sm_}, - num_indent{0} {} - -size_t CodeGenerator::get_tensor_offset(const Tensor *tensor) const { - size_t off = tensor->buf->get_buf_offset(); - assert(off % 8 == 0); - return off + tensor->offset_bytes(); -} - -std::ostream &CodeGenerator::def_remote_buf(std::ostream &os, - int remote_rank) const { - os << "__device__ char *" ARK_BUF_NAME << remote_rank << ";\n"; - return os; -} - -std::ostream &CodeGenerator::sync_gpu(std::ostream &os) const { - os << "ark::sync_gpu<" << this->sm_num << ">(" ARK_LSS_NAME ");\n"; - return os; -} - -std::ostream &CodeGenerator::def_sync_stream(std::ostream &os, - int stream_id) const { - os << "__device__ ark::sync::State " ARK_LSS_NAME "_" << stream_id << ";\n"; - return os; -} - -std::ostream &CodeGenerator::sync_stream(std::ostream &os, int stream_id, - int sm_id_begin, int sm_id_end) const { - if (sm_id_begin >= sm_id_end) { - ERR(SchedulerError, "invalid SM range"); - } - if (sm_id_begin == 0) { - os << "if (blockIdx.x < " << sm_id_end << ") {"; - } else if (sm_id_begin + 1 == sm_id_end) { - os << "if (blockIdx.x == " << sm_id_begin << ") {"; - } else { - os << "if (blockIdx.x >= " << sm_id_begin << " && blockIdx.x < " - << sm_id_end << ") {"; - } - os << " ark::sync_gpu<" << sm_id_end - sm_id_begin << ">(" ARK_LSS_NAME "_" - << stream_id << "); }\n"; - return os; -} - -std::ostream &CodeGenerator::tensor(std::ostream &os, - const Tensor *tensor) const { - size_t off = this->get_tensor_offset(tensor); - os << "(" << tensor->type.type_str() << " *)"; - std::string buf_name; - if (tensor->imported_rank >= 0) { - buf_name = ARK_BUF_NAME + std::to_string(tensor->imported_rank); - } else { - buf_name = "_buf"; - } - os << "&" << buf_name << "[" << off << "]"; - return os; -} - -std::ostream &CodeGenerator::def_oparg(std::ostream &os, const OpArg &arg, - const std::string &name) const { - if (arg.type == OP_ARG_TENSOR) { - Tensor *tns; - arg.get(&tns); - os << tns->type.type_str() << " *" << name; - } else if (arg.type == OP_ARG_FLOAT) { - os << "float " << name; - } else if (arg.type == OP_ARG_INT) { - os << "int " << name; - } else if (arg.type == OP_ARG_BOOL) { - os << "bool " << name; - } else if (arg.type == OP_ARG_INT64) { - os << "long long int " << name; - } else if (arg.type == OP_ARG_UINT64) { - os << "uint64_t " 
<< name; - } else { - ERR(SchedulerError, "Not implemented"); - } - return os; -} - -std::ostream &CodeGenerator::oparg(std::ostream &os, const OpArg &arg) const { - if (arg.type == OP_ARG_TENSOR) { - Tensor *tns; - arg.get(&tns); - this->tensor(os, tns); - } else if (arg.type == OP_ARG_FLOAT) { - float val; - arg.get(&val); - os << val; - } else if (arg.type == OP_ARG_INT) { - int val; - arg.get(&val); - os << val; - } else if (arg.type == OP_ARG_BOOL) { - bool val; - arg.get(&val); - os << val; - } else if (arg.type == OP_ARG_INT64) { - long long int val; - arg.get(&val); - os << val; - } else if (arg.type == OP_ARG_UINT64) { - uint64_t val; - arg.get(&val); - os << val; - } else { - ERR(SchedulerError, "Not implemented"); - } - return os; -} - -std::ostream &CodeGenerator::branch(std::ostream &os, const Branch &br, - int prev_sm_id_end) const { - if (br.warp_branches.empty()) { - return os; - } - if (prev_sm_id_end < 0) { - prev_sm_id_end = this->sm_num; - } - if (br.sm_id_begin == 0) { - if (br.sm_id_end == this->sm_num) { - os << "\n { // for all SMs"; - } else { - os << "\n if (blockIdx.x < " << br.sm_id_end << ") {"; - } - } else if (br.sm_id_begin == prev_sm_id_end) { - if (br.sm_id_end == this->sm_num) { - os << " else {"; - } else { - os << " else if (blockIdx.x < " << br.sm_id_end << ") {"; - } - } else if (br.sm_id_begin < prev_sm_id_end) { - if (br.sm_id_begin == br.sm_id_end) { - os << "\n if (blockIdx.x == " << br.sm_id_begin << ") {"; - } else { - os << "\n if (blockIdx.x >= " << br.sm_id_begin - << " && blockIdx.x < " << br.sm_id_end << ") {"; - } - } else { - if (br.sm_id_begin == br.sm_id_end) { - os << " else if (blockIdx.x == " << br.sm_id_begin << ") {"; - } else { - os << " else if (blockIdx.x >= " << br.sm_id_begin - << " && blockIdx.x < " << br.sm_id_end << ") {"; - } - } - - int tpw = this->gpu_info.threads_per_warp; - - for (auto &warp_branch : br.warp_branches) { - if (warp_branch.branch_ops.empty()) continue; - int thread_begin = warp_branch.warp_id_begin * tpw; - int thread_end = warp_branch.warp_id_end * tpw; - if (warp_branch.warp_id_begin == 0) { - if (warp_branch.warp_id_end == this->num_warps_per_sm) { - os << "\n { // for all threads\n"; - } else { - os << "\n if (threadIdx.x < " << thread_end << ") {\n"; - } - } else { - os << "\n if (threadIdx.x >= " << thread_begin - << " && threadIdx.x < " << thread_end << ") {\n"; - } - - int num_warps = warp_branch.warp_id_end - warp_branch.warp_id_begin; - - auto get_indexing = [&](int num_warps_per_uop) -> std::string { - int num_uops = num_warps / num_warps_per_uop; - int num_threads_per_uop = num_warps_per_uop * tpw; - std::stringstream thread_indexing; - if (thread_end - thread_begin > num_threads_per_uop) { - if (thread_begin > 0) { - thread_indexing << "((threadIdx.x - " << thread_begin - << ")"; - } else { - thread_indexing << "(threadIdx.x"; - } - if (math::is_pow2(num_threads_per_uop)) { - thread_indexing << " >> " - << math::ilog2(num_threads_per_uop) << ")"; - } else { - thread_indexing << " / " << num_threads_per_uop << ")"; - } - } - auto thread_indexing_str = thread_indexing.str(); - - std::stringstream sm_indexing; - if (br.sm_id_end - br.sm_id_begin > 1) { - if (br.sm_id_begin > 0) { - sm_indexing << "((blockIdx.x - " << br.sm_id_begin << ")"; - } else { - sm_indexing << "(blockIdx.x"; - } - if (num_uops > 1) { - sm_indexing << " * " << num_uops; - } - sm_indexing << ")"; - } - auto sm_indexing_str = sm_indexing.str(); - - std::string indexing; - if (thread_indexing_str.empty()) { - indexing = 
sm_indexing_str; - } else if (sm_indexing_str.empty()) { - indexing = thread_indexing_str; - } else { - indexing = - "(" + sm_indexing_str + " + " + thread_indexing_str + ")"; - } - return indexing; - }; - - auto uop_code = [&](int opseq_id, int uop_id_diff, - int num_warps_per_uop, - const std::string &uop_id_begin) -> std::string { - // num_uops = (warp_id_end - warp_id_begin) / num_warps_per_uop; - // warp_idx = warp_id - warp_id_begin; - // sm_idx = sm_id - sm_id_begin; - // uop = uop_id_diff * (warp_idx / num_warps_per_uop + - // num_uops * sm_idx) + uop_id_begin; - std::stringstream ss; - ss << OP_PREFIX << opseq_id << "(_buf, "; - if (uop_id_diff != 0) { - auto indexing = get_indexing(num_warps_per_uop); - if (!indexing.empty()) { - if (uop_id_diff != 1) { - ss << uop_id_diff << " * "; - } - ss << indexing << " + "; - } - } - ss << uop_id_begin << ", " << br.smem_bytes_per_warp << ");"; - return ss.str(); - }; - - if (ALLOW_FOR_LOOP == 0 || warp_branch.branch_ops.size() < 3) { - for (auto &branch_op : warp_branch.branch_ops) { - os << " " - << uop_code(branch_op.opseq_id, branch_op.uop_id_diff, - branch_op.num_warps_per_uop, - std::to_string(branch_op.uop_id_begin)) - << "\n"; - } - } else { - size_t idx = 0; - while (idx < warp_branch.branch_ops.size() - 1) { - int opseq_id = warp_branch.branch_ops[idx].opseq_id; - int num_warps_per_uop = - warp_branch.branch_ops[idx].num_warps_per_uop; - int uop_id_diff = warp_branch.branch_ops[idx].uop_id_diff; - int uop_id_begin = warp_branch.branch_ops[idx].uop_id_begin; - int uop_id_begin_diff = - warp_branch.branch_ops[idx + 1].uop_id_begin - - warp_branch.branch_ops[idx].uop_id_begin; - size_t idx2 = idx + 1; - for (; idx2 < warp_branch.branch_ops.size(); ++idx2) { - auto &branch_op = warp_branch.branch_ops[idx2]; - if (branch_op.opseq_id != opseq_id || - branch_op.num_warps_per_uop != num_warps_per_uop || - branch_op.uop_id_diff != uop_id_diff || - branch_op.uop_id_begin != - (int)(uop_id_begin + - uop_id_begin_diff * (idx2 - idx))) { - break; - } - } - if (idx2 - idx > 2) { - os << " for (int _i = " << uop_id_begin << "; _i < " - << uop_id_begin + (idx2 - idx) * uop_id_begin_diff - << "; _i += " << uop_id_begin_diff << ") { " - << uop_code(opseq_id, uop_id_diff, num_warps_per_uop, - "_i") - << " }\n"; - idx = idx2; - } else { - os << " " - << uop_code(opseq_id, uop_id_diff, num_warps_per_uop, - std::to_string(uop_id_begin)) - << "\n"; - ++idx; - } - } - if (idx < warp_branch.branch_ops.size()) { - auto &branch_op = warp_branch.branch_ops[idx]; - os << " " - << uop_code(branch_op.opseq_id, branch_op.uop_id_diff, - branch_op.num_warps_per_uop, - std::to_string(branch_op.uop_id_begin)) - << "\n"; - } - } - os << " }\n"; - } - os << " }\n"; - return os; -} - -std::ostream &CodeGenerator::def_uop(std::ostream &os, const SchedOp &sop, - int uop_id) const { - std::string uop_name = UNIT_OP_PREFIX + std::to_string(uop_id); - std::string func_name = sop.function_name(); - assert(!func_name.empty()); - - const Op *op = sop.get_op(); - if (op->force_inline) { - os << "DEVICE "; - } else { - os << "__noinline__ __device__ "; - } - os << "void " << uop_name << "("; - - OpArgs call_args = op->function_call_args(*sop.get_cfg()); - int cnt_param = 0; - for (const OpArg &arg : call_args.get_args()) { - this->def_oparg(os, arg, "_" + std::to_string(cnt_param)) << ", "; - ++cnt_param; - } - - os << "int _uop_idx, int _smem_per_warp) {\n"; - os << " " << func_name << "("; - - for (int i = 0; i < cnt_param; ++i) { - os << '_' << i << ", "; - } - os << 
"_uop_idx, _smem_per_warp);\n}\n"; - return os; -} - -std::ostream &CodeGenerator::uop(std::ostream &os, int uop_id) const { - os << UNIT_OP_PREFIX << uop_id; - return os; -} - -// -std::ostream &CodeGenerator::opseq(std::ostream &os, const std::string &name, - const SchedOpSeq &opseq, - std::map &uop_map) const { - auto &sched_ops = opseq.get_sched_ops(); - unsigned int idx = sched_ops.size(); - for (auto &sop : sched_ops) { - if (sop.is_virtual()) { - continue; - } - if (idx == sched_ops.size()) { - os << "// tile dims: (" << opseq.get_tdims()[0] << ", " - << opseq.get_tdims()[1] << ", " << opseq.get_tdims()[2] << ")\n" - << "__noinline__ __device__ void " << name - << "(char *_buf, int _uop_idx, int _smem_per_warp) {\n"; - } - --idx; - os << " "; - auto uop_map_it = uop_map.find(sop.serialize()); - if (uop_map_it != uop_map.end()) { - this->uop(os, uop_map_it->second); - } else { - os << sop.function_name(); - } - os << '('; - - OpArgs call_args = sop.get_op()->function_call_args(*sop.get_cfg()); - for (const OpArg &arg : call_args.get_args()) { - this->oparg(os, arg) << ", "; - } - - os << "_uop_idx, _smem_per_warp);\n"; - } - if (idx != sched_ops.size()) { - os << "}\n"; - } - return os; -} - -std::ostream &CodeGenerator::def_proxy_channels(std::ostream &os, - size_t num_channels) const { - if (num_channels == 0) { - return os; - } - os << "#include \n" - "__constant__ mscclpp::SimpleProxyChannelDeviceHandle " - "_ARK_PROXY_CHANS[" - << num_channels << "];\n"; - return os; -} - -std::ostream &CodeGenerator::def_sm_channels(std::ostream &os, - size_t num_channels) const { - if (num_channels == 0) { - return os; - } - os << "#include \n" - "__constant__ mscclpp::SmChannelDeviceHandle " - "_ARK_SM_CHANS[" - << num_channels << "];\n"; - return os; -} - -} // namespace ark diff --git a/ark/sched/sched_codegen.h b/ark/sched/sched_codegen.h deleted file mode 100644 index 91577e672..000000000 --- a/ark/sched/sched_codegen.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#ifndef ARK_SCHED_CODEGEN_H_ -#define ARK_SCHED_CODEGEN_H_ - -#include <map> - -#include "gpu/gpu_loop_kernel.h" -#include "sched/sched_op.h" -#include "sched/sched_opseq.h" -#include "sched_branch.h" - -namespace ark { - -class CodeGenerator { - public: - CodeGenerator(const GpuManager::Info &gpu_info_, int num_warps_per_sm_); - - std::ostream &def_remote_buf(std::ostream &os, int remote_rank) const; - - std::ostream &sync_gpu(std::ostream &os) const; - - std::ostream &def_sync_stream(std::ostream &os, int stream_id) const; - std::ostream &sync_stream(std::ostream &os, int stream_id, int sm_id_begin, - int sm_id_end) const; - - std::ostream &tensor(std::ostream &os, const Tensor *tensor) const; - - std::ostream &def_oparg(std::ostream &os, const OpArg &arg, - const std::string &name) const; - std::ostream &oparg(std::ostream &os, const OpArg &arg) const; - - std::ostream &branch(std::ostream &os, const Branch &branch, - int prev_sm_id_end = -1) const; - - std::ostream &def_uop(std::ostream &os, const SchedOp &sop, - int uop_id) const; - - std::ostream &uop(std::ostream &os, int uop_id) const; - - std::ostream &opseq(std::ostream &os, const std::string &name, - const SchedOpSeq &opseq, - std::map<std::string, int> &uop_map) const; - - std::ostream &def_proxy_channels(std::ostream &os, - size_t num_channels) const; - - std::ostream &def_sm_channels(std::ostream &os, size_t num_channels) const; - - protected: - size_t get_tensor_offset(const Tensor *tensor) const; - - const GpuManager::Info &gpu_info; - int sm_num; - int num_warps_per_sm; - int world_size; - int num_indent; -}; - -} // namespace ark - -#endif // ARK_SCHED_CODEGEN_H_ diff --git a/ark/sched/sched_op.cc b/ark/sched/sched_op.cc index a6c4603e9..151961c2d 100644 --- a/ark/sched/sched_op.cc +++ b/ark/sched/sched_op.cc @@ -38,7 +38,8 @@ const string SchedOp::serialize() const { ss << this->function_name() << ","; OpArgs call_args = this->get_op()->function_call_args(*(this->get_cfg())); - for (const OpArg &arg : call_args.get_args()) { + for (const auto &p : call_args.get_args()) { + const OpArg &arg = p.second; if (arg.type == OP_ARG_TENSOR) { Tensor *tns; arg.get(&tns); diff --git a/ark/sched/sched_op.h b/ark/sched/sched_op.h index 196df0fc2..19eb5dff7 100644 --- a/ark/sched/sched_op.h +++ b/ark/sched/sched_op.h @@ -4,9 +4,6 @@ #ifndef ARK_SCHED_OP_H_ #define ARK_SCHED_OP_H_ -#include "include/ark.h" -#include "ops/ops_common.h" - namespace ark { class SchedOp { diff --git a/ark/schedule/schedule.cc b/ark/schedule/schedule.cpp similarity index 98% rename from ark/schedule/schedule.cc rename to ark/schedule/schedule.cpp index 1f5febc02..9ae7cd5aa 100644 --- a/ark/schedule/schedule.cc +++ b/ark/schedule/schedule.cpp @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "schedule.h" +#include "ark/schedule.hpp" #include "nlohmann/json.hpp" diff --git a/ark/unique_list.hpp b/ark/unique_list.hpp new file mode 100644 index 000000000..6a08822a5 --- /dev/null +++ b/ark/unique_list.hpp @@ -0,0 +1,114 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license.
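Aside (not part of the diff): the sched_op.cc hunk above implies that OpArgs::get_args() now yields (name, OpArg) pairs rather than a flat list of OpArg; the exact container type is not visible in this diff. A minimal self-contained sketch of that iteration shape, using a hypothetical stub in place of the real OpArg:

#include <iostream>
#include <map>
#include <string>

// Stand-in for OpArg, only to show the iteration shape; the real type
// lives in ops/ops_common.h and is not reproduced here.
struct OpArgStub { int type = 0; };

int main() {
    // Assumed: what get_args() returns after this change.
    std::map<std::string, OpArgStub> args = {{"input", {}}, {"output", {}}};
    for (const auto &p : args) {
        const OpArgStub &arg = p.second;  // mirrors `const OpArg &arg = p.second;` in the hunk
        std::cout << p.first << " type=" << arg.type << "\n";
    }
    return 0;
}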
+ +#ifndef ARK_UNIQUE_LIST_HPP_ +#define ARK_UNIQUE_LIST_HPP_ + +#include <iterator> +#include <list> +#include <map> +#include <vector> + +namespace ark { + +template <typename T> +class UniqueList { + private: + std::list<T> list_; + std::map<T, typename std::list<T>::iterator> index_; + + public: + UniqueList() = default; + + explicit UniqueList(const std::vector<T> &vec) { + for (const auto &value : vec) { + push_back(value); + } + } + + UniqueList(const UniqueList &other) = default; + + UniqueList(UniqueList &&other) = default; + + UniqueList &operator=(const UniqueList &other) = default; + + UniqueList &operator=(UniqueList &&other) = default; + + const T &front() const { return list_.front(); } + + const T &back() const { return list_.back(); } + + const T &operator[](size_t idx) const { + auto it = list_.begin(); + std::advance(it, idx); + return *it; + } + + void push_back(const T &value) { + auto it = index_.find(value); + if (it == index_.end()) { + list_.push_back(value); + index_[value] = --list_.end(); + } + } + + void erase(const T &value) { + auto it = index_.find(value); + if (it != index_.end()) { + list_.erase(it->second); + index_.erase(it); + } + } + + void erase(typename std::list<T>::iterator it) { + index_.erase(*it); + list_.erase(it); + } + + void clear() { + list_.clear(); + index_.clear(); + } + + size_t index(const T &value) const { + auto it = index_.find(value); + return (it == index_.end()) + ? -1 + : std::distance( + list_.begin(), + static_cast<typename std::list<T>::const_iterator>( + it->second)); + } + + typename std::list<T>::iterator begin() { return list_.begin(); } + + typename std::list<T>::const_iterator begin() const { + return list_.begin(); + } + + typename std::list<T>::iterator end() { return list_.end(); } + + typename std::list<T>::const_iterator end() const { return list_.end(); } + + typename std::list<T>::iterator find(const T &value) { + auto it = index_.find(value); + return (it == index_.end()) ? end() : it->second; + } + + typename std::list<T>::const_iterator find(const T &value) const { + auto it = index_.find(value); + return (it == index_.end()) ? end() : it->second; + } + + bool empty() const { return list_.empty(); } + + bool contains(const T &value) const { + return index_.find(value) != index_.end(); + } + + size_t size() const { return index_.size(); } +}; + +} // namespace ark + +#endif // ARK_UNIQUE_LIST_HPP_ diff --git a/ark/unique_list_test.cpp b/ark/unique_list_test.cpp new file mode 100644 index 000000000..6df6d478e --- /dev/null +++ b/ark/unique_list_test.cpp @@ -0,0 +1,80 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license.
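Aside (not part of the diff): a short usage sketch of the new UniqueList, complementing the unit test that follows. push_back() drops duplicates while preserving insertion order, index() reports a value's position in that order (or size_t(-1) when absent, since it returns size_t), and erase() keeps the remaining order intact. All values here are illustrative.

#include <cassert>

#include "unique_list.hpp"

int main() {
    ark::UniqueList<int> ids;
    ids.push_back(7);
    ids.push_back(3);
    ids.push_back(7);           // duplicate: ignored, order unchanged
    assert(ids.size() == 2);
    assert(ids.contains(3));
    assert(ids.index(3) == 1);  // position in insertion order
    ids.erase(7);
    assert(ids[0] == 3);        // order of survivors is preserved
    return 0;
}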
+ +#include "unique_list.hpp" + +#include "unittest/unittest_utils.h" + +ark::unittest::State test_unique_list() { + ark::UniqueList list; + list.push_back(1); + list.push_back(2); + list.push_back(3); + list.push_back(1); + list.push_back(2); + list.push_back(3); + UNITTEST_EQ(list.size(), 3); + UNITTEST_EQ(list[0], 1); + UNITTEST_EQ(list[1], 2); + UNITTEST_EQ(list[2], 3); + + list.clear(); + UNITTEST_EQ(list.size(), 0); + + list.push_back(1); + list.push_back(2); + list.push_back(3); + list.push_back(1); + list.push_back(2); + list.push_back(3); + list.push_back(4); + UNITTEST_EQ(list.size(), 4); + UNITTEST_EQ(list[0], 1); + UNITTEST_EQ(list[1], 2); + UNITTEST_EQ(list[2], 3); + UNITTEST_EQ(list[3], 4); + + list.clear(); + UNITTEST_EQ(list.size(), 0); + + list.push_back(1); + list.push_back(2); + list.push_back(3); + + list.erase(1); + UNITTEST_EQ(list.size(), 2); + UNITTEST_EQ(list[0], 2); + UNITTEST_EQ(list[1], 3); + + list.clear(); + UNITTEST_EQ(list.size(), 0); + + list.push_back(1); + list.push_back(2); + list.push_back(3); + + list.erase(0); + UNITTEST_EQ(list.size(), 3); + UNITTEST_EQ(list[0], 1); + UNITTEST_EQ(list[1], 2); + UNITTEST_EQ(list[2], 3); + + list.clear(); + UNITTEST_EQ(list.size(), 0); + + list.push_back(1); + list.push_back(2); + list.push_back(3); + + list.erase(2); + UNITTEST_EQ(list.size(), 2); + UNITTEST_EQ(list[0], 1); + UNITTEST_EQ(list[1], 3); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_unique_list); + return 0; +} diff --git a/ark/unittest/unittest_utils.cc b/ark/unittest/unittest_utils.cpp similarity index 76% rename from ark/unittest/unittest_utils.cc rename to ark/unittest/unittest_utils.cpp index 56313647e..c3aa45056 100644 --- a/ark/unittest/unittest_utils.cc +++ b/ark/unittest/unittest_utils.cpp @@ -13,8 +13,6 @@ #include "file_io.h" #include "logging.h" -using namespace std; - // Grep SIGALRM and exit. static void sigalrm_timeout_handler(int) { signal(SIGALRM, SIG_IGN); @@ -26,8 +24,8 @@ namespace unittest { // Temporal unittest states. struct TempStates { - vector pids; - vector threads; + std::vector pids; + std::vector threads; }; TempStates GLOBAL_TEMP_STATES_; @@ -45,15 +43,15 @@ Timeout::~Timeout() { } // Spawn a thread that runs the given function. -thread *spawn_thread(function func) { - thread *t = new thread(func); +std::thread *spawn_thread(std::function func) { + std::thread *t = new std::thread(func); GLOBAL_TEMP_STATES_.threads.emplace_back(t); return t; } // Wait for all threads to finish. void wait_all_threads() { - for (thread *t : GLOBAL_TEMP_STATES_.threads) { + for (std::thread *t : GLOBAL_TEMP_STATES_.threads) { if (t->joinable()) { t->join(); } @@ -63,7 +61,7 @@ void wait_all_threads() { } // Spawn a process that runs the given function. -int spawn_process(function func) { +int spawn_process(std::function func) { pid_t pid = fork(); if (pid < 0) { UNITTEST_UEXIT("fork() failed"); @@ -89,19 +87,19 @@ void wait_all_processes() { } while (!WIFEXITED(status)); status = WEXITSTATUS(status); if (status != State::SUCCESS) { - UNITTEST_EXIT((State)status, "process " + to_string(pid)); + UNITTEST_EXIT((State)status, "process " + std::to_string(pid)); } } GLOBAL_TEMP_STATES_.pids.clear(); } // Run the given test function. 
-State test(function<State()> test_func) { return test_func(); } +State test(std::function<State()> test_func) { return test_func(); } // -string get_kernel_code(const string &name) { - return ark::read_file(ark::get_dir(string{__FILE__}) + "/../ops/kernels/" + - name + ".h"); +std::string get_kernel_code(const std::string &name) { + return ark::read_file(ark::get_dir(std::string{__FILE__}) + + "/../ops/kernels/" + name + ".h"); } } // namespace unittest diff --git a/ark/unittest/unittest_utils.h b/ark/unittest/unittest_utils.h index 44680f532..2b433f7fe 100644 --- a/ark/unittest/unittest_utils.h +++ b/ark/unittest/unittest_utils.h @@ -11,8 +11,8 @@ #include #include +#include "ark/init.hpp" #include "cpu_timer.h" -#include "include/ark.h" #include "logging.h" namespace ark { @@ -48,6 +48,7 @@ std::string get_kernel_code(const std::string &name); // Run the given test function. #define UNITTEST(test_func) \ do { \ + ark::init(); \ LOG(ark::INFO, "unittest start: " #test_func); \ double _s = ark::cpu_timer(); \ ark::unittest::State _ret; \ diff --git a/ark/version.cpp b/ark/version.cpp new file mode 100644 index 000000000..f7556d8c4 --- /dev/null +++ b/ark/version.cpp @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/version.hpp" + +#include <sstream> +#include <string> + +namespace ark { + +std::string version() { + std::stringstream ss; + ss << ARK_MAJOR << "." << ARK_MINOR << "." << ARK_PATCH; + return ss.str(); +} + +} // namespace ark diff --git a/ark/version_test.cpp b/ark/version_test.cpp new file mode 100644 index 000000000..b4a5cb825 --- /dev/null +++ b/ark/version_test.cpp @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/version.hpp" + +#include "unittest/unittest_utils.h" + +ark::unittest::State test_version() { + auto version = ark::version(); + + // Check if the version string is in the correct format.
+ auto dot1 = version.find('.'); + auto dot2 = version.find('.', dot1 + 1); + UNITTEST_NE(dot1, std::string::npos); + UNITTEST_NE(dot2, std::string::npos); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_version); + return 0; +} diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index 22232b1b0..3ce14e8ec 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -428,23 +428,37 @@ def test_transformer_block( low=-1, high=1, size=(batch_size, seq_len, args.dim) ).astype(dtype) - test_module( - module_class_ark=model_ark.TransformerBlock, - module_args_ark=[ - 0, - args, - ark.DataType.from_numpy(dtype), - rank, - world_size, - ], - inputs_ark=[feature, 0, freqs_cis_ark, None], - module_class_pt=model_pt.TransformerBlock, - module_args_pt=[0, args], - inputs_pt=[feature.astype(dtype), 0, freqs_cis, None], - module_name_prefix="layers.0", - rank=rank, - world_size=world_size, - ) + module = model_ark.Attention(args, ark.DataType.from_numpy(dtype), rank, world_size) + # module_inputs = [ + # ark.tensor(list(i.shape), ark.DataType.from_numpy(i.dtype)) + # if isinstance(i, np.ndarray) + # else i + # for i in inputs + # ] + feature_tensor = ark.tensor(list(feature.shape), ark.DataType.from_numpy(feature.dtype)) + freqs_cis_ark_tensor = ark.tensor(list(freqs_cis_ark.shape), ark.DataType.from_numpy(freqs_cis_ark.dtype)) + output = module(feature_tensor, 0, freqs_cis_ark_tensor, None) + + ark.Model.get_model().create_nodes() + print(ark.Model.get_model().serialize()) + + # test_module( + # module_class_ark=model_ark.TransformerBlock, + # module_args_ark=[ + # 0, + # args, + # ark.DataType.from_numpy(dtype), + # rank, + # world_size, + # ], + # inputs_ark=[feature, 0, freqs_cis_ark, None], + # module_class_pt=model_pt.TransformerBlock, + # module_args_pt=[0, args], + # inputs_pt=[feature.astype(dtype), 0, freqs_cis, None], + # module_name_prefix="layers.0", + # rank=rank, + # world_size=world_size, + # ) def test_transformer( @@ -514,8 +528,8 @@ def test(args, batch_size, seq_len, dtype, rank, world_size): # test_row_parallel_linear(args, batch_size, seq_len, dtype, rank, world_size) # test_column_parallel_linear(args, batch_size, seq_len, dtype, rank, world_size) # test_attention(args, batch_size, seq_len, dtype, rank, world_size) - # test_transformer_block(args, batch_size, seq_len, dtype, rank, world_size) - test_transformer(args, batch_size, seq_len, dtype, rank, world_size) + test_transformer_block(args, batch_size, seq_len, dtype, rank, world_size) + # test_transformer(args, batch_size, seq_len, dtype, rank, world_size) def worker( diff --git a/examples/tutorial/quickstart_tutorial.py b/examples/tutorial/quickstart_tutorial.py index da1894702..981435780 100644 --- a/examples/tutorial/quickstart_tutorial.py +++ b/examples/tutorial/quickstart_tutorial.py @@ -11,9 +11,9 @@ def quickstart_tutorial(): M, N = 64, 64 # Create an input tensor - input_tensor = ark.tensor([M, N], ark.fp32) + input_tensor = ark.tensor([M, N], ark.fp16) # Create another tensor - other_tensor = ark.tensor([M, N], ark.fp32) + other_tensor = ark.tensor([M, N], ark.fp16) # Add the two tensors output_tensor = ark.add(input_tensor, other_tensor) @@ -25,9 +25,9 @@ def quickstart_tutorial(): runtime.launch() # Initialize the input and other tensor with random values - input_tensor_host = np.random.rand(M, N).astype(np.float32) + input_tensor_host = np.random.rand(M, N).astype(np.float16) input_tensor.from_numpy(input_tensor_host) - other_tensor_host = 
np.random.rand(M, N).astype(np.float32) + other_tensor_host = np.random.rand(M, N).astype(np.float16) other_tensor.from_numpy(other_tensor_host) # Run the ARK program diff --git a/examples/tutorial/sched.json b/examples/tutorial/sched.json new file mode 100644 index 000000000..add17313d --- /dev/null +++ b/examples/tutorial/sched.json @@ -0,0 +1,60 @@ +{ + "NumProcessors": 108, + "NumWarpsPerProcessor": 16, + "ProcessorGroups": [ + { + "ProcessorRange": { + "Begin": 0, + "End": 107 + }, + "ResourceGroups": [ + { + "ProcessorRange": { + "Begin": 0, + "End": 64 + }, + "SramRange": { + "Begin": 0, + "End": 0 + }, + "TaskGroups": [ + { + "TaskId": 0, + "TaskRange": { + "Begin": 0, + "End": 107 + }, + "TaskStride": 1 + } + ], + "WarpRange": { + "Begin": 0, + "End": 1 + } + } + ] + }, + { + "ProcessorRange": { + "Begin": 107, + "End": 108 + }, + "ResourceGroups": null + }, + { + "ProcessorRange": { + "Begin": 0, + "End": 108 + }, + "ResourceGroups": null + } + ], + "TaskInfos": [ + { + "Detail": "ark::add<ark::Vec<1, 1, 64, 64>, ark::Vec<1, 1, 64, 64>, ark::Vec<1, 1, 64, 64>, ark::Vec<1, 1, 64, 64>, ark::Vec<1, 1, 64, 64>, ark::Vec<1, 1, 64, 64>, ark::Vec<1, 1, 1, 64>, 1, 0>,fp16 *,fp16 *,fp16 *,;", + "Id": 0, + "NumWarps": 1, + "SramBytes": 0 + } + ] +} \ No newline at end of file diff --git a/python/model_py.cpp b/python/model_py.cpp index c0686e254..9052b72b0 100644 --- a/python/model_py.cpp +++ b/python/model_py.cpp @@ -12,6 +12,10 @@ namespace py = pybind11; void register_model(py::module &m) { py::class_<ark::Model>(m, "_Model") .def(py::init<int>(), py::arg("rank") = 0) + .def("create_nodes", &ark::Model::create_nodes) + .def("serialize", &ark::Model::serialize, + py::return_value_policy::reference_internal, + py::arg("indent") = -1) .def("tensor", &ark::Model::tensor, "construct a tensor with given shape and data type.", py::return_value_policy::reference_internal, py::arg("shape"),