Skip to content

Commit

Permalink
Plan file support (#205)
Browse files Browse the repository at this point in the history
* Completely separate model definition from implementation
* Plan file details full implementation
* Models and plans are serialized into JSON
* Deeper integration with MSCCL++
* A few bug fixes
  • Loading branch information
chhwang authored May 25, 2024
1 parent e476238 commit e3cc6e3
Show file tree
Hide file tree
Showing 330 changed files with 15,885 additions and 42,617 deletions.
14 changes: 13 additions & 1 deletion .codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,22 @@ flag_management:
carryforward: true
paths:
- ark/
- python/ark/

coverage:
status:
project:
default:
target: 80%
target: 85%
threshold: 1%

ignore:
- "/usr/*"
- "/tmp/*"
- "*/build/*"
- "*/dist-packages/*"
- "*/third_party/*"
- "*/ark/*_test.*"
- "*/examples/*"
- "*/python/unittest/*"
- "*/ark/unittest/*"
13 changes: 7 additions & 6 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ on:
- cron: '42 20 * * 4'

jobs:
analyze:
name: Analyze
analyze-cuda:
name: Analyze (CUDA)
strategy:
fail-fast: false
matrix:
language: [ 'cpp' ]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
group: ${{ github.workflow }}-cuda-${{ github.ref }}
cancel-in-progress: true
runs-on: ubuntu-latest
container:
Expand All @@ -38,7 +38,7 @@ jobs:
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v2
uses: github/codeql-action/init@v3
with:
languages: ${{ matrix.language }}

Expand All @@ -48,10 +48,11 @@ jobs:
- name: Build
run: |
cmake -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .
mkdir build && cd build
cmake -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
make -j build
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v2
uses: github/codeql-action/analyze@v3
with:
category: "/language:${{matrix.language}}"
52 changes: 33 additions & 19 deletions .github/workflows/ut-cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,41 +36,55 @@ jobs:
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
- name: UpdateSubmodules
- name: Dubious ownership exception
run: |
git config --global --add safe.directory /__w/ark/ark
git submodule foreach --recursive git reset --hard
git submodule foreach --recursive git clean -fdx
git submodule foreach git fetch
git submodule update --init --recursive
- name: Build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Debug ..
make -j ut
cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_PYTHON=ON ..
make -j ut ark_py
- name: RunUT
run: |
cd build && ARK_ROOT=$PWD ARK_IGNORE_BINARY_CACHE=1 ctest --stop-on-failure --verbose --schedule-random
- name: ReportCoverage
- name: Run C++ UT
run: |
cd build
lcov --capture --directory . --output-file coverage.info
lcov --remove coverage.info \
ARK_ROOT=$PWD ctest --stop-on-failure --verbose --schedule-random
lcov --capture --directory . --output-file cpp_coverage.info
lcov --remove cpp_coverage.info \
'/usr/*' \
'/tmp/*' \
'*/build/*' \
'*/third_party/*' \
'*/ark/*_test.*' \
'*/examples/*' \
'*/python/*' \
'*/ark/unittest/unittest_utils.cc' \
--output-file coverage.info
lcov --list coverage.info
bash <(curl -s https://codecov.io/bash) -f coverage.info || echo "Codecov did not collect coverage reports"
'*/ark/unittest/unittest_utils.cpp' \
--output-file cpp_coverage.info
lcov --list cpp_coverage.info
- name: BuildPython
- name: Install Python Dependencies
run: |
python3 -m pip install -r requirements.txt
- name: Run Python UT
run: |
cd build
ARK_ROOT=$PWD pytest --cov --verbose ../python/unittest/test.py
- name: Report Coverage
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
run: |
cd build
bash <(curl -s https://codecov.io/bash) -f cpp_coverage.info || echo "Codecov did not collect C++ coverage reports"
bash <(curl -s https://codecov.io/bash) -f .coverage || echo "Codecov did not collect Python coverage reports"
- name: Install Python
run: |
python3 -m pip install .
- name: Run Tutorials
run: |
python3 ./examples/tutorial/quickstart_tutorial.py
python3 ./examples/tutorial/plan_tutorial.py
6 changes: 1 addition & 5 deletions .github/workflows/ut-rocm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,9 @@ jobs:
- name: Checkout
uses: actions/checkout@v4

- name: UpdateSubmodules
- name: Dubious ownership exception
run: |
git config --global --add safe.directory /__w/ark/ark
git submodule foreach --recursive git reset --hard
git submodule foreach --recursive git clean -fdx
git submodule foreach git fetch
git submodule update --init --recursive
- name: Build
run: |
Expand Down
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@
[submodule "third_party/mscclpp"]
path = third_party/mscclpp
url = https://github.com/microsoft/mscclpp

[submodule "third_party/json"]
path = third_party/json
url = https://github.com/nlohmann/json
14 changes: 7 additions & 7 deletions ark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc)
file(GLOB_RECURSE UT_SOURCES CONFIGURE_DEPENDS *_test.cc *_test.cu)
file(GLOB_RECURSE UT_COMMON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unittest/*.cc)
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cpp)
file(GLOB_RECURSE UT_SOURCES CONFIGURE_DEPENDS *_test.cpp)
file(GLOB_RECURSE UT_COMMON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unittest/*.cpp)
list(REMOVE_ITEM SOURCES ${UT_SOURCES} ${UT_COMMON_SOURCES})

if(USE_ROCM)
file(GLOB_RECURSE CU_SOURCES CONFIGURE_DEPENDS *.cu)
set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX)
endif()

set(COMMON_LIBS ARK::numa ARK::ibverbs mscclpp mscclpp_static pthread rt)
set(COMMON_LIBS ARK::numa ARK::ibverbs pthread rt)

# ARK object
target_include_directories(ark_obj PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
target_include_directories(ark_obj PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories(ark_obj SYSTEM PRIVATE
${PROJECT_SOURCE_DIR}/third_party/json
${JSON_INCLUDE_DIRS}
${MSCCLPP_INCLUDE_DIRS}
${IBVERBS_INCLUDE_DIRS}
${NUMA_INCLUDE_DIRS}
Expand All @@ -42,7 +42,7 @@ if(USE_ROCM)
endif()

target_sources(ark_obj PRIVATE ${SOURCES})
target_link_libraries(ark_obj PRIVATE ${COMMON_LIBS})
target_link_libraries(ark_obj PUBLIC mscclpp_static PRIVATE ${COMMON_LIBS})

# ARK unit tests
foreach(ut_source IN ITEMS ${UT_SOURCES})
Expand All @@ -52,7 +52,7 @@ foreach(ut_source IN ITEMS ${UT_SOURCES})
set_target_properties(${exe_name} PROPERTIES EXCLUDE_FROM_ALL TRUE)
target_include_directories(${exe_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories(${exe_name} SYSTEM PRIVATE
${PROJECT_SOURCE_DIR}/third_party/json
${JSON_INCLUDE_DIRS}
${IBVERBS_INCLUDE_DIRS}
${NUMA_INCLUDE_DIRS}
)
Expand Down
62 changes: 62 additions & 0 deletions ark/api/data_type.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include "ark/data_type.hpp"

#include <map>

#include "bfloat16.h"
#include "half.h"
#include "logging.h"
#include "model/model_data_type.hpp"

namespace ark {

///
/// NOTE: how to add a new data type
/// 1. Add an instance using `DATA_TYPE_INSTANCE()` macro.
/// 2. Add a registration using `DATA_TYPE_REGISTER()` macro.
/// 3. Expose the symbol in `include/ark/data_type.hpp`.
///

// Defines the externally visible `DataType` constant `_name`, backed by a
// `ModelDataT` constructed from the stringized name, the stringized C++ type,
// and `sizeof(_type)`.
//
// The expansion intentionally carries no trailing semicolon; the call site
// supplies it. That way `DATA_TYPE_INSTANCE(X, T);` expands to exactly one
// declaration instead of a declaration followed by an empty one.
#define DATA_TYPE_INSTANCE(_name, _type) \
    extern const DataType _name(         \
        std::make_shared<ModelDataT>(#_name, #_type, sizeof(_type)))

// Stores the address of `_name` under the key "#_name" in a map variable
// called `instances`, which must be in scope at the point of use.
//
// Expands to a single parenthesized expression (no trailing semicolon) so the
// macro behaves like an ordinary statement, including inside an unbraced
// `if`/`else`.
#define DATA_TYPE_REGISTER(_name) (instances[#_name] = &_name)

// Predefined data type instances. A definition here is only reachable through
// `DataType::from_name()` if that function also registers it with
// DATA_TYPE_REGISTER().

// NONE is written out by hand with the type string "void" and a size of 0
// rather than through DATA_TYPE_INSTANCE(), which would evaluate
// `sizeof(void)`.
extern const DataType NONE(std::make_shared<ModelDataT>("NONE", "void", 0));
DATA_TYPE_INSTANCE(FP32, float);
// NOTE(review): `fp16` / `bf16` are presumably declared by "half.h" and
// "bfloat16.h" -- confirm.
DATA_TYPE_INSTANCE(FP16, fp16);
DATA_TYPE_INSTANCE(BF16, bf16);
DATA_TYPE_INSTANCE(INT32, int32_t);
DATA_TYPE_INSTANCE(UINT32, uint32_t);
DATA_TYPE_INSTANCE(INT8, int8_t);
DATA_TYPE_INSTANCE(UINT8, uint8_t);
DATA_TYPE_INSTANCE(BYTE, char);

/// Look up one of the predefined data types by its registered name.
///
/// @param type_name Name the type was registered under (e.g. "FP32").
/// @return Reference to the matching predefined `DataType` instance.
///
/// An unknown name is reported via `ERR(InvalidUsageError, ...)`.
const DataType &DataType::from_name(const std::string &type_name) {
    // The table is built inside the initializer of the function-local static.
    // Since C++11 that initialization is performed exactly once even when
    // several threads make the first call concurrently; filling the map
    // lazily behind an `empty()` check would instead let those threads
    // modify it at the same time. After construction the map is read-only.
    static const std::map<std::string, const DataType *> registry = [] {
        // The local must be called `instances`: DATA_TYPE_REGISTER() writes
        // into a variable of that name.
        std::map<std::string, const DataType *> instances;
        DATA_TYPE_REGISTER(NONE);
        DATA_TYPE_REGISTER(FP32);
        DATA_TYPE_REGISTER(FP16);
        DATA_TYPE_REGISTER(BF16);
        DATA_TYPE_REGISTER(INT32);
        DATA_TYPE_REGISTER(UINT32);
        DATA_TYPE_REGISTER(INT8);
        DATA_TYPE_REGISTER(UINT8);
        DATA_TYPE_REGISTER(BYTE);
        return instances;
    }();

    const auto it = registry.find(type_name);
    if (it == registry.end()) {
        // NOTE(review): assumes ERR() never returns (i.e. it throws); if it
        // can return, the dereference below is invalid -- confirm in
        // "logging.h".
        ERR(InvalidUsageError, "Unknown data type: ", type_name);
    }
    return *(it->second);
}

/// Returns the value of `bytes()` on the underlying object held in `ref_`.
/// NOTE(review): for the predefined types this is presumably the
/// `sizeof(_type)` passed to `ModelDataT` at construction (0 for NONE) --
/// confirm against `ModelDataT`.
size_t DataType::bytes() const { return ref_->bytes(); }

/// Returns the value of `type_name()` on the underlying object held in `ref_`.
/// NOTE(review): presumably this is the registered name (e.g. "FP32") rather
/// than the C++ type string (e.g. "float") -- confirm against `ModelDataT`.
const std::string &DataType::name() const { return ref_->type_name(); }

} // namespace ark
Loading

0 comments on commit e3cc6e3

Please sign in to comment.