Skip to content

Commit

Permalink
fix(ci): [distributed CI] fix workflow and move build/test outside docker (needed to run on multiple nodes).
Browse files Browse the repository at this point in the history
  • Loading branch information
antoniupop committed Mar 28, 2024
1 parent 29e131a commit c4c04b1
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 89 deletions.
99 changes: 28 additions & 71 deletions .github/workflows/compiler_build_and_test_cpu_distributed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,9 @@ env:
jobs:
BuildAndTest:
name: Build and test compiler on Slurm cluster in EC2
runs-on: ${{ github.event.inputs.runner_name }}
runs-on: distributed-ci
if: ${{ !cancelled() }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
steps:

- name: Instance configuration used
run: |
echo "ID: ${{ inputs.instance_id }}"
Expand All @@ -45,81 +41,42 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
echo "Matrix item: ${{ inputs.matrix_item }}"
# SSH private key is required as some dependencies are from private repos
- name: Set up SSH agent
uses: webfactory/[email protected]
with:
ssh-private-key: ${{ secrets.CONCRETE_CI_SSH_PRIVATE }}
- name: Instance cleanup
run: |
docker system prune -af
- name: Fetch repository
uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: recursive
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

- name: Setup rust toolchain for concrete-cpu
uses: ./.github/workflows/setup_rust_toolchain_for_concrete_cpu

- name: Create build dir
run: mkdir build
- name: Set up home
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
run: |
echo "HOME=/shared" >> "${GITHUB_ENV}"
- name: Build compiler
uses: addnab/docker-run-action@v3
id: build-compiler
with:
registry: ghcr.io
image: ${{ env.DOCKER_IMAGE_TEST }}
username: ${{ secrets.GHCR_LOGIN }}
password: ${{ secrets.GHCR_PASSWORD }}
options: >-
-v ${{ github.workspace }}:/concrete
-v ${{ github.workspace }}/build:/build
-v ${{ github.workspace }}/wheels:/wheels
-v ${{ env.SSH_AUTH_SOCK }}:/ssh.socket
-e SSH_AUTH_SOCK=/ssh.socket
${{ env.DOCKER_GPU_OPTION }}
shell: bash
run: |
rustup toolchain install nightly-2024-01-31
set -e
cd /concrete/compilers/concrete-compiler/compiler
rm -rf /build/*
make DATAFLOW_EXECUTION_ENABLED=ON CCACHE=ON Python3_EXECUTABLE=$PYTHON_EXEC BUILD_DIR=/build build-end-to-end-dataflow-tests
echo "Debug: ccache statistics (after the build):"
ccache -s
- name: Export specific variables (CPU)
if: ${{ !startswith(inputs.instance_type, 'p3.') }}
run: |
echo "CUDA_SUPPORT=OFF" >> "${GITHUB_ENV}"
echo "DATAFLOW_EXECUTION_ENABLED=ON" >> "${GITHUB_ENV}"
- name: Enable complete tests on push to main
if: github.ref == 'refs/heads/main'
run: echo "MINIMAL_TESTS=OFF" >> $GITHUB_ENV
- name: Setup rust toolchain for concrete-cpu
uses: ./.github/workflows/setup_rust_toolchain_for_concrete_cpu

- name: Enable minimal tests otherwise
if: github.ref != 'refs/heads/main'
run: echo "MINIMAL_TESTS=ON" >> $GITHUB_ENV
- name: Build compiler benchmarks
run: |
set -e
git config --global --add safe.directory '*'
cd compilers/concrete-compiler/compiler
rm -rf /shared/build
make HPX_DIR=/shared/hpx install-hpx-from-source
make HPX_DIR=/shared/hpx BUILD_DIR=/shared/build CCACHE=ON DATAFLOW_EXECUTION_ENABLED=ON BINDINGS_PYTHON_ENABLED=OFF CUDA_SUPPORT=${{ env.CUDA_SUPPORT }} build-end-to-end-dataflow-tests
- name: Test compiler
uses: addnab/docker-run-action@v3
with:
registry: ghcr.io
image: ${{ env.DOCKER_IMAGE_TEST }}
username: ${{ secrets.GHCR_LOGIN }}
password: ${{ secrets.GHCR_PASSWORD }}
options: >-
-v ${{ github.workspace }}:/concrete
-v ${{ github.workspace }}/build:/build
${{ env.DOCKER_GPU_OPTION }}
shell: bash
run: |
set -e
rustup toolchain install nightly-2024-01-31
cd /concrete/compilers/concrete-compiler/compiler
pip install pytest
dnf install -y libzstd libzstd-devel
sed "s/pytest/python -m pytest/g" -i Makefile
mkdir -p /tmp/concrete_compiler/gpu_tests/
make MINIMAL_TESTS=${{ env.MINIMAL_TESTS }} DATAFLOW_EXECUTION_ENABLED=ON CCACHE=ON Python3_EXECUTABLE=$PYTHON_EXEC BUILD_DIR=/build run-end-to-end-distributed-tests
chmod -R ugo+rwx /tmp/KeySetCache
- name: Analyze logs
- name: Run end-to-end benchmarks
run: |
cd build/gtest-parallel-logs/passed
ls -1 | xargs grep -H "WARNING RETRY" | sed -e "s/.log.*//g" | uniq -c | sed -re "s/ *([0-9]*) (.*)/::warning ::Test \2 retried \1 times/g" | cat
set -e
cd compilers/concrete-compiler/compiler
make BUILD_DIR=/shared/build run-end-to-end-distributed-tests
2 changes: 1 addition & 1 deletion ci/slab.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ profile = "m7i-cpu-test"
check_run_name = "Compiler Build and Test (CPU)"

[command.compiler-cpu-build-distributed]
workflow = "compiler_build_and_test_cpu.yml"
workflow = "compiler_build_and_test_cpu_distributed.yml"
profile = "slurm-cluster"
check_run_name = "Compiler Distributed Build and Test (CPU)"
runner_name = "distributed-ci"
Expand Down
18 changes: 11 additions & 7 deletions compilers/concrete-compiler/compiler/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ KEYSETCACHENAME ?= KeySetCacheV4

HPX_VERSION?=1.9.1
HPX_URL=https://github.com/STEllAR-GROUP/hpx/archive/refs/tags/v$(HPX_VERSION).tar.gz
HPX_TARBALL=$(shell pwd)/hpx-$(HPX_VERSION).tar.gz
HPX_LOCAL_DIR=$(shell pwd)/hpx-$(HPX_VERSION)
HPX_DIR?=$(shell pwd)
HPX_TARBALL=hpx-$(HPX_VERSION).tar.gz
HPX_LOCAL_DIR=$(HPX_DIR)/hpx-$(HPX_VERSION)
HPX_INSTALL_DIR?=$(HPX_LOCAL_DIR)/build

ML_BENCH_SUBSET_ID=
Expand Down Expand Up @@ -133,10 +134,11 @@ install-hpx-from-source: $(HPX_LOCAL_DIR)
cd $(HPX_LOCAL_DIR)/build && make -j2

$(HPX_TARBALL):
curl -L $(HPX_URL) -o $(HPX_TARBALL)
mkdir -p $(HPX_DIR)
cd $(HPX_DIR) && curl -L $(HPX_URL) -o $(HPX_TARBALL)

$(HPX_LOCAL_DIR): $(HPX_TARBALL)
tar xzvf $(HPX_TARBALL)
cd $(HPX_DIR) && tar xzvf $(HPX_TARBALL)

$(BUILD_DIR)/configured.stamp:
mkdir -p $(BUILD_DIR)
Expand Down Expand Up @@ -338,9 +340,11 @@ run-end-to-end-dataflow-tests: build-end-to-end-dataflow-tests
$(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_jit_auto_parallelization
$(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_jit_distributed

run-end-to-end-distributed-tests: build-end-to-end-dataflow-tests
srun -n4 -c8 $(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_jit_auto_parallelization
srun -n4 -c8 $(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_jit_distributed
run-end-to-end-distributed-tests: $(GTEST_PARALLEL_PY) build-end-to-end-tests generate-cpu-tests
srun -n4 -c8 --kill-on-bad-exit=1 $(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_jit_distributed
srun -n4 -c8 --kill-on-bad-exit=1 $(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_test \
--optimizer-strategy=dag-mono --dataflow-parallelize=1 \
$(FIXTURE_CPU_DIR)/*round*.yaml $(FIXTURE_CPU_DIR)/*relu*.yaml $(FIXTURE_CPU_DIR)/*linalg*.yaml

# benchmark

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ func.func @main(%arg0: tensor<200x4x!FHE.eint<4>>) -> tensor<200x8x!FHE.eint<4>>
return %res_P : tensor<200x8x!FHE.eint<4>>
}
)XXX",
"main", false, true, true);
"main", false, true, true, DEFAULT_batchTFHEOps,
DEFAULT_global_p_error, DEFAULT_chunkedIntegers, DEFAULT_chunkSize,
DEFAULT_chunkWidth, DEFAULT_composable, false);

const size_t dim0 = 200;
const size_t dim1 = 4;
Expand All @@ -100,7 +102,7 @@ func.func @main(%arg0: tensor<200x4x!FHE.eint<4>>) -> tensor<200x8x!FHE.eint<4>>
ASSERT_EQ(result.dimensions, outputShape);
distributed_results = result.values;
} else {
ASSERT_OUTCOME_HAS_FAILURE(lambda.call({}));
ASSERT_OUTCOME_HAS_VALUE(lambda.call({}));
}
}

Expand All @@ -117,7 +119,10 @@ TEST(Distributed, nn_med_sequential) {
return %2 : tensor<200x8x!FHE.eint<4>>
}
)XXX",
"main", false, false, false);
"main", false, false, false, DEFAULT_batchTFHEOps,
DEFAULT_global_p_error, DEFAULT_chunkedIntegers,
DEFAULT_chunkSize, DEFAULT_chunkWidth, DEFAULT_composable,
false);

const size_t dim0 = 200;
const size_t dim1 = 4;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ bool DEFAULT_chunkedIntegers = false;
unsigned int DEFAULT_chunkSize = 4;
unsigned int DEFAULT_chunkWidth = 2;
bool DEFAULT_composable = false;
bool DEFAULT_use_multi_parameter = true;

// Jit-compiles the function specified by `func` from `src` and
// returns the corresponding lambda. Any compilation errors are caught
Expand All @@ -40,7 +41,8 @@ inline Result<TestProgram> internalCheckedJit(
bool chunkedIntegers = DEFAULT_chunkedIntegers,
unsigned int chunkSize = DEFAULT_chunkSize,
unsigned int chunkWidth = DEFAULT_chunkWidth,
bool composable = DEFAULT_composable) {
bool composable = DEFAULT_composable,
bool use_multi_parameter = DEFAULT_use_multi_parameter) {

auto options = mlir::concretelang::CompilationOptions();
options.optimizerConfig.global_p_error = global_p_error;
Expand All @@ -54,12 +56,7 @@ inline Result<TestProgram> internalCheckedJit(
}
options.loopParallelize = loopParallelize;
#ifdef CONCRETELANG_DATAFLOW_EXECUTION_ENABLED
#ifdef CONCRETELANG_DATAFLOW_TESTING_ENABLED
options.dataflowParallelize = true;
options.loopParallelize = true;
#else
options.dataflowParallelize = dataflowParallelize;
#endif
#endif
options.batchTFHEOps = batchTFHEOps;
if (composable) {
Expand All @@ -68,6 +65,9 @@ inline Result<TestProgram> internalCheckedJit(
mlir::concretelang::optimizer::Strategy::DAG_MULTI;
}

if (!use_multi_parameter)
options.optimizerConfig.strategy =
mlir::concretelang::optimizer::Strategy::DAG_MONO;
std::vector<std::string> sources = {src.str()};
TestProgram testProgram(options);
OUTCOME_TRYV(testProgram.compile({src.str()}));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <type_traits>

#include "concretelang/Common/Values.h"
#include "concretelang/Runtime/DFRuntime.hpp"
#include "concretelang/Support/CompilationFeedback.h"
#include "concretelang/TestLib/TestProgram.h"
#include "end_to_end_fixture/EndToEndFixture.h"
Expand Down Expand Up @@ -58,7 +59,12 @@ class EndToEndTest : public ::testing::Test {
void testOnce() {
for (auto tests_rep = 0; tests_rep <= options.numberOfRetry; tests_rep++) {
// We execute the circuit.
auto maybeRes = testCircuit->call(args);
auto maybeRes =
testCircuit->call((mlir::concretelang::dfr::_dfr_is_root_node())
? args
: std::vector<Value>());
if (!mlir::concretelang::dfr::_dfr_is_root_node())
return;
ASSERT_OUTCOME_HAS_VALUE(maybeRes);
auto result = maybeRes.value();

Expand Down

0 comments on commit c4c04b1

Please sign in to comment.