diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 00000000..e834b4a2 --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,53 @@ +name: Release + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + release-build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Build release distributions + run: | + # NOTE: put your own distribution build steps here. + python -m pip install build + python -m build + + - name: Upload distributions + uses: actions/upload-artifact@v4 + with: + name: release-dists + path: dist/ + + pypi-publish: + runs-on: ubuntu-latest + + needs: + - release-build + + permissions: + id-token: write + + steps: + - name: Retrieve release distributions + uses: actions/download-artifact@v4 + with: + name: release-dists + path: dist/ + + - name: Publish release distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_DLIO_TOKEN }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..8fe5ce04 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,241 @@ +name: Build and Test + +on: + pull_request: + branches: [main, dev] + push: + +jobs: + build-and-test: + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04] + gcc: [10] + python: ["3.9", "3.10", "3.11"] + venv: ["via-setup", "via-reqs"] + name: ${{ matrix.os }}-${{ matrix.gcc }}-${{ matrix.python }}-${{ matrix.venv }} + runs-on: ${{ matrix.os }} + env: + CC: gcc-${{ matrix.gcc }} + CXX: g++-${{ matrix.gcc }} + DFTRACER_BUILD_TYPE: "Debug" + DFTRACER_ENABLE: 1 + DFTRACER_LOG_LEVEL: "DEBUG" + DLIO_EXEC: ${{ matrix.venv == 'via-setup' && 'dlio_benchmark' || 'python dlio_benchmark/main.py' }} + GOTCHA_DEBUG: 3 + OMPI_ALLOW_RUN_AS_ROOT: 1 + OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 + PYTHON_VER: ${{ matrix.python }} + 
RDMAV_FORK_SAFE: "1" + VENV_PATH: "/home/runner/work/.venv/${{ matrix.venv }}" + steps: + - name: Clear disc + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Push checkout + if: github.event_name == 'push' + uses: actions/checkout@v3 + - name: PR checkout + if: github.event_name == 'pull_request' + uses: actions/checkout@v3 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Set up Python ${{ matrix.python }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python }} + - name: Add current directory to PYTHONPATH + if: matrix.venv == 'via-reqs' + run: echo "PYTHONPATH=$(pwd):$PYTHONPATH" >> $GITHUB_ENV + - name: Cache install modules + id: cache-modules + uses: actions/cache@v3 + with: + path: ${{ env.VENV_PATH }} + key: ${{ matrix.venv }}-gcc${{ matrix.gcc }}-python${{ matrix.python }}-${{ hashFiles('requirements.txt', 'setup.py') }} + - name: Install system dependencies + run: | + sudo apt update + sudo apt-get install -y $CC $CXX libc6 git + sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev python3-dev + - name: Install DLIO via setup.py + if: matrix.venv == 'via-setup' && steps.cache-modules.outputs.cache-hit != 'true' + run: | + echo "venv: ${VENV_PATH} - gcc: $CC" + python -m venv ${VENV_PATH} + source ${VENV_PATH}/bin/activate + pip install --upgrade pip + pip install .[test] + - name: Install DLIO via requirements.txt + if: matrix.venv == 'via-reqs' && steps.cache-modules.outputs.cache-hit != 'true' + run: | + echo "venv: ${VENV_PATH} - gcc: $CC" + python -m venv ${VENV_PATH} + source ${VENV_PATH}/bin/activate + pip install --upgrade pip + pip install -r requirements.txt + - name: test_gen_data + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_gen_data[png-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[npz-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[jpeg-tensorflow] 
-v + mpirun -np 2 pytest -k test_gen_data[tfrecord-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[hdf5-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[indexed_binary-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[mmap_indexed_binary-tensorflow] -v + rm -rf data + - name: test_custom_storage_root_gen_data + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_storage_root_gen_data[png-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[npz-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[jpeg-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[tfrecord-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[hdf5-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[indexed_binary-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[mmap_indexed_binary-tensorflow] -v + rm -rf data + - name: test_train + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_train[png-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[npz-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[jpeg-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[tfrecord-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[hdf5-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[csv-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[png-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[npz-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[jpeg-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[hdf5-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[csv-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[png-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[npz-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[jpeg-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[hdf5-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[csv-tensorflow-dali] -v + mpirun -np 
2 pytest -k test_train[png-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[npz-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[jpeg-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[hdf5-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[csv-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-dali] -v + rm -rf data + - name: test_custom_storage_root_train + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_custom_storage_root_train[png-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[npz-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[tfrecord-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[csv-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[png-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[npz-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[csv-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-pytorch] -v + mpirun -np 2 pytest -k 
test_custom_storage_root_train[mmap_indexed_binary-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-pytorch] -v + rm -rf data + - name: test_checkpoint_epoch + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers0-2-layer_params0-all_ranks] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers1-2-layer_params1-all_ranks] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers2-2-layer_params2-rank_zero] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers3-2-layer_params3-rank_zero] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers4-1-layer_params4-all_ranks] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers5-1-layer_params5-all_ranks] -v + rm -rf data + - name: test_checkpoint_step + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_checkpoint_step -v + - name: test_eval + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_eval -v + - name: test_multi_threads + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_multi_threads[tensorflow-0] -v + mpirun -np 2 pytest -k test_multi_threads[tensorflow-1] -v + mpirun -np 2 pytest -k test_multi_threads[tensorflow-2] -v + mpirun -np 2 pytest -k test_multi_threads[pytorch-0] -v + mpirun -np 2 pytest -k test_multi_threads[pytorch-1] -v + mpirun -np 2 pytest -k test_multi_threads[pytorch-2] -v + rm -rf data + - name: test-pytorch-multiprocessing-context + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[0-None] -v + mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[1-fork] -v + mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-forkserver] -v + mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-spawn] -v + rm -rf data + - name: test_subset + run: | + source 
${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 pytest -k test_subset -v + rm -rf data + - name: test-tf-loader-tfrecord + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 ++workload.train.computation_time=0.01 ++workload.train.epochs=1 + rm -rf data + - name: test-torch-loader-npz + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=0 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + rm -rf data + - name: test-tf-loader-npz + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 
++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + rm -rf data + - name: test_unet3d + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 ++workload.dataset.format=synthetic + rm -rf data + - name: test_resnet50 + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.format=synthetic + rm -rf data + - name: test_cosmoflow + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 ${DLIO_EXEC} workload=cosmoflow_a100 
++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 + mpirun -np 2 ${DLIO_EXEC} workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 + mpirun -np 2 ${DLIO_EXEC} workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.format=synthetic + rm -rf data diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml index 07173e34..bdb2ab26 100644 --- a/.github/workflows/jekyll-gh-pages.yml +++ b/.github/workflows/jekyll-gh-pages.yml @@ -1,5 +1,5 @@ # Sample workflow for building and deploying a Jekyll site to GitHub Pages -name: Deploy Jekyll with GitHub Pages dependencies preinstalled +name: Deploy Documentation on: # Runs on pushes targeting the default branch @@ -51,5 +51,5 @@ jobs: - name: Deploy to GitHub Pages id: deployment uses: actions/deploy-pages@v1 - with: + with: folder: _build/html/ diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml deleted file mode 100644 index 78012e0e..00000000 --- a/.github/workflows/python-package-conda.yml +++ /dev/null @@ -1,234 +0,0 @@ -name: Python Package using Conda - -on: - pull_request: - branches: [ main, dev ] - push: - -jobs: - build-and-test: - strategy: - fail-fast: false - matrix: - os: [ ubuntu-20.04, ubuntu-22.04 ] - profiler: [ 0, 1 ] - gcc: [10] - python: ["3.8", "3.9", "3.10" ] - name: ${{ matrix.os }}-${{ matrix.profiler }}-${{ matrix.gcc }}-${{ matrix.python }} - runs-on: ${{ matrix.os }} - env: - VENV: "/home/runner/work/venv" - DLIO_PROFILER_ENABLE: ${{ matrix.profiler }} - CC: gcc-${{ matrix.gcc }} - CXX: g++-${{ matrix.gcc }} - RDMAV_FORK_SAFE: "1" - PYTHON_VER: ${{ matrix.python }} - DLIO_PROFILER_LOG_LEVEL: "INFO" - GOTCHA_DEBUG: 3 - steps: - - name: clear disc - run: | - sudo rm -rf /usr/share/dotnet - sudo rm -rf /opt/ghc - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf 
"$AGENT_TOOLSDIRECTORY" - - name: Push checkout - if: github.event_name == 'push' - uses: actions/checkout@v3 - - name: PR checkout - if: github.event_name == 'pull_request' - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.head.sha }} - - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python }} - - name: Cache install modules - id: cache-modules - uses: actions/cache@v3 - with: - path: ${{ env.VENV }} - key: ${{env.VENV }}-${{env.DLIO_PROFILER}}-${{ matrix.gcc }}-${{ matrix.python }}-${{ hashFiles('setup.py') }} - - name: Install System Tools - run: | - sudo apt update - sudo apt-get install $CC $CXX libc6 git - sudo apt-get install mpich libhwloc-dev - - name: Install DLIO code only - if: steps.cache-modules.outputs.cache-hit == 'true' - run: | - source ${VENV}/bin/activate - rm -rf *.egg* - rm -rf build - rm -rf dist - pip uninstall -y dlio_benchmark - python setup.py build - python setup.py install - - name: Install DLIO - if: steps.cache-modules.outputs.cache-hit != 'true' - run: | - echo "Profiler ${DLIO_PROFILER} gcc $CC" - python -m pip install --upgrade pip - pip install virtualenv - python -m venv ${VENV} - source ${VENV}/bin/activate - pip install .[test] - - name: Install DLIO Profiler - run: | - echo "Profiler ${DLIO_PROFILER} gcc $CC" - source ${VENV}/bin/activate - pip install --force-reinstall dlio_profiler_py - - name: test_gen_data - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_gen_data[png-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[npz-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[jpeg-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[tfrecord-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[hdf5-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[indexed_binary-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[mmap_indexed_binary-tensorflow] -v - rm -rf data - - name: test_custom_storage_root_gen_data - 
run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_storage_root_gen_data[png-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[npz-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[jpeg-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[tfrecord-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[hdf5-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[indexed_binary-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[mmap_indexed_binary-tensorflow] -v - rm -rf data - - name: test_train - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_train[png-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[npz-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[jpeg-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[tfrecord-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[hdf5-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[csv-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[png-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[npz-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[jpeg-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[hdf5-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[csv-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[png-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[npz-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[jpeg-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[hdf5-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[csv-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[png-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[npz-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[jpeg-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[hdf5-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[csv-pytorch-dali] -v - mpirun -np 2 pytest -k 
test_train[indexed_binary-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-dali] -v - rm -rf data - - name: test_custom_storage_root_train - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_custom_storage_root_train[png-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[npz-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[tfrecord-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[csv-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[png-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[npz-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[csv-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-pytorch] -v - rm -rf data - - name: test_checkpoint_epoch - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k 
test_checkpoint_epoch[tensorflow-1024-optimizers0-2-layer_params0-all_ranks] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers1-2-layer_params1-all_ranks] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers2-2-layer_params2-rank_zero] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers3-2-layer_params3-rank_zero] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers4-1-layer_params4-all_ranks] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers5-1-layer_params5-all_ranks] -v - rm -rf data - - name: test_checkpoint_step - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_checkpoint_step -v - - name: test_eval - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_eval -v - - name: test_multi_threads - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_multi_threads[tensorflow-0] -v - mpirun -np 2 pytest -k test_multi_threads[tensorflow-1] -v - mpirun -np 2 pytest -k test_multi_threads[tensorflow-2] -v - mpirun -np 2 pytest -k test_multi_threads[pytorch-0] -v - mpirun -np 2 pytest -k test_multi_threads[pytorch-1] -v - mpirun -np 2 pytest -k test_multi_threads[pytorch-2] -v - rm -rf data - - name: test-pytorch-multiprocessing-context - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[0-None] -v - mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[1-fork] -v - mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-forkserver] -v - mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-spawn] -v - rm -rf data - - name: test-tf-loader-tfrecord - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 - mpirun -np 2 
dlio_benchmark workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 ++workload.train.computation_time=0.01 ++workload.train.epochs=1 - rm -rf data - - name: test-torch-loader-npz - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=0 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - rm -rf data - - name: test-tf-loader-npz - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True 
++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - rm -rf data - - name: test_subset - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_subset -v - - name: test_unet3d - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 - mpirun -np 2 dlio_benchmark workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 - mpirun -np 2 dlio_benchmark workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 ++workload.dataset.format=synthetic - rm -rf data - - name: test_resnet50 - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 - mpirun -np 2 dlio_benchmark workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 - mpirun -np 2 dlio_benchmark workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.format=synthetic - rm -rf data - - name: test_cosmoflow - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=cosmoflow_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 - mpirun -np 2 dlio_benchmark workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 - mpirun -np 2 dlio_benchmark workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.format=synthetic - rm -rf data \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index 49623d51..3ee4b4c1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ 
-include requirements.txt -recursive-include configs * \ No newline at end of file +prune docs +recursive-include dlio_benchmark/configs *.yaml \ No newline at end of file diff --git a/README.md b/README.md index 2d474d4d..176ed327 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Deep Learning I/O (DLIO) Benchmark -![test status](https://github.com/argonne-lcf/dlio_benchmark/actions/workflows/python-package-conda.yml/badge.svg) +![test status](https://github.com/argonne-lcf/dlio_benchmark/actions/workflows/ci.yml/badge.svg) This README provides an abbreviated documentation of the DLIO code. Please refer to https://dlio-benchmark.readthedocs.io for full user documentation. @@ -22,7 +22,7 @@ dlio_benchmark ++workload.workflow.generate_data=True ```bash git clone https://github.com/argonne-lcf/dlio_benchmark cd dlio_benchmark/ -pip install .[dlio_profiler] +pip install .[pydftracer] ``` ## Container @@ -85,10 +85,10 @@ Finally, run the benchmark ```bash mpirun -np 8 dlio_benchmark workload=unet3d ``` -Finally, run the benchmark with Profiler +Finally, run the benchmark with Tracer ```bash - export DLIO_PROFILER_ENABLE=1 - export DLIO_PROFILER_INC_METADATA=1 + export DFTRACER_ENABLE=1 + export DFTRACER_INC_METADATA=1 mpirun -np 8 dlio_benchmark workload=unet3d ``` diff --git a/dev-requirements.txt b/dev-requirements.txt deleted file mode 100644 index d6c1bd55..00000000 --- a/dev-requirements.txt +++ /dev/null @@ -1,62 +0,0 @@ -# Use cpu version of torch ---extra-index-url https://download.pytorch.org/whl/cpu ---extra-index-url https://developer.download.nvidia.com/compute/redist - -absl-py==1.3.0 -antlr4-python3-runtime==4.9.3 -astunparse==1.6.3 -cachetools==5.2.0 -certifi==2022.9.24 -charset-normalizer==2.1.1 -flatbuffers==22.10.26 -gast==0.4.0 -google-auth==2.14.1 -google-auth-oauthlib==0.4.6 -google-pasta==0.2.0 -grpcio==1.51.0 -h5py==3.7.0 -hydra-core==1.2.0 -idna==3.4 -keras==2.11.0 -libclang==14.0.6 -Markdown==3.4.1 -MarkupSafe==2.1.1 -mpi4py==3.1.4 
-numpy==1.23.5 -oauthlib==3.2.2 -omegaconf==2.2.3 -opt-einsum==3.3.0 -packaging==21.3 -pandas==1.5.1 -Pillow==9.3.0 -protobuf==3.19.6 -pyasn1==0.4.8 -pyasn1-modules==0.2.8 -pyparsing==3.0.9 -python-dateutil==2.8.2 -pytz==2022.6 -PyYAML==6.0 -requests==2.28.1 -requests-oauthlib==1.3.1 -rsa==4.9 -six==1.16.0 -tensorboard==2.11.0 -tensorboard-data-server==0.6.1 -tensorboard-plugin-wit==1.8.1 -tensorflow==2.11.0 -tensorflow-io==0.28.0 -tensorflow-estimator==2.11.0 -termcolor==2.1.1 -torch==1.13.0 -torchaudio==0.13.0 -torchvision==0.14.0 -typing_extensions==4.4.0 -urllib3==1.26.12 -Werkzeug==2.2.2 -wrapt==1.14.1 -pytest -pytest-mpi -pytest-subtests -pytest-timeout -nvidia-dali-cuda110 -psutil \ No newline at end of file diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py index 2c605914..2101d61d 100644 --- a/dlio_benchmark/common/enumerations.py +++ b/dlio_benchmark/common/enumerations.py @@ -184,7 +184,7 @@ class LoggerType(Enum): Logger types supported by the benchmark. 
""" DEFAULT = 'default' - DLIO_PROFILER = 'dlio_profiler' + DFTRACER = 'dftracer' def __str__(self): return self.value diff --git a/dlio_benchmark/data_loader/native_dali_data_loader.py b/dlio_benchmark/data_loader/native_dali_data_loader.py index e7f2970c..3755a23d 100644 --- a/dlio_benchmark/data_loader/native_dali_data_loader.py +++ b/dlio_benchmark/data_loader/native_dali_data_loader.py @@ -13,7 +13,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile +from dlio_benchmark.utils.utility import PerfTrace, Profile dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/data_loader/torch_data_loader.py b/dlio_benchmark/data_loader/torch_data_loader.py index e72559ef..c18cdbbe 100644 --- a/dlio_benchmark/data_loader/torch_data_loader.py +++ b/dlio_benchmark/data_loader/torch_data_loader.py @@ -60,7 +60,7 @@ def worker_init(self, worker_id): pickle.loads(self.serial_args) _args = ConfigArguments.get_instance() _args.configure_dlio_logging(is_child=True) - self.dlp_logger = _args.configure_dlio_profiler(is_child=True, use_pid=True) + self.dlp_logger = _args.configure_dftracer(is_child=True, use_pid=True) logging.debug(f"{utcnow()} worker initialized {worker_id} with format {self.format_type}") self.reader = ReaderFactory.get_reader(type=self.format_type, dataset_type=self.dataset_type, diff --git a/dlio_benchmark/main.py b/dlio_benchmark/main.py index 7d7b6fb7..2246edbf 100644 --- a/dlio_benchmark/main.py +++ b/dlio_benchmark/main.py @@ -94,7 +94,7 @@ def __init__(self, cfg): self.comm.barrier() # Configure the logging library self.args.configure_dlio_logging(is_child=False) - self.dlio_profiler = self.args.configure_dlio_profiler(is_child=False, use_pid=False) + self.dftracer = self.args.configure_dftracer(is_child=False, use_pid=False) with 
Profile(name=f"{self.__init__.__qualname__}", cat=MODULE_DLIO_BENCHMARK): if self.args.my_rank == 0: logging.info(f"{utcnow()} Running DLIO with {self.args.comm_size} process(es)") @@ -381,7 +381,7 @@ def finalize(self): self.stats.finalize() self.stats.save_data() self.comm.barrier() - self.args.finalize_dlio_profiler(self.dlio_profiler) + self.args.finalize_dftracer(self.dftracer) @hydra.main(version_base=None, config_path="configs", config_name="config") diff --git a/dlio_benchmark/plugins/experimental/src/checkpoint/pytorch_checkpointing.py b/dlio_benchmark/plugins/experimental/src/checkpoint/pytorch_checkpointing.py index 68b4fbaf..6d5bd2bd 100644 --- a/dlio_benchmark/plugins/experimental/src/checkpoint/pytorch_checkpointing.py +++ b/dlio_benchmark/plugins/experimental/src/checkpoint/pytorch_checkpointing.py @@ -18,7 +18,7 @@ import torch from dlio_benchmark.checkpointing.base_checkpointing import BaseCheckpointing -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.common.constants import MODULE_CHECKPOINT from dlio_benchmark.common.enumerations import CheckpointLocationType diff --git a/dlio_benchmark/plugins/experimental/src/data_loader/custom_torch_data_loader.py b/dlio_benchmark/plugins/experimental/src/data_loader/custom_torch_data_loader.py index e7c17402..c30ea77a 100644 --- a/dlio_benchmark/plugins/experimental/src/data_loader/custom_torch_data_loader.py +++ b/dlio_benchmark/plugins/experimental/src/data_loader/custom_torch_data_loader.py @@ -9,7 +9,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow, DLIOMPI -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/plugins/experimental/src/reader/custom_npz_reader.py 
b/dlio_benchmark/plugins/experimental/src/reader/custom_npz_reader.py index 857b3b24..9da296f5 100644 --- a/dlio_benchmark/plugins/experimental/src/reader/custom_npz_reader.py +++ b/dlio_benchmark/plugins/experimental/src/reader/custom_npz_reader.py @@ -19,7 +19,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) @@ -58,4 +58,4 @@ def read_index(self, image_idx, step): @dlp.log def finalize(self): - return super().finalize() \ No newline at end of file + return super().finalize() diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py index aee202d4..6876610b 100644 --- a/dlio_benchmark/reader/dali_image_reader.py +++ b/dlio_benchmark/reader/dali_image_reader.py @@ -25,7 +25,7 @@ from dlio_benchmark.utils.utility import utcnow from dlio_benchmark.common.enumerations import DatasetType, Shuffle import nvidia.dali.tfrecord as tfrec -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile +from dlio_benchmark.utils.utility import PerfTrace, Profile dlp = Profile(MODULE_DATA_READER) @@ -95,4 +95,4 @@ def is_index_based(self): return False def is_iterator_based(self): - return True \ No newline at end of file + return True diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py index e915a023..6091f360 100644 --- a/dlio_benchmark/reader/dali_npy_reader.py +++ b/dlio_benchmark/reader/dali_npy_reader.py @@ -25,7 +25,7 @@ from dlio_benchmark.utils.utility import utcnow from dlio_benchmark.common.enumerations import DatasetType, Shuffle import nvidia.dali.tfrecord as tfrec -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile +from dlio_benchmark.utils.utility import PerfTrace, Profile dlp = 
Profile(MODULE_DATA_READER) @@ -99,4 +99,4 @@ def is_index_based(self): return False def is_iterator_based(self): - return True \ No newline at end of file + return True diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py index 5bf8b3a9..99132188 100644 --- a/dlio_benchmark/reader/dali_tfrecord_reader.py +++ b/dlio_benchmark/reader/dali_tfrecord_reader.py @@ -28,7 +28,7 @@ from dlio_benchmark.utils.utility import utcnow from dlio_benchmark.common.enumerations import DatasetType, Shuffle import nvidia.dali.tfrecord as tfrec -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile +from dlio_benchmark.utils.utility import PerfTrace, Profile dlp = Profile(MODULE_DATA_READER) @@ -107,4 +107,4 @@ def is_index_based(self): return False def is_iterator_based(self): - return True \ No newline at end of file + return True diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py index 73042155..fe1a42cf 100644 --- a/dlio_benchmark/utils/config.py +++ b/dlio_benchmark/utils/config.py @@ -33,7 +33,7 @@ import math import os import numpy as np -from dlio_benchmark.utils.utility import Profile, PerfTrace, DLIO_PROFILER_ENABLE +from dlio_benchmark.utils.utility import Profile, PerfTrace, DFTRACER_ENABLE dlp = Profile(MODULE_CONFIG) @dataclass @@ -138,6 +138,7 @@ class ConfigArguments: data_loader_class = None reader_class = None checkpoint_mechanism_class = None + native_data_loader = False def __init__(self): """ Virtually private constructor. 
""" @@ -178,12 +179,12 @@ def configure_dlio_logging(self, is_child=False): # logging's max timestamp resolution is msecs, we will pass in usecs in the message ) - def configure_dlio_profiler(self, is_child=False, use_pid=False): + def configure_dftracer(self, is_child=False, use_pid=False): # with "multiprocessing_context=fork" the profiler file remains open in the child process if is_child and self.multiprocessing_context == "fork": return # Configure the profiler - if DLIO_PROFILER_ENABLE: + if DFTRACER_ENABLE: dlp_trace = get_trace_name(self.output_folder, use_pid) if DLIOMPI.get_instance().rank() == 0: logging.info(f"{utcnow()} Profiling DLIO {dlp_trace}") @@ -195,8 +196,8 @@ def configure_dlio_profiler(self, is_child=False, use_pid=False): process_id=self.my_rank) return None - def finalize_dlio_profiler(self, dlp_logger): - if DLIO_PROFILER_ENABLE and dlp_logger: + def finalize_dftracer(self, dlp_logger): + if DFTRACER_ENABLE and dlp_logger: dlp_logger.finalize() @dlp.log @@ -300,6 +301,13 @@ def derive_configurations(self, file_list_train=None, file_list_eval=None): logging.info(f"Discovered custom data reader {class_name}") self.reader_class = obj break + self.native_data_loader = False + if self.data_loader == DataLoaderType.TENSORFLOW: + if self.format == FormatType.TFRECORD: + self.native_data_loader = True + elif self.data_loader == DataLoaderType.NATIVE_DALI: + if self.format in [FormatType.JPEG, FormatType.PNG, FormatType.NPY, FormatType.TFRECORD]: + self.native_data_loader = True @dlp.log def build_sample_map_iter(self, file_list, total_samples, epoch_number): @@ -369,18 +377,20 @@ def reconfigure(self, epoch_number, dataset_type): np.random.seed(self.seed) np.random.shuffle(self.file_list_train) if dataset_type is DatasetType.TRAIN else np.random.shuffle( self.file_list_eval) - if self.data_loader_sampler == DataLoaderSampler.ITERATIVE: - if dataset_type is DatasetType.TRAIN: - global_file_map = self.build_sample_map_iter(self.file_list_train, 
self.total_samples_train, - epoch_number) - else: - global_file_map = self.build_sample_map_iter(self.file_list_eval, self.total_samples_eval, epoch_number) - self.file_map = global_file_map[self.my_rank] - elif self.data_loader_sampler == DataLoaderSampler.INDEX: - if dataset_type is DatasetType.TRAIN: - self.global_index_map = self.get_global_map_index(self.file_list_train, self.total_samples_train) - else: - self.global_index_map = self.get_global_map_index(self.file_list_eval, self.total_samples_eval) + # the code assumes that file and sample shuffling is handled by the native data loader code. + if not self.native_data_loader: + if self.data_loader_sampler == DataLoaderSampler.ITERATIVE: + if dataset_type is DatasetType.TRAIN: + global_file_map = self.build_sample_map_iter(self.file_list_train, self.total_samples_train, + epoch_number) + else: + global_file_map = self.build_sample_map_iter(self.file_list_eval, self.total_samples_eval, epoch_number) + self.file_map = global_file_map[self.my_rank] + elif self.data_loader_sampler == DataLoaderSampler.INDEX: + if dataset_type is DatasetType.TRAIN: + self.global_index_map = self.get_global_map_index(self.file_list_train, self.total_samples_train) + else: + self.global_index_map = self.get_global_map_index(self.file_list_eval, self.total_samples_eval) def LoadConfig(args, config): diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py index ed5fbd5e..8872f2ec 100644 --- a/dlio_benchmark/utils/utility.py +++ b/dlio_benchmark/utils/utility.py @@ -34,7 +34,7 @@ # UTC timestamp format with microsecond precision from dlio_benchmark.common.enumerations import LoggerType, MPIState try: - from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile, DLIO_PROFILER_ENABLE + from dftracer.logger import dftracer as PerfTrace, dft_fn as Profile, DFTRACER_ENABLE as DFTRACER_ENABLE except: class Profile(object): def __init__(self, name=None, cat=None): @@ -51,15 +51,15 @@ def 
__exit__(self, type, value, traceback): return def update(self, *, epoch=0, step=0, size=0, default=None): return - class dlio_logger(object): + class dftracer(object): def __init__(self,): self.type = None def initialize_log(self, logfile=None, data_dir=None, process_id=-1): return def iter(self, a): return a - PerfTrace = dlio_logger() - DLIO_PROFILER_ENABLE = False + PerfTrace = dftracer() + DFTRACER_ENABLE = False LOG_TS_FORMAT = "%Y-%m-%dT%H:%M:%S.%f" diff --git a/docs/source/config.rst b/docs/source/config.rst index b8c6f25e..79ee1717 100644 --- a/docs/source/config.rst +++ b/docs/source/config.rst @@ -378,7 +378,7 @@ profiling .. note:: We support multi-level profiling using: - * ``dlio_profiler``: https://github.com/hariharan-devarajan/dlio-profiler. DLIO_PROFILER_ENABLE=1 has to be set to enable profiler. + * ``dftracer``: https://github.com/hariharan-devarajan/dftracer. DFTRACER_ENABLE=1 has to be set to enable profiler. Please refer to :ref:`profiling` on how to enable these profiling tools. How to create a DLIO configuration YAML file diff --git a/docs/source/index.rst b/docs/source/index.rst index decab989..100bd624 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -7,7 +7,7 @@ Deep Learning I/O (`DLIO`) Benchmark is a benchmark suite aiming at emulating th The main features of `DLIO` include: * Easy-to-use configuration through YAML files which represent the I/O process of different deep learing applications. * Easy-to-use data generator capable to generate synthetic datasets of different formats, different data organizations and layouts. - * Full transparency over emulation of I/O access with logging and profiling at different levels with DLIO profiler. + * Full transparency over emulation of I/O access with logging and profiling at different levels with DFTracer. * Supporting emulating both sequential training and distributed data parallel training. GitHub repo: https://github.com/argonne-lcf/dlio_benchmark. 
diff --git a/docs/source/instructions_lassen.rst b/docs/source/instructions_lassen.rst index ba043e0d..a1cdd2ca 100644 --- a/docs/source/instructions_lassen.rst +++ b/docs/source/instructions_lassen.rst @@ -112,11 +112,11 @@ Running the Benchmark jsrun --bind packed:4 --smpiargs="-gpu" --nrs 1 --rs_per_host 1 --tasks_per_rs 4 --launch_distribution packed --cpu_per_rs ALL_CPUS --gpu_per_rs ALL_GPUS dlio_benchmark workload=resnet50 ++workload.workflow.generate_data=False ++workload.workflow.train=True -If you want to use a profiler: Same example with using DLIO Profiler, isting the io devices you would like to trace: +If you want to use a profiler: Same example with using DFTracer, listing the I/O devices you would like to trace: .. code-block:: bash - export DLIO_PROFILER_ENABLE=1 + export DFTRACER_ENABLE=1 jsrun --bind packed:4 --smpiargs="-gpu" --nrs 1 --rs_per_host 1 --tasks_per_rs 4 --launch_distribution packed --cpu_per_rs ALL_CPUS --gpu_per_rs ALL_GPUS dlio_benchmark workload=resnet50 ++workload.workflow.generate_data=False ++workload.workflow.profiling=True All the outputs will be stored in ```hydra_log/WORKLOAD/$DATE-$TIME``` folder, where WORKLOAD could be `cosmoflow` etc or in our examples resnet50 if you are using the existing workloads. If you are using a custom workload this will be in the absolute path that you specified in your .yaml file. diff --git a/docs/source/profiling.rst b/docs/source/profiling.rst index 33106d10..ce35a55b 100644 --- a/docs/source/profiling.rst +++ b/docs/source/profiling.rst @@ -2,7 +2,7 @@ Profiling ========================== -We have a built in support for iostat and DLIO profiler for I/O profiling. Below are instructions on how to use the two profiling tools in `DLIO`. +We have a built in support for iostat and DFTracer for I/O profiling. Below are instructions on how to use the two profiling tools in `DLIO`. iostat profiling --------------------- @@ -279,28 +279,28 @@ The output is ...
-DLIO profiler +DFTracer -------------------------- -ttps://github.com/hariharan-devarajan/dlio-profiler. A profiler developed for capturing I/O calls. If DLIO profiler is enabled, profiling trace will be generated at the end of the run. The profiler provides profiling information at both application levels and system I/O calls level. +https://github.com/hariharan-devarajan/dftracer. A profiler developed for capturing I/O calls. If DFTracer is enabled, profiling trace will be generated at the end of the run. The profiler provides profiling information at both application levels and system I/O calls level. -To enable this functionality, one has to install DLIO profiler throught +To enable this functionality, one has to install DFTracer through .. code-block:: bash - pip install dlio-profiler + pip install dftracer or .. code-block:: bash - git clone git@github.com:hariharan-devarajan/dlio-profiler.git - cd dlio-profiler + git clone git@github.com:hariharan-devarajan/dftracer.git + cd dftracer python setup.py build python setup.py install -Then set ```DLIO_PROFILER_ENABLE=1``` to enable it. Other environemnt variables setting can be found here: https://dlio-profiler.readthedocs.io/en/latest/api.html#configurations-of-dlio-profiler. +Then set ```DFTRACER_ENABLE=1``` to enable it. Other environment variable settings can be found here: https://dftracer.readthedocs.io/en/latest/api.html#configurations-of-dftracer. The profiler outputs all profiling output in /.trace*.pfw files. It contains application level profiling as well as low-level I/O calls from POSIX and STDIO layers. diff --git a/docs/source/run.rst b/docs/source/run.rst index 6a6860ee..c1569e24 100644 --- a/docs/source/run.rst +++ b/docs/source/run.rst @@ -75,19 +75,19 @@ These files are in chrome tracing's json line format.
This can be visualized usi Full Stack Profiling ''''''''''''''''''''' -DLIO_Benchmark has a optional full stack profiler called `dlio-profiler https://github.com/hariharan-devarajan/dlio-profiler`_. +DLIO_Benchmark has an optional full stack profiler called `dftracer <https://github.com/hariharan-devarajan/dftracer>`_. Installing Profiler ******************* -Installing just dlio-profiler +Installing just dftracer .. code-block:: bash - pip install git+https://github.com/hariharan-devarajan/dlio-profiler.git@dev + pip install git+https://github.com/hariharan-devarajan/dftracer.git@dev -DLIO Profiler is always installed along with dlio_benchmark +DFTracer is always installed along with dlio_benchmark .. code-block:: bash diff --git a/pyproject.toml b/pyproject.toml index 66900c39..2570ed92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,12 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[tool.pytest] +timeout = 3000 + [tool.pytest.ini_options] log_cli = true log_cli_level = "INFO" log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)" log_cli_date_format = "%Y-%m-%d %H:%M:%S" -[pytest] -timeout = 3000 -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 537a6552..3374aeb7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,57 +1,18 @@ -absl-py>=1.3.0 -antlr4-python3-runtime>=4.9.3 -astunparse>=1.6.3 -cachetools>=5.2.0 -certifi>=2022.9.24 -charset-normalizer>=2.1.1 -dlio_profiler_py==0.0.5 -flatbuffers>=23.5.26 -gast>=0.4.0 -google-auth>=2.14.1 -google-auth-oauthlib>=0.7.0 -google-pasta>=0.2.0 -grpcio>=1.51.0 -h5py>=3.7.0 -hydra-core>=1.2.0 -idna>=3.4 -keras>=2.15.0 -libclang>=14.0.6 -Markdown>=3.4.1 -MarkupSafe>=2.1.1 -mpi4py>=3.1.4 -numpy>=1.23.5 +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://developer.download.nvidia.com/compute/redist +
+Pillow~=9.3.0 +PyYAML~=6.0.0 +hydra-core==1.3.2 +mpi4py~=3.1.4 +numpy~=1.23.5 nvidia-dali-cuda110>=1.34.0 -oauthlib>=3.2.2 -omegaconf>=2.2.3 -opt-einsum>=3.3.0 -packaging>=21.3 -pandas>=1.5.1 -Pillow>=9.3.0 -protobuf>=4.23.4 -psutil>=5.9.8 -pyasn1>=0.4.8 -pyasn1-modules>=0.2.8 -pyparsing>=3.0.9 -python-dateutil>=2.8.2 -pytz>=2022.6 -PyYAML>=6.0 -requests>=2.28.1 -requests-oauthlib>=1.3.1 -rsa>=4.9 -six>=1.16.0 -tensorboard>=2.11.0 -tensorboard-data-server>=0.7.2 -tensorboard-plugin-wit>=1.8.1 +omegaconf~=2.2.0 +pandas~=1.5.1 +psutil~=5.9.8 +pydftracer==1.0.2 +pytest tensorflow>=2.11.0 -tensorflow-io>=0.28.0 -tensorflow-estimator>=2.11.0 -termcolor>=2.1.1 -# Use cpu version of torch ---extra-index-url https://download.pytorch.org/whl/cpu torch>=2.2.0 -torchaudio>=2.2.0 -torchvision>=0.17.0 -typing_extensions>=4.9.0 -urllib3>=1.26.12 -Werkzeug>=2.2.2 -wrapt>=1.14.1 -psutil>=5.9.5 +torch>=2.2.0 +torchaudio +torchvision diff --git a/setup.py b/setup.py index 6e97063e..49e0176d 100644 --- a/setup.py +++ b/setup.py @@ -1,56 +1,58 @@ -from setuptools import setup, find_namespace_packages -from glob import glob from distutils import util -configs = glob('dlio_benchmark/configs/**/*', recursive=True) -print(configs) -import pathlib, pkg_resources -import os -os.system("python -m pip install -r requirements.txt") +from setuptools import find_namespace_packages, setup +import pathlib + +HYDRA_VERSION = "1.3.2" + test_deps = [ - 'pytest', + "pytest", ] core_deps = [ - 'mpi4py>=3.1.4', - 'numpy>=1.23.5', - 'h5py>=3.7.0', - 'pandas>=1.5.1', - 'psutil', - 'dlio_profiler_py==0.0.3' + "Pillow~=9.3.0", + "PyYAML~=6.0.0", + "h5py~=3.11.0", + "mpi4py~=3.1.4", + "numpy~=1.23.5", + "omegaconf~=2.2.0", + "pandas~=1.5.1", + "psutil~=5.9.8", + "pydftracer==1.0.2", ] x86_deps = [ - 'hydra-core >= 1.2.0', - 'tensorflow >= 2.11', - 'torch >= 2.2', - 'torchaudio', - 'torchvision', - 'nvidia-dali-cuda110' + f"hydra-core=={HYDRA_VERSION}", + "nvidia-dali-cuda110>=1.34.0", + 
"tensorflow>=2.11.0", + "torch>=2.2.0", + "torchaudio", + "torchvision", ] ppc_deps = [ - 'hydra-core @ git+https://github.com/facebookresearch/hydra.git@v1.3.2#egg=hydra-core' + f"hydra-core @ git+https://github.com/facebookresearch/hydra.git@v{HYDRA_VERSION}#egg=hydra-core" ] deps = core_deps if "ppc" in util.get_platform(): - deps.extend(ppc_deps) + deps.extend(ppc_deps) else: - deps.extend(x86_deps) -print(deps) + deps.extend(x86_deps) + extras = { - 'test': test_deps, + "test": test_deps, } -import pathlib + here = pathlib.Path(__file__).parent.resolve() long_description = (here / "README.md").read_text(encoding="utf-8") + setup( - name='dlio_benchmark', - version='2.0', + name="dlio_benchmark", + version="2.0.0", description="An I/O benchmark for deep Learning applications", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/argonne-lcf/dlio_benchmark", author="Huihuo Zheng, Hariharan Devarajan (Hari)", - email="zhenghh04@gmail.com, mani.hariharan@gmail.com", + author_email="zhenghh04@gmail.com, mani.hariharan@gmail.com", classifiers=[ # Optional # How mature is this project? Common values are # 3 - Alpha @@ -58,10 +60,10 @@ # 5 - Production/Stable "Development Status :: 5 - Production/Stable", # Indicate who your project is intended for - "Intended Audience :: HPC", + "Intended Audience :: Science/Research", "Topic :: Software Development :: Build Tools", # Pick your license as you wish - "License :: OSI Approved :: Apache 2.0 License", + "License :: OSI Approved :: Apache Software License", # Specify the Python versions you support here. In particular, ensure # that you indicate you support Python 3. These classifiers are *not* # checked by 'pip install'. See instead 'python_requires' below. 
@@ -69,6 +71,8 @@ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3 :: Only", ], keywords="deep learning, I/O, benchmark, NPZ, pytorch benchmark, tensorflow benchmark", @@ -79,22 +83,23 @@ # Main package definition packages=find_namespace_packages(where="."), package_dir={"dlio_benchmark": "dlio_benchmark"}, - package_data={'dlio_benchmark.configs': ['*.yaml'], - 'dlio_benchmark.configs.hydra.help': ['*.yaml'], - 'dlio_benchmark.configs.hydra.job_logging': ['*.yaml'], - 'dlio_benchmark.configs.workload': ['*.yaml'], - }, + package_data={ + "dlio_benchmark.configs": ["*.yaml"], + "dlio_benchmark.configs.hydra.help": ["*.yaml"], + "dlio_benchmark.configs.hydra.job_logging": ["*.yaml"], + "dlio_benchmark.configs.workload": ["*.yaml"], + }, dependency_links=[ - 'https://download.pytorch.org/whl/cpu', - 'https://developer.download.nvidia.com/compute/redist' + "https://download.pytorch.org/whl/cpu", + "https://developer.download.nvidia.com/compute/redist", ], install_requires=deps, tests_require=test_deps, extras_require=extras, entry_points={ - 'console_scripts': [ - 'dlio_benchmark = dlio_benchmark.main:main', - 'dlio_postprocessor = dlio_benchmark.postprocessor:main', + "console_scripts": [ + "dlio_benchmark = dlio_benchmark.main:main", + "dlio_postprocessor = dlio_benchmark.postprocessor:main", ] - } + }, )