diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 00000000..e834b4a2 --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,53 @@ +name: Release + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + release-build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.x" + + - name: Build release distributions + run: | + # NOTE: put your own distribution build steps here. + python -m pip install build + python -m build + + - name: Upload distributions + uses: actions/upload-artifact@v4 + with: + name: release-dists + path: dist/ + + pypi-publish: + runs-on: ubuntu-latest + + needs: + - release-build + + permissions: + id-token: write + + steps: + - name: Retrieve release distributions + uses: actions/download-artifact@v4 + with: + name: release-dists + path: dist/ + + - name: Publish release distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_DLIO_TOKEN }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..8fe5ce04 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,241 @@ +name: Build and Test + +on: + pull_request: + branches: [main, dev] + push: + +jobs: + build-and-test: + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04] + gcc: [10] + python: ["3.9", "3.10", "3.11"] + venv: ["via-setup", "via-reqs"] + name: ${{ matrix.os }}-${{ matrix.gcc }}-${{ matrix.python }}-${{ matrix.venv }} + runs-on: ${{ matrix.os }} + env: + CC: gcc-${{ matrix.gcc }} + CXX: g++-${{ matrix.gcc }} + DFTRACER_BUILD_TYPE: "Debug" + DFTRACER_ENABLE: 1 + DFTRACER_LOG_LEVEL: "DEBUG" + DLIO_EXEC: ${{ matrix.venv == 'via-setup' && 'dlio_benchmark' || 'python dlio_benchmark/main.py' }} + GOTCHA_DEBUG: 3 + OMPI_ALLOW_RUN_AS_ROOT: 1 + OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 + PYTHON_VER: ${{ matrix.python }} + 
RDMAV_FORK_SAFE: "1" + VENV_PATH: "/home/runner/work/.venv/${{ matrix.venv }}" + steps: + - name: Clear disc + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Push checkout + if: github.event_name == 'push' + uses: actions/checkout@v3 + - name: PR checkout + if: github.event_name == 'pull_request' + uses: actions/checkout@v3 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Set up Python ${{ matrix.python }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python }} + - name: Add current directory to PYTHONPATH + if: matrix.venv == 'via-reqs' + run: echo "PYTHONPATH=$(pwd):$PYTHONPATH" >> $GITHUB_ENV + - name: Cache install modules + id: cache-modules + uses: actions/cache@v3 + with: + path: ${{ env.VENV_PATH }} + key: ${{ matrix.venv }}-gcc${{ matrix.gcc }}-python${{ matrix.python }}-${{ hashFiles('requirements.txt', 'setup.py') }} + - name: Install system dependencies + run: | + sudo apt update + sudo apt-get install -y $CC $CXX libc6 git + sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev python3-dev + - name: Install DLIO via setup.py + if: matrix.venv == 'via-setup' && steps.cache-modules.outputs.cache-hit != 'true' + run: | + echo "venv: ${VENV_PATH} - gcc: $CC" + python -m venv ${VENV_PATH} + source ${VENV_PATH}/bin/activate + pip install --upgrade pip + pip install .[test] + - name: Install DLIO via requirements.txt + if: matrix.venv == 'via-reqs' && steps.cache-modules.outputs.cache-hit != 'true' + run: | + echo "venv: ${VENV_PATH} - gcc: $CC" + python -m venv ${VENV_PATH} + source ${VENV_PATH}/bin/activate + pip install --upgrade pip + pip install -r requirements.txt + - name: test_gen_data + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_gen_data[png-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[npz-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[jpeg-tensorflow] 
-v + mpirun -np 2 pytest -k test_gen_data[tfrecord-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[hdf5-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[indexed_binary-tensorflow] -v + mpirun -np 2 pytest -k test_gen_data[mmap_indexed_binary-tensorflow] -v + rm -rf data + - name: test_custom_storage_root_gen_data + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_storage_root_gen_data[png-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[npz-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[jpeg-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[tfrecord-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[hdf5-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[indexed_binary-tensorflow] -v + mpirun -np 2 pytest -k test_storage_root_gen_data[mmap_indexed_binary-tensorflow] -v + rm -rf data + - name: test_train + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_train[png-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[npz-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[jpeg-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[tfrecord-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[hdf5-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[csv-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[png-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[npz-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[jpeg-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[hdf5-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[csv-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[png-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[npz-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[jpeg-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[hdf5-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[csv-tensorflow-dali] -v + mpirun -np 
2 pytest -k test_train[png-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[npz-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[jpeg-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[hdf5-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[csv-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-dali] -v + mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-tensorflow] -v + mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-pytorch] -v + mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-dali] -v + mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-dali] -v + rm -rf data + - name: test_custom_storage_root_train + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_custom_storage_root_train[png-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[npz-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[tfrecord-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[csv-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[png-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[npz-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[csv-pytorch] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-pytorch] -v + mpirun -np 2 pytest -k 
test_custom_storage_root_train[mmap_indexed_binary-tensorflow] -v + mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-pytorch] -v + rm -rf data + - name: test_checkpoint_epoch + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers0-2-layer_params0-all_ranks] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers1-2-layer_params1-all_ranks] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers2-2-layer_params2-rank_zero] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers3-2-layer_params3-rank_zero] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers4-1-layer_params4-all_ranks] -v + mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers5-1-layer_params5-all_ranks] -v + rm -rf data + - name: test_checkpoint_step + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_checkpoint_step -v + - name: test_eval + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_eval -v + - name: test_multi_threads + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_multi_threads[tensorflow-0] -v + mpirun -np 2 pytest -k test_multi_threads[tensorflow-1] -v + mpirun -np 2 pytest -k test_multi_threads[tensorflow-2] -v + mpirun -np 2 pytest -k test_multi_threads[pytorch-0] -v + mpirun -np 2 pytest -k test_multi_threads[pytorch-1] -v + mpirun -np 2 pytest -k test_multi_threads[pytorch-2] -v + rm -rf data + - name: test-pytorch-multiprocessing-context + run: | + source ${VENV_PATH}/bin/activate + mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[0-None] -v + mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[1-fork] -v + mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-forkserver] -v + mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-spawn] -v + rm -rf data + - name: test_subset + run: | + source 
${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 pytest -k test_subset -v + rm -rf data + - name: test-tf-loader-tfrecord + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 ++workload.train.computation_time=0.01 ++workload.train.epochs=1 + rm -rf data + - name: test-torch-loader-npz + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=0 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + rm -rf data + - name: test-tf-loader-npz + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 
++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + rm -rf data + - name: test_unet3d + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 + mpirun -np 2 ${DLIO_EXEC} workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 ++workload.dataset.format=synthetic + rm -rf data + - name: test_resnet50 + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 + mpirun -np 2 ${DLIO_EXEC} workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.format=synthetic + rm -rf data + - name: test_cosmoflow + run: | + source ${VENV_PATH}/bin/activate + rm -rf output data checkpoints + mpirun -np 2 ${DLIO_EXEC} workload=cosmoflow_a100 
++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 + mpirun -np 2 ${DLIO_EXEC} workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 + mpirun -np 2 ${DLIO_EXEC} workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.format=synthetic + rm -rf data diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml index 07173e34..bdb2ab26 100644 --- a/.github/workflows/jekyll-gh-pages.yml +++ b/.github/workflows/jekyll-gh-pages.yml @@ -1,5 +1,5 @@ # Sample workflow for building and deploying a Jekyll site to GitHub Pages -name: Deploy Jekyll with GitHub Pages dependencies preinstalled +name: Deploy Documentation on: # Runs on pushes targeting the default branch @@ -51,5 +51,5 @@ jobs: - name: Deploy to GitHub Pages id: deployment uses: actions/deploy-pages@v1 - with: + with: folder: _build/html/ diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml deleted file mode 100644 index 78012e0e..00000000 --- a/.github/workflows/python-package-conda.yml +++ /dev/null @@ -1,234 +0,0 @@ -name: Python Package using Conda - -on: - pull_request: - branches: [ main, dev ] - push: - -jobs: - build-and-test: - strategy: - fail-fast: false - matrix: - os: [ ubuntu-20.04, ubuntu-22.04 ] - profiler: [ 0, 1 ] - gcc: [10] - python: ["3.8", "3.9", "3.10" ] - name: ${{ matrix.os }}-${{ matrix.profiler }}-${{ matrix.gcc }}-${{ matrix.python }} - runs-on: ${{ matrix.os }} - env: - VENV: "/home/runner/work/venv" - DLIO_PROFILER_ENABLE: ${{ matrix.profiler }} - CC: gcc-${{ matrix.gcc }} - CXX: g++-${{ matrix.gcc }} - RDMAV_FORK_SAFE: "1" - PYTHON_VER: ${{ matrix.python }} - DLIO_PROFILER_LOG_LEVEL: "INFO" - GOTCHA_DEBUG: 3 - steps: - - name: clear disc - run: | - sudo rm -rf /usr/share/dotnet - sudo rm -rf /opt/ghc - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf 
"$AGENT_TOOLSDIRECTORY" - - name: Push checkout - if: github.event_name == 'push' - uses: actions/checkout@v3 - - name: PR checkout - if: github.event_name == 'pull_request' - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.head.sha }} - - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python }} - - name: Cache install modules - id: cache-modules - uses: actions/cache@v3 - with: - path: ${{ env.VENV }} - key: ${{env.VENV }}-${{env.DLIO_PROFILER}}-${{ matrix.gcc }}-${{ matrix.python }}-${{ hashFiles('setup.py') }} - - name: Install System Tools - run: | - sudo apt update - sudo apt-get install $CC $CXX libc6 git - sudo apt-get install mpich libhwloc-dev - - name: Install DLIO code only - if: steps.cache-modules.outputs.cache-hit == 'true' - run: | - source ${VENV}/bin/activate - rm -rf *.egg* - rm -rf build - rm -rf dist - pip uninstall -y dlio_benchmark - python setup.py build - python setup.py install - - name: Install DLIO - if: steps.cache-modules.outputs.cache-hit != 'true' - run: | - echo "Profiler ${DLIO_PROFILER} gcc $CC" - python -m pip install --upgrade pip - pip install virtualenv - python -m venv ${VENV} - source ${VENV}/bin/activate - pip install .[test] - - name: Install DLIO Profiler - run: | - echo "Profiler ${DLIO_PROFILER} gcc $CC" - source ${VENV}/bin/activate - pip install --force-reinstall dlio_profiler_py - - name: test_gen_data - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_gen_data[png-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[npz-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[jpeg-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[tfrecord-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[hdf5-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[indexed_binary-tensorflow] -v - mpirun -np 2 pytest -k test_gen_data[mmap_indexed_binary-tensorflow] -v - rm -rf data - - name: test_custom_storage_root_gen_data - 
run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_storage_root_gen_data[png-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[npz-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[jpeg-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[tfrecord-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[hdf5-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[indexed_binary-tensorflow] -v - mpirun -np 2 pytest -k test_storage_root_gen_data[mmap_indexed_binary-tensorflow] -v - rm -rf data - - name: test_train - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_train[png-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[npz-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[jpeg-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[tfrecord-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[hdf5-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[csv-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[png-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[npz-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[jpeg-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[hdf5-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[csv-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[png-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[npz-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[jpeg-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[hdf5-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[csv-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[png-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[npz-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[jpeg-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[hdf5-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[csv-pytorch-dali] -v - mpirun -np 2 pytest -k 
test_train[indexed_binary-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-dali] -v - mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-tensorflow] -v - mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-pytorch] -v - mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-dali] -v - mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-dali] -v - rm -rf data - - name: test_custom_storage_root_train - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_custom_storage_root_train[png-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[npz-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[tfrecord-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[csv-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[png-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[npz-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[csv-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-pytorch] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-tensorflow] -v - mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-pytorch] -v - rm -rf data - - name: test_checkpoint_epoch - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k 
test_checkpoint_epoch[tensorflow-1024-optimizers0-2-layer_params0-all_ranks] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers1-2-layer_params1-all_ranks] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers2-2-layer_params2-rank_zero] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers3-2-layer_params3-rank_zero] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers4-1-layer_params4-all_ranks] -v - mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers5-1-layer_params5-all_ranks] -v - rm -rf data - - name: test_checkpoint_step - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_checkpoint_step -v - - name: test_eval - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_eval -v - - name: test_multi_threads - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_multi_threads[tensorflow-0] -v - mpirun -np 2 pytest -k test_multi_threads[tensorflow-1] -v - mpirun -np 2 pytest -k test_multi_threads[tensorflow-2] -v - mpirun -np 2 pytest -k test_multi_threads[pytorch-0] -v - mpirun -np 2 pytest -k test_multi_threads[pytorch-1] -v - mpirun -np 2 pytest -k test_multi_threads[pytorch-2] -v - rm -rf data - - name: test-pytorch-multiprocessing-context - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[0-None] -v - mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[1-fork] -v - mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-forkserver] -v - mpirun -np 2 pytest -k test_pytorch_multiprocessing_context[2-spawn] -v - rm -rf data - - name: test-tf-loader-tfrecord - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 - mpirun -np 2 
dlio_benchmark workload=resnet50_tf ++workload.dataset.num_files_train=64 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=4 ++workload.dataset.num_samples_per_file=16 ++workload.train.computation_time=0.01 ++workload.train.epochs=1 - rm -rf data - - name: test-torch-loader-npz - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=8 ++workload.dataset.num_files_eval=8 ++workload.reader.read_threads=0 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - rm -rf data - - name: test-tf-loader-npz - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True 
++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - rm -rf data - - name: test_subset - run: | - source ${VENV}/bin/activate - mpirun -np 2 pytest -k test_subset -v - - name: test_unet3d - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=unet3d_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 - mpirun -np 2 dlio_benchmark workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 - mpirun -np 2 dlio_benchmark workload=unet3d_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=42 ++workload.dataset.format=synthetic - rm -rf data - - name: test_resnet50 - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=resnet50_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 - mpirun -np 2 dlio_benchmark workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 - mpirun -np 2 dlio_benchmark workload=resnet50_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=4 ++workload.dataset.format=synthetic - rm -rf data - - name: test_cosmoflow - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=cosmoflow_a100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 - mpirun -np 2 dlio_benchmark workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 - mpirun -np 2 dlio_benchmark workload=cosmoflow_h100 ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.format=synthetic - rm -rf data \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index 49623d51..3ee4b4c1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ 
-include requirements.txt -recursive-include configs * \ No newline at end of file +prune docs +recursive-include dlio_benchmark/configs *.yaml \ No newline at end of file diff --git a/README.md b/README.md index 2d474d4d..176ed327 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Deep Learning I/O (DLIO) Benchmark -![test status](https://github.com/argonne-lcf/dlio_benchmark/actions/workflows/python-package-conda.yml/badge.svg) +![test status](https://github.com/argonne-lcf/dlio_benchmark/actions/workflows/ci.yml/badge.svg) This README provides an abbreviated documentation of the DLIO code. Please refer to https://dlio-benchmark.readthedocs.io for full user documentation. @@ -22,7 +22,7 @@ dlio_benchmark ++workload.workflow.generate_data=True ```bash git clone https://github.com/argonne-lcf/dlio_benchmark cd dlio_benchmark/ -pip install .[dlio_profiler] +pip install .[pydftracer] ``` ## Container @@ -85,10 +85,10 @@ Finally, run the benchmark ```bash mpirun -np 8 dlio_benchmark workload=unet3d ``` -Finally, run the benchmark with Profiler +Finally, run the benchmark with Tracer ```bash - export DLIO_PROFILER_ENABLE=1 - export DLIO_PROFILER_INC_METADATA=1 + export DFTRACER_ENABLE=1 + export DFTRACER_INC_METADATA=1 mpirun -np 8 dlio_benchmark workload=unet3d ``` diff --git a/dev-requirements.txt b/dev-requirements.txt deleted file mode 100644 index d6c1bd55..00000000 --- a/dev-requirements.txt +++ /dev/null @@ -1,62 +0,0 @@ -# Use cpu version of torch ---extra-index-url https://download.pytorch.org/whl/cpu ---extra-index-url https://developer.download.nvidia.com/compute/redist - -absl-py==1.3.0 -antlr4-python3-runtime==4.9.3 -astunparse==1.6.3 -cachetools==5.2.0 -certifi==2022.9.24 -charset-normalizer==2.1.1 -flatbuffers==22.10.26 -gast==0.4.0 -google-auth==2.14.1 -google-auth-oauthlib==0.4.6 -google-pasta==0.2.0 -grpcio==1.51.0 -h5py==3.7.0 -hydra-core==1.2.0 -idna==3.4 -keras==2.11.0 -libclang==14.0.6 -Markdown==3.4.1 -MarkupSafe==2.1.1 -mpi4py==3.1.4 
-numpy==1.23.5 -oauthlib==3.2.2 -omegaconf==2.2.3 -opt-einsum==3.3.0 -packaging==21.3 -pandas==1.5.1 -Pillow==9.3.0 -protobuf==3.19.6 -pyasn1==0.4.8 -pyasn1-modules==0.2.8 -pyparsing==3.0.9 -python-dateutil==2.8.2 -pytz==2022.6 -PyYAML==6.0 -requests==2.28.1 -requests-oauthlib==1.3.1 -rsa==4.9 -six==1.16.0 -tensorboard==2.11.0 -tensorboard-data-server==0.6.1 -tensorboard-plugin-wit==1.8.1 -tensorflow==2.11.0 -tensorflow-io==0.28.0 -tensorflow-estimator==2.11.0 -termcolor==2.1.1 -torch==1.13.0 -torchaudio==0.13.0 -torchvision==0.14.0 -typing_extensions==4.4.0 -urllib3==1.26.12 -Werkzeug==2.2.2 -wrapt==1.14.1 -pytest -pytest-mpi -pytest-subtests -pytest-timeout -nvidia-dali-cuda110 -psutil \ No newline at end of file diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py index 2c605914..2101d61d 100644 --- a/dlio_benchmark/common/enumerations.py +++ b/dlio_benchmark/common/enumerations.py @@ -184,7 +184,7 @@ class LoggerType(Enum): Logger types supported by the benchmark. 
""" DEFAULT = 'default' - DLIO_PROFILER = 'dlio_profiler' + DFTRACER = 'dftracer' def __str__(self): return self.value diff --git a/dlio_benchmark/data_loader/native_dali_data_loader.py b/dlio_benchmark/data_loader/native_dali_data_loader.py index e7f2970c..3755a23d 100644 --- a/dlio_benchmark/data_loader/native_dali_data_loader.py +++ b/dlio_benchmark/data_loader/native_dali_data_loader.py @@ -13,7 +13,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile +from dlio_benchmark.utils.utility import PerfTrace, Profile dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/data_loader/torch_data_loader.py b/dlio_benchmark/data_loader/torch_data_loader.py index e72559ef..c18cdbbe 100644 --- a/dlio_benchmark/data_loader/torch_data_loader.py +++ b/dlio_benchmark/data_loader/torch_data_loader.py @@ -60,7 +60,7 @@ def worker_init(self, worker_id): pickle.loads(self.serial_args) _args = ConfigArguments.get_instance() _args.configure_dlio_logging(is_child=True) - self.dlp_logger = _args.configure_dlio_profiler(is_child=True, use_pid=True) + self.dlp_logger = _args.configure_dftracer(is_child=True, use_pid=True) logging.debug(f"{utcnow()} worker initialized {worker_id} with format {self.format_type}") self.reader = ReaderFactory.get_reader(type=self.format_type, dataset_type=self.dataset_type, diff --git a/dlio_benchmark/main.py b/dlio_benchmark/main.py index 7d7b6fb7..2246edbf 100644 --- a/dlio_benchmark/main.py +++ b/dlio_benchmark/main.py @@ -94,7 +94,7 @@ def __init__(self, cfg): self.comm.barrier() # Configure the logging library self.args.configure_dlio_logging(is_child=False) - self.dlio_profiler = self.args.configure_dlio_profiler(is_child=False, use_pid=False) + self.dftracer = self.args.configure_dftracer(is_child=False, use_pid=False) with 
Profile(name=f"{self.__init__.__qualname__}", cat=MODULE_DLIO_BENCHMARK): if self.args.my_rank == 0: logging.info(f"{utcnow()} Running DLIO with {self.args.comm_size} process(es)") @@ -381,7 +381,7 @@ def finalize(self): self.stats.finalize() self.stats.save_data() self.comm.barrier() - self.args.finalize_dlio_profiler(self.dlio_profiler) + self.args.finalize_dftracer(self.dftracer) @hydra.main(version_base=None, config_path="configs", config_name="config") diff --git a/dlio_benchmark/plugins/experimental/src/checkpoint/pytorch_checkpointing.py b/dlio_benchmark/plugins/experimental/src/checkpoint/pytorch_checkpointing.py index 68b4fbaf..6d5bd2bd 100644 --- a/dlio_benchmark/plugins/experimental/src/checkpoint/pytorch_checkpointing.py +++ b/dlio_benchmark/plugins/experimental/src/checkpoint/pytorch_checkpointing.py @@ -18,7 +18,7 @@ import torch from dlio_benchmark.checkpointing.base_checkpointing import BaseCheckpointing -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.common.constants import MODULE_CHECKPOINT from dlio_benchmark.common.enumerations import CheckpointLocationType diff --git a/dlio_benchmark/plugins/experimental/src/data_loader/custom_torch_data_loader.py b/dlio_benchmark/plugins/experimental/src/data_loader/custom_torch_data_loader.py index e7c17402..c30ea77a 100644 --- a/dlio_benchmark/plugins/experimental/src/data_loader/custom_torch_data_loader.py +++ b/dlio_benchmark/plugins/experimental/src/data_loader/custom_torch_data_loader.py @@ -9,7 +9,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow, DLIOMPI -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/plugins/experimental/src/reader/custom_npz_reader.py 
b/dlio_benchmark/plugins/experimental/src/reader/custom_npz_reader.py index 857b3b24..9da296f5 100644 --- a/dlio_benchmark/plugins/experimental/src/reader/custom_npz_reader.py +++ b/dlio_benchmark/plugins/experimental/src/reader/custom_npz_reader.py @@ -19,7 +19,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) @@ -58,4 +58,4 @@ def read_index(self, image_idx, step): @dlp.log def finalize(self): - return super().finalize() \ No newline at end of file + return super().finalize() diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py index aee202d4..6876610b 100644 --- a/dlio_benchmark/reader/dali_image_reader.py +++ b/dlio_benchmark/reader/dali_image_reader.py @@ -25,7 +25,7 @@ from dlio_benchmark.utils.utility import utcnow from dlio_benchmark.common.enumerations import DatasetType, Shuffle import nvidia.dali.tfrecord as tfrec -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile +from dlio_benchmark.utils.utility import PerfTrace, Profile dlp = Profile(MODULE_DATA_READER) @@ -95,4 +95,4 @@ def is_index_based(self): return False def is_iterator_based(self): - return True \ No newline at end of file + return True diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py index e915a023..6091f360 100644 --- a/dlio_benchmark/reader/dali_npy_reader.py +++ b/dlio_benchmark/reader/dali_npy_reader.py @@ -25,7 +25,7 @@ from dlio_benchmark.utils.utility import utcnow from dlio_benchmark.common.enumerations import DatasetType, Shuffle import nvidia.dali.tfrecord as tfrec -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile +from dlio_benchmark.utils.utility import PerfTrace, Profile dlp = 
Profile(MODULE_DATA_READER) @@ -99,4 +99,4 @@ def is_index_based(self): return False def is_iterator_based(self): - return True \ No newline at end of file + return True diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py index 5bf8b3a9..99132188 100644 --- a/dlio_benchmark/reader/dali_tfrecord_reader.py +++ b/dlio_benchmark/reader/dali_tfrecord_reader.py @@ -28,7 +28,7 @@ from dlio_benchmark.utils.utility import utcnow from dlio_benchmark.common.enumerations import DatasetType, Shuffle import nvidia.dali.tfrecord as tfrec -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile +from dlio_benchmark.utils.utility import PerfTrace, Profile dlp = Profile(MODULE_DATA_READER) @@ -107,4 +107,4 @@ def is_index_based(self): return False def is_iterator_based(self): - return True \ No newline at end of file + return True diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py index 73042155..fe1a42cf 100644 --- a/dlio_benchmark/utils/config.py +++ b/dlio_benchmark/utils/config.py @@ -33,7 +33,7 @@ import math import os import numpy as np -from dlio_benchmark.utils.utility import Profile, PerfTrace, DLIO_PROFILER_ENABLE +from dlio_benchmark.utils.utility import Profile, PerfTrace, DFTRACER_ENABLE dlp = Profile(MODULE_CONFIG) @dataclass @@ -138,6 +138,7 @@ class ConfigArguments: data_loader_class = None reader_class = None checkpoint_mechanism_class = None + native_data_loader = False def __init__(self): """ Virtually private constructor. 
""" @@ -178,12 +179,12 @@ def configure_dlio_logging(self, is_child=False): # logging's max timestamp resolution is msecs, we will pass in usecs in the message ) - def configure_dlio_profiler(self, is_child=False, use_pid=False): + def configure_dftracer(self, is_child=False, use_pid=False): # with "multiprocessing_context=fork" the profiler file remains open in the child process if is_child and self.multiprocessing_context == "fork": return # Configure the profiler - if DLIO_PROFILER_ENABLE: + if DFTRACER_ENABLE: dlp_trace = get_trace_name(self.output_folder, use_pid) if DLIOMPI.get_instance().rank() == 0: logging.info(f"{utcnow()} Profiling DLIO {dlp_trace}") @@ -195,8 +196,8 @@ def configure_dlio_profiler(self, is_child=False, use_pid=False): process_id=self.my_rank) return None - def finalize_dlio_profiler(self, dlp_logger): - if DLIO_PROFILER_ENABLE and dlp_logger: + def finalize_dftracer(self, dlp_logger): + if DFTRACER_ENABLE and dlp_logger: dlp_logger.finalize() @dlp.log @@ -300,6 +301,13 @@ def derive_configurations(self, file_list_train=None, file_list_eval=None): logging.info(f"Discovered custom data reader {class_name}") self.reader_class = obj break + self.native_data_loader = False + if self.data_loader == DataLoaderType.TENSORFLOW: + if self.format == FormatType.TFRECORD: + self.native_data_loader = True + elif self.data_loader == DataLoaderType.NATIVE_DALI: + if self.format in [FormatType.JPEG, FormatType.PNG, FormatType.NPY, FormatType.TFRECORD]: + self.native_data_loader = True @dlp.log def build_sample_map_iter(self, file_list, total_samples, epoch_number): @@ -369,18 +377,20 @@ def reconfigure(self, epoch_number, dataset_type): np.random.seed(self.seed) np.random.shuffle(self.file_list_train) if dataset_type is DatasetType.TRAIN else np.random.shuffle( self.file_list_eval) - if self.data_loader_sampler == DataLoaderSampler.ITERATIVE: - if dataset_type is DatasetType.TRAIN: - global_file_map = self.build_sample_map_iter(self.file_list_train, 
self.total_samples_train, - epoch_number) - else: - global_file_map = self.build_sample_map_iter(self.file_list_eval, self.total_samples_eval, epoch_number) - self.file_map = global_file_map[self.my_rank] - elif self.data_loader_sampler == DataLoaderSampler.INDEX: - if dataset_type is DatasetType.TRAIN: - self.global_index_map = self.get_global_map_index(self.file_list_train, self.total_samples_train) - else: - self.global_index_map = self.get_global_map_index(self.file_list_eval, self.total_samples_eval) + # the code assumes that file and sample shuffling is handled by the native data loader code. + if not self.native_data_loader: + if self.data_loader_sampler == DataLoaderSampler.ITERATIVE: + if dataset_type is DatasetType.TRAIN: + global_file_map = self.build_sample_map_iter(self.file_list_train, self.total_samples_train, + epoch_number) + else: + global_file_map = self.build_sample_map_iter(self.file_list_eval, self.total_samples_eval, epoch_number) + self.file_map = global_file_map[self.my_rank] + elif self.data_loader_sampler == DataLoaderSampler.INDEX: + if dataset_type is DatasetType.TRAIN: + self.global_index_map = self.get_global_map_index(self.file_list_train, self.total_samples_train) + else: + self.global_index_map = self.get_global_map_index(self.file_list_eval, self.total_samples_eval) def LoadConfig(args, config): diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py index ed5fbd5e..8872f2ec 100644 --- a/dlio_benchmark/utils/utility.py +++ b/dlio_benchmark/utils/utility.py @@ -34,7 +34,7 @@ # UTC timestamp format with microsecond precision from dlio_benchmark.common.enumerations import LoggerType, MPIState try: - from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile, DLIO_PROFILER_ENABLE + from dftracer.logger import dftracer as PerfTrace, dft_fn as Profile, DFTRACER_ENABLE as DFTRACER_ENABLE except: class Profile(object): def __init__(self, name=None, cat=None): @@ -51,15 +51,15 @@ def 
__exit__(self, type, value, traceback): return def update(self, *, epoch=0, step=0, size=0, default=None): return - class dlio_logger(object): + class dftracer(object): def __init__(self,): self.type = None def initialize_log(self, logfile=None, data_dir=None, process_id=-1): return def iter(self, a): return a - PerfTrace = dlio_logger() - DLIO_PROFILER_ENABLE = False + PerfTrace = dftracer() + DFTRACER_ENABLE = False LOG_TS_FORMAT = "%Y-%m-%dT%H:%M:%S.%f" diff --git a/docs/source/config.rst b/docs/source/config.rst index b8c6f25e..79ee1717 100644 --- a/docs/source/config.rst +++ b/docs/source/config.rst @@ -378,7 +378,7 @@ profiling .. note:: We support multi-level profiling using: - * ``dlio_profiler``: https://github.com/hariharan-devarajan/dlio-profiler. DLIO_PROFILER_ENABLE=1 has to be set to enable profiler. + * ``dftracer``: https://github.com/hariharan-devarajan/dftracer. DFTRACER_ENABLE=1 has to be set to enable profiler. Please refer to :ref:`profiling` on how to enable these profiling tools. How to create a DLIO configuration YAML file diff --git a/docs/source/index.rst b/docs/source/index.rst index decab989..100bd624 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -7,7 +7,7 @@ Deep Learning I/O (`DLIO`) Benchmark is a benchmark suite aiming at emulating th The main features of `DLIO` include: * Easy-to-use configuration through YAML files which represent the I/O process of different deep learing applications. * Easy-to-use data generator capable to generate synthetic datasets of different formats, different data organizations and layouts. - * Full transparency over emulation of I/O access with logging and profiling at different levels with DLIO profiler. + * Full transparency over emulation of I/O access with logging and profiling at different levels with DFTracer. * Supporting emulating both sequential training and distributed data parallel training. GitHub repo: https://github.com/argonne-lcf/dlio_benchmark. 
diff --git a/docs/source/instructions_lassen.rst b/docs/source/instructions_lassen.rst index ba043e0d..a1cdd2ca 100644 --- a/docs/source/instructions_lassen.rst +++ b/docs/source/instructions_lassen.rst @@ -112,11 +112,11 @@ Running the Benchmark jsrun --bind packed:4 --smpiargs="-gpu" --nrs 1 --rs_per_host 1 --tasks_per_rs 4 --launch_distribution packed --cpu_per_rs ALL_CPUS --gpu_per_rs ALL_GPUS dlio_benchmark workload=resnet50 ++workload.workflow.generate_data=False ++workload.workflow.train=True -If you want to use a profiler: Same example with using DLIO Profiler, isting the io devices you would like to trace: +If you want to use a profiler: Same example with using DFTracer, listing the I/O devices you would like to trace: .. code-block:: bash - export DLIO_PROFILER_ENABLE=1 + export DFTRACER_ENABLE=1 jsrun --bind packed:4 --smpiargs="-gpu" --nrs 1 --rs_per_host 1 --tasks_per_rs 4 --launch_distribution packed --cpu_per_rs ALL_CPUS --gpu_per_rs ALL_GPUS dlio_benchmark workload=resnet50 ++workload.workflow.generate_data=False ++workload.workflow.profiling=True All the outputs will be stored in ```hydra_log/WORKLOAD/$DATE-$TIME``` folder, where WORKLOAD could be `cosmoflow` etc or in our examples resnet50 if you are using the existing workloads. If you are using a custom workload this will be in the absolute path that you specified in your .yaml file. diff --git a/docs/source/profiling.rst b/docs/source/profiling.rst index 33106d10..ce35a55b 100644 --- a/docs/source/profiling.rst +++ b/docs/source/profiling.rst @@ -2,7 +2,7 @@ Profiling ========================== -We have a built in support for iostat and DLIO profiler for I/O profiling. Below are instructions on how to use the two profiling tools in `DLIO`. +We have a built in support for iostat and DFTracer for I/O profiling. Below are instructions on how to use the two profiling tools in `DLIO`. iostat profiling --------------------- @@ -279,28 +279,28 @@ The output is ...
-DLIO profiler +DFTracer -------------------------- -ttps://github.com/hariharan-devarajan/dlio-profiler. A profiler developed for capturing I/O calls. If DLIO profiler is enabled, profiling trace will be generated at the end of the run. The profiler provides profiling information at both application levels and system I/O calls level. +https://github.com/hariharan-devarajan/dftracer. A profiler developed for capturing I/O calls. If DFTracer is enabled, profiling trace will be generated at the end of the run. The profiler provides profiling information at both application levels and system I/O calls level. -To enable this functionality, one has to install DLIO profiler throught +To enable this functionality, one has to install DFTracer through .. code-block:: bash - pip install dlio-profiler + pip install dftracer or .. code-block:: bash - git clone git@github.com:hariharan-devarajan/dlio-profiler.git - cd dlio-profiler + git clone git@github.com:hariharan-devarajan/dftracer.git + cd dftracer python setup.py build python setup.py install -Then set ```DLIO_PROFILER_ENABLE=1``` to enable it. Other environemnt variables setting can be found here: https://dlio-profiler.readthedocs.io/en/latest/api.html#configurations-of-dlio-profiler. +Then set ```DFTRACER_ENABLE=1``` to enable it. Other environment variable settings can be found here: https://dftracer.readthedocs.io/en/latest/api.html#configurations-of-dftracer. The profiler outputs all profiling output in /.trace*.pfw files. It contains application level profiling as well as low-level I/O calls from POSIX and STDIO layers. diff --git a/docs/source/run.rst b/docs/source/run.rst index 6a6860ee..c1569e24 100644 --- a/docs/source/run.rst +++ b/docs/source/run.rst @@ -75,19 +75,19 @@ These files are in chrome tracing's json line format.
This can be visualized usi Full Stack Profiling ''''''''''''''''''''' -DLIO_Benchmark has a optional full stack profiler called `dlio-profiler https://github.com/hariharan-devarajan/dlio-profiler`_. +DLIO_Benchmark has an optional full stack profiler called `dftracer <https://github.com/hariharan-devarajan/dftracer>`_. Installing Profiler ******************* -Installing just dlio-profiler +Installing just dftracer .. code-block:: bash - pip install git+https://github.com/hariharan-devarajan/dlio-profiler.git@dev + pip install git+https://github.com/hariharan-devarajan/dftracer.git@dev -DLIO Profiler is always installed along with dlio_benchmark +DFTracer is always installed along with dlio_benchmark .. code-block:: bash diff --git a/pyproject.toml b/pyproject.toml index 66900c39..2570ed92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,12 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[tool.pytest] +timeout = 3000 + [tool.pytest.ini_options] log_cli = true log_cli_level = "INFO" log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)" log_cli_date_format = "%Y-%m-%d %H:%M:%S" -[pytest] -timeout = 3000 -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 537a6552..3374aeb7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,57 +1,18 @@ -absl-py>=1.3.0 -antlr4-python3-runtime>=4.9.3 -astunparse>=1.6.3 -cachetools>=5.2.0 -certifi>=2022.9.24 -charset-normalizer>=2.1.1 -dlio_profiler_py==0.0.5 -flatbuffers>=23.5.26 -gast>=0.4.0 -google-auth>=2.14.1 -google-auth-oauthlib>=0.7.0 -google-pasta>=0.2.0 -grpcio>=1.51.0 -h5py>=3.7.0 -hydra-core>=1.2.0 -idna>=3.4 -keras>=2.15.0 -libclang>=14.0.6 -Markdown>=3.4.1 -MarkupSafe>=2.1.1 -mpi4py>=3.1.4 -numpy>=1.23.5 +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://developer.download.nvidia.com/compute/redist +
+Pillow~=9.3.0 +PyYAML~=6.0.0 +hydra-core==1.3.2 +mpi4py~=3.1.4 +numpy~=1.23.5 nvidia-dali-cuda110>=1.34.0 -oauthlib>=3.2.2 -omegaconf>=2.2.3 -opt-einsum>=3.3.0 -packaging>=21.3 -pandas>=1.5.1 -Pillow>=9.3.0 -protobuf>=4.23.4 -psutil>=5.9.8 -pyasn1>=0.4.8 -pyasn1-modules>=0.2.8 -pyparsing>=3.0.9 -python-dateutil>=2.8.2 -pytz>=2022.6 -PyYAML>=6.0 -requests>=2.28.1 -requests-oauthlib>=1.3.1 -rsa>=4.9 -six>=1.16.0 -tensorboard>=2.11.0 -tensorboard-data-server>=0.7.2 -tensorboard-plugin-wit>=1.8.1 +omegaconf~=2.2.0 +pandas~=1.5.1 +psutil~=5.9.8 +pydftracer==1.0.2 +pytest tensorflow>=2.11.0 -tensorflow-io>=0.28.0 -tensorflow-estimator>=2.11.0 -termcolor>=2.1.1 -# Use cpu version of torch ---extra-index-url https://download.pytorch.org/whl/cpu torch>=2.2.0 -torchaudio>=2.2.0 -torchvision>=0.17.0 -typing_extensions>=4.9.0 -urllib3>=1.26.12 -Werkzeug>=2.2.2 -wrapt>=1.14.1 -psutil>=5.9.5 +torch>=2.2.0 +torchaudio +torchvision diff --git a/setup.py b/setup.py index 6e97063e..49e0176d 100644 --- a/setup.py +++ b/setup.py @@ -1,56 +1,58 @@ -from setuptools import setup, find_namespace_packages -from glob import glob from distutils import util -configs = glob('dlio_benchmark/configs/**/*', recursive=True) -print(configs) -import pathlib, pkg_resources -import os -os.system("python -m pip install -r requirements.txt") +from setuptools import find_namespace_packages, setup +import pathlib + +HYDRA_VERSION = "1.3.2" + test_deps = [ - 'pytest', + "pytest", ] core_deps = [ - 'mpi4py>=3.1.4', - 'numpy>=1.23.5', - 'h5py>=3.7.0', - 'pandas>=1.5.1', - 'psutil', - 'dlio_profiler_py==0.0.3' + "Pillow~=9.3.0", + "PyYAML~=6.0.0", + "h5py~=3.11.0", + "mpi4py~=3.1.4", + "numpy~=1.23.5", + "omegaconf~=2.2.0", + "pandas~=1.5.1", + "psutil~=5.9.8", + "pydftracer==1.0.2", ] x86_deps = [ - 'hydra-core >= 1.2.0', - 'tensorflow >= 2.11', - 'torch >= 2.2', - 'torchaudio', - 'torchvision', - 'nvidia-dali-cuda110' + f"hydra-core=={HYDRA_VERSION}", + "nvidia-dali-cuda110>=1.34.0", + 
"tensorflow>=2.11.0", + "torch>=2.2.0", + "torchaudio", + "torchvision", ] ppc_deps = [ - 'hydra-core @ git+https://github.com/facebookresearch/hydra.git@v1.3.2#egg=hydra-core' + f"hydra-core @ git+https://github.com/facebookresearch/hydra.git@v{HYDRA_VERSION}#egg=hydra-core" ] deps = core_deps if "ppc" in util.get_platform(): - deps.extend(ppc_deps) + deps.extend(ppc_deps) else: - deps.extend(x86_deps) -print(deps) + deps.extend(x86_deps) + extras = { - 'test': test_deps, + "test": test_deps, } -import pathlib + here = pathlib.Path(__file__).parent.resolve() long_description = (here / "README.md").read_text(encoding="utf-8") + setup( - name='dlio_benchmark', - version='2.0', + name="dlio_benchmark", + version="2.0.0", description="An I/O benchmark for deep Learning applications", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/argonne-lcf/dlio_benchmark", author="Huihuo Zheng, Hariharan Devarajan (Hari)", - email="zhenghh04@gmail.com, mani.hariharan@gmail.com", + author_email="zhenghh04@gmail.com, mani.hariharan@gmail.com", classifiers=[ # Optional # How mature is this project? Common values are # 3 - Alpha @@ -58,10 +60,10 @@ # 5 - Production/Stable "Development Status :: 5 - Production/Stable", # Indicate who your project is intended for - "Intended Audience :: HPC", + "Intended Audience :: Science/Research", "Topic :: Software Development :: Build Tools", # Pick your license as you wish - "License :: OSI Approved :: Apache 2.0 License", + "License :: OSI Approved :: Apache Software License", # Specify the Python versions you support here. In particular, ensure # that you indicate you support Python 3. These classifiers are *not* # checked by 'pip install'. See instead 'python_requires' below. 
@@ -69,6 +71,8 @@ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3 :: Only", ], keywords="deep learning, I/O, benchmark, NPZ, pytorch benchmark, tensorflow benchmark", @@ -79,22 +83,23 @@ # Main package definition packages=find_namespace_packages(where="."), package_dir={"dlio_benchmark": "dlio_benchmark"}, - package_data={'dlio_benchmark.configs': ['*.yaml'], - 'dlio_benchmark.configs.hydra.help': ['*.yaml'], - 'dlio_benchmark.configs.hydra.job_logging': ['*.yaml'], - 'dlio_benchmark.configs.workload': ['*.yaml'], - }, + package_data={ + "dlio_benchmark.configs": ["*.yaml"], + "dlio_benchmark.configs.hydra.help": ["*.yaml"], + "dlio_benchmark.configs.hydra.job_logging": ["*.yaml"], + "dlio_benchmark.configs.workload": ["*.yaml"], + }, dependency_links=[ - 'https://download.pytorch.org/whl/cpu', - 'https://developer.download.nvidia.com/compute/redist' + "https://download.pytorch.org/whl/cpu", + "https://developer.download.nvidia.com/compute/redist", ], install_requires=deps, tests_require=test_deps, extras_require=extras, entry_points={ - 'console_scripts': [ - 'dlio_benchmark = dlio_benchmark.main:main', - 'dlio_postprocessor = dlio_benchmark.postprocessor:main', + "console_scripts": [ + "dlio_benchmark = dlio_benchmark.main:main", + "dlio_postprocessor = dlio_benchmark.postprocessor:main", ] - } + }, )