Merge pull request #55 from ROCm/IFU-2024-02-01

Ifu 2024 02 01
ROCm · Feb 1, 2024 · 4b4a755 · 4b4a755
2 parents f53b42e + e4ab5b6
commit 4b4a755
Show file tree

Hide file tree

Showing 196 changed files with 16,057 additions and 12,398 deletions.
diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
@@ -194,6 +194,76 @@ __configure_fbgemm_gpu_build () {
   echo "[BUILD] FBGEMM_GPU build arguments have been set:  ${build_args[@]}"
 }
 
+__build_fbgemm_gpu_set_package_name () {
+  # Private function that reads fbgemm_release_type and fbgemm_variant from the
+  # enclosing scope, and exports the resulting Python package name as
+  # package_name.
+  local release_suffix=""
+  local variant_suffix=""
+
+  # Any release type other than "release" is carried in the package name
+  if [[ "$fbgemm_release_type" != "release" ]]; then
+    release_suffix="_${fbgemm_release_type}"
+  fi
+
+  # Only the cpu and rocm variants are tagged; any other variant value leaves
+  # the name without a variant suffix
+  case "$fbgemm_variant" in
+    cpu)
+      variant_suffix="-cpu"
+      ;;
+    rocm)
+      variant_suffix="-rocm"
+      ;;
+  esac
+
+  export package_name="fbgemm_gpu${release_suffix}${variant_suffix}"
+  echo "[BUILD] Determined and set Python package name to use: ${package_name}"
+}
+
+__build_fbgemm_gpu_set_python_tag () {
+  # Private function that runs `python --version` through `conda run` (using
+  # env_prefix from the enclosing scope) and exports the derived tag as
+  # python_tag.
+
+  # Split the version banner into words; the code below takes the second word
+  # as the dotted version number (presumably "Python X.Y.Z" - TODO confirm).
+  # The assignment is deliberately kept on the `local` line, as before.
+  # shellcheck disable=SC2207,SC2086,SC2155
+  local version_words=($(conda run --no-capture-output ${env_prefix} python --version))
+
+  # Break the dotted version number into its components
+  local version_parts=()
+  IFS='.' read -r -a version_parts <<< "${version_words[1]}"
+
+  # Join the first two components into the tag (e.g. Python 3.12 -> py312)
+  local major="${version_parts[0]}"
+  local minor="${version_parts[1]}"
+  export python_tag="py${major}${minor}"
+  echo "[BUILD] Extracted and set Python tag: ${python_tag}"
+}
+
+__build_fbgemm_gpu_set_python_plat_name () {
+  if [[ $KERN_NAME == 'Darwin' ]]; then
+    # This follows PyTorch package naming conventions
+    # See https://pypi.org/project/torch/#files
+    if [[ $MACHINE_NAME == 'arm64' ]]; then
+      export python_plat_name="macosx_11_0_${MACHINE_NAME}"
+    else
+      export python_plat_name="macosx_10_9_${MACHINE_NAME}"
+    fi
+
+  elif [[ $KERN_NAME == 'Linux' ]]; then
+    # manylinux2014 is specified, bc manylinux1 does not support aarch64
+    # See https://github.com/pypa/manylinux
+    export python_plat_name="manylinux2014_${MACHINE_NAME}"
+
+  else
+    echo "[BUILD] Unsupported OS platform: ${KERN_NAME}"
+    return 1
+  fi
+
+  echo "[BUILD] Extracted and set Python platform name: ${python_plat_name}"
+}
+
+__build_fbgemm_gpu_set_run_multicore () {
+  # Private function that exports run_multicore, the parallel-build option to
+  # pass to setup.py:
+  #   " -j N"  where N = (cores per socket) x (sockets), as reported by lscpu
+  #   ""       when either value cannot be determined
+  # Always returns 0; failing to detect the core count is not an error.
+  local re='^[0-9]+$'
+  local core=""
+  local sockets=""
+
+  # lscpu may not be installed on the machine; in that case leave both values
+  # empty instead of emitting "command not found" errors.
+  if command -v lscpu > /dev/null 2>&1; then
+    # Query lscpu once and extract both fields from the same output.  The
+    # declaration is kept separate from the assignment so that the status of
+    # the command substitution is not masked by `local`.
+    local cpu_info
+    cpu_info=$(lscpu 2> /dev/null) || cpu_info=""
+
+    # index() does a plain substring match, so the parentheses in the field
+    # names need no escaping; awk exits 0 even when nothing matches.
+    core=$(awk 'index($0, "Core(s)") {print $NF}' <<< "${cpu_info}")
+    sockets=$(awk 'index($0, "Socket(s)") {print $NF}' <<< "${cpu_info}")
+  fi
+
+  # Report based on the parsed values themselves, so that the "not found"
+  # messages are actually reachable.
+  if [[ $core =~ $re ]]; then
+    echo "core = ${core}"
+  else
+    echo "core not found"
+  fi
+  if [[ $sockets =~ $re ]]; then
+    echo "sockets = ${sockets}"
+  else
+    echo "sockets not found"
+  fi
+
+  export run_multicore=""
+  if [[ $core =~ $re && $sockets =~ $re ]]; then
+    local n_core=$((core * sockets))
+    export run_multicore=" -j ${n_core}"
+  fi
+
+  echo "[BUILD] Set multicore run option for setup.py: ${run_multicore}"
+}
+
 __build_fbgemm_gpu_common_pre_steps () {
   # Private function that uses variables instantiated by its caller
 
@@ -203,28 +273,23 @@ __build_fbgemm_gpu_common_pre_steps () {
   (test_binpath "${env_name}" c++) || return 1
   (test_binpath "${env_name}" g++) || return 1
 
-  # Determine the package name based on release type and variant
-  package_name="fbgemm_gpu"
-  if [ "$fbgemm_release_type" != "release" ]; then
-    package_name="${package_name}_${fbgemm_release_type}"
+  # Default the FBGEMM_GPU variant to CUDA unless it is explicitly cpu or rocm
+  if [ "$fbgemm_variant" != "cpu" ] && [ "$fbgemm_variant" != "rocm" ]; then
+    export fbgemm_variant="cuda"
   fi
-  if [ "$fbgemm_variant" == "cpu" ]; then
-    package_name="${package_name}-cpu"
-  elif [ "$fbgemm_variant" == "rocm" ]; then
-    package_name="${package_name}-rocm"
-  else
-    # Set to the default variant
-    fbgemm_variant="cuda"
-  fi
-  echo "[BUILD] Determined Python package name to use: ${package_name}"
 
-  # Extract the Python tag
-  # shellcheck disable=SC2207,SC2086
-  python_version=($(conda run --no-capture-output ${env_prefix} python --version))
-  # shellcheck disable=SC2206
-  python_version_arr=(${python_version[1]//./ })
-  python_tag="py${python_version_arr[0]}${python_version_arr[1]}"
-  echo "[BUILD] Extracted Python tag: ${python_tag}"
+  # Extract and set the package name given the FBGEMM_GPU variant
+  __build_fbgemm_gpu_set_package_name
+
+  # Extract and set the Python tag
+  __build_fbgemm_gpu_set_python_tag
+
+  # Extract and set the platform name
+  __build_fbgemm_gpu_set_python_plat_name
+
+  # Set the multicore run option for setup.py if the number of cores on the
+  # machine can be determined
+  __build_fbgemm_gpu_set_run_multicore
 
   echo "[BUILD] Running pre-build cleanups ..."
   print_exec rm -rf dist
@@ -332,33 +397,14 @@ build_fbgemm_gpu_package () {
   echo "################################################################################"
   echo ""
 
-  # manylinux2014 is specified, bc manylinux1 does not support aarch64
-  # See https://github.com/pypa/manylinux
-  local plat_name="manylinux2014_${MACHINE_NAME}"
-
-  echo "[BUILD] Checking arch_list = ${arch_list}"
-  echo "[BUILD] Checking build_args:"
-  echo "${build_args[@]}"
-
-  # shellcheck disable=SC2155
-  local core=$(lscpu | grep "Core(s)" | awk '{print $NF}') && echo "core = ${core}" || echo "core not found"
-  # shellcheck disable=SC2155
-  local sockets=$(lscpu | grep "Socket(s)" | awk '{print $NF}') && echo "sockets = ${sockets}" || echo "sockets not found"
-  local re='^[0-9]+$'
-  local run_multicore=""
-  if [[ $core =~ $re && $sockets =~ $re ]] ; then
-    local n_core=$((core * sockets))
-    local run_multicore=" -j ${n_core}"
-  fi
-
   # Distribute Python extensions as wheels on Linux
   echo "[BUILD] Building FBGEMM-GPU wheel (VARIANT=${fbgemm_variant}) ..."
   # shellcheck disable=SC2086
   print_exec conda run --no-capture-output ${env_prefix} \
     python setup.py "${run_multicore}" bdist_wheel \
       --package_name="${package_name}" \
       --python-tag="${python_tag}" \
-      --plat-name="${plat_name}" \
+      --plat-name="${python_plat_name}" \
       --verbose \
       "${build_args[@]}"
 
@@ -410,7 +456,9 @@ build_fbgemm_gpu_install () {
   echo "[BUILD] Building + installing FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..."
   # shellcheck disable=SC2086
   print_exec conda run --no-capture-output ${env_prefix} \
-    python setup.py install "${build_args[@]}"
+    python setup.py "${run_multicore}" install \
+      --verbose \
+      "${build_args[@]}"
 
   # Run checks on the built libraries
   (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1
@@ -460,7 +508,9 @@ build_fbgemm_gpu_develop () {
   echo "[BUILD] Building (develop) FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..."
   # shellcheck disable=SC2086
   print_exec conda run --no-capture-output ${env_prefix} \
-    python setup.py build develop "${build_args[@]}"
+    python setup.py "${run_multicore}" build develop \
+      --verbose \
+      "${build_args[@]}"
 
   # Run checks on the built libraries
   (run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1

diff --git a/.github/scripts/fbgemm_gpu_docs.bash b/.github/scripts/fbgemm_gpu_docs.bash
@@ -34,14 +34,16 @@ install_docs_tools () {
   # shellcheck disable=SC2155
   local env_prefix=$(env_name_or_prefix "${env_name}")
 
-  echo "[INSTALL] Installing Doxygen ..."
+  echo "[INSTALL] Installing documentation tools ..."
 
   # shellcheck disable=SC2086
   (exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \
     doxygen \
+    graphviz \
     make) || return 1
 
   # Check binaries are visible in the PATH
+  (test_binpath "${env_name}" dot) || return 1
   (test_binpath "${env_name}" doxygen) || return 1
   (test_binpath "${env_name}" make) || return 1
 
@@ -76,13 +78,31 @@ build_fbgemm_gpu_docs () {
   # shellcheck disable=SC2155
   local env_prefix=$(env_name_or_prefix "${env_name}")
 
-  echo "[BUILD] Running Doxygen build ..."
+  echo "[DOCS] Running the first-pass build (i.e. documentation linting) ..."
   # shellcheck disable=SC2086
-  (exec_with_retries 3 conda run ${env_prefix} doxygen Doxyfile.in) || return 1
+  print_exec conda env config vars set ${env_prefix} SPHINX_LINT=1
 
-  echo "[BUILD] Building HTML pages ..."
+  # Run the first build pass with linting enabled.  The purpose of this pass
+  # is only to perform the lint checks, as the generated output will be broken
+  # when linting is enabled.
   # shellcheck disable=SC2086
-  (exec_with_retries 3 conda run ${env_prefix} make html) || return 1
+  if print_exec conda run ${env_prefix} make clean doxygen html; then
+    echo "[DOCS] Docs linting passed"
+  else
+    echo "[DOCS] Docs linting failed; showing build output ..."
+    # Show the build logs on error
+    cat build/html/output.txt || true
+    return 1
+  fi
+
+  echo "[DOCS] Running the second-pass documentation build ..."
+  # shellcheck disable=SC2086
+  print_exec conda env config vars unset ${env_prefix} SPHINX_LINT
+
+  # Run the second build pass with linting disabled.  The generated output will
+  # then be used for publication.
+  # shellcheck disable=SC2086
+  (print_exec conda run ${env_prefix} make clean doxygen html) || return 1
 
-  echo "[INSTALL] FBGEMM-GPU documentation build completed"
+  echo "[DOCS] FBGEMM-GPU documentation build completed"
 }
diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash
@@ -35,9 +35,13 @@ run_python_test () {
   if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
     echo "[TEST] Python test suite PASSED: ${python_test_file}"
     echo ""
+    echo ""
+    echo ""
   else
     echo "[TEST] Python test suite FAILED: ${python_test_file}"
     echo ""
+    echo ""
+    echo ""
     return 1
   fi
 }
@@ -80,20 +84,23 @@ run_fbgemm_gpu_tests () {
 
   # These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU
   local files_to_skip=(
-    test_utils.py
-    split_table_batched_embeddings_test.py
-    ssd_split_table_batched_embeddings_test.py
+    ./ssd_split_table_batched_embeddings_test.py
   )
 
   if [ "$fbgemm_variant" == "cpu" ]; then
-    # These are tests that are currently broken in FBGEMM_GPU-CPU
+    # These tests have non-CPU operators referenced in @given
     local ignored_tests=(
-      uvm_test.py
+      ./uvm/copy_test.py
+      ./uvm/uvm_test.py
     )
   elif [ "$fbgemm_variant" == "rocm" ]; then
-    # https://github.com/pytorch/FBGEMM/issues/1559
     local ignored_tests=(
-      batched_unary_embeddings_test.py
+      # https://github.com/pytorch/FBGEMM/issues/1559
+      ./batched_unary_embeddings_test.py
+      ./tbe/backward_adagrad_test.py
+      ./tbe/backward_dense_test.py
+      ./tbe/backward_none_test.py
+      ./tbe/backward_sgd_test.py
     )
   else
     local ignored_tests=()
@@ -108,11 +115,14 @@ run_fbgemm_gpu_tests () {
   (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1
 
   echo "[TEST] Enumerating test files ..."
-  print_exec ls -lth ./*.py
+  # shellcheck disable=SC2155
+  local all_test_files=$(find . -type f -name '*_test.py' -print | sort)
+  for f in $all_test_files; do echo "$f"; done
+  echo ""
 
   # NOTE: Tests running on single CPU core with a less powerful testing GPU in
   # GHA can take up to 5 hours.
-  for test_file in *.py; do
+  for test_file in $all_test_files; do
     if echo "${files_to_skip[@]}" | grep "${test_file}"; then
       echo "[TEST] Skipping test file known to be broken: ${test_file}"
     elif echo "${ignored_tests[@]}" | grep "${test_file}"; then

diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash
@@ -91,10 +91,10 @@ print_gpu_info () {
 
   (lspci -v | grep -e 'controller.*NVIDIA') || true
 
-  if [[ "${ENFORCE_NVIDIA_GPU}" ]]; then
+  if [[ "${ENFORCE_CUDA_DEVICE}" ]]; then
     # Ensure that nvidia-smi is available and returns GPU entries
     if ! nvidia-smi; then
-      echo "[CHECK] NVIDIA driver is required, but does not appear to have been installed.  This will cause FBGEMM_GPU installation to fail!"
+      echo "[CHECK] NVIDIA drivers and CUDA device are required for this workflow, but does not appear to be installed or available!"
       return 1
     fi
   else
@@ -111,10 +111,10 @@ print_gpu_info () {
 
   (lspci -v | grep -e 'Display controller: Advanced') || true
 
-  if [[ "${ENFORCE_AMD_GPU}" ]]; then
+  if [[ "${ENFORCE_ROCM_DEVICE}" ]]; then
     # Ensure that rocm-smi is available and returns GPU entries
     if ! rocm-smi; then
-      echo "[CHECK] AMD driver is required, but does not appear to have been installed.  This will cause FBGEMM_GPU installation to fail!"
+      echo "[CHECK] ROCm drivers and ROCm device are required for this workflow, but does not appear to be installed or available!"
       return 1
     fi
   else

diff --git a/.github/workflows/build_wheels_linux_aarch64.yml b/.github/workflows/build_wheels_linux_aarch64.yml
@@ -18,6 +18,10 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
+permissions:
+  id-token: write
+  contents: read
+
 jobs:
   generate-matrix:
     uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
@@ -55,6 +59,3 @@ jobs:
       trigger-event: ${{ github.event_name }}
       architecture: aarch64
       setup-miniconda: false
-    secrets:
-      AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
diff --git a/.github/workflows/build_wheels_linux_x86.yml b/.github/workflows/build_wheels_linux_x86.yml
@@ -18,6 +18,10 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
+permissions:
+  id-token: write
+  contents: read
+
 jobs:
   generate-matrix:
     uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
@@ -45,6 +49,3 @@ jobs:
       test-infra-ref: main
       build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
       trigger-event: ${{ github.event_name }}
-    secrets:
-      AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
-      AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
diff --git a/.github/workflows/fbgemm_gpu_ci_cuda.yml b/.github/workflows/fbgemm_gpu_ci_cuda.yml
@@ -132,12 +132,15 @@ jobs:
     env:
       PRELUDE: .github/scripts/setup_env.bash
       BUILD_ENV: build_binary
-      ENFORCE_NVIDIA_GPU: 1
+      ENFORCE_CUDA_DEVICE: 1
     strategy:
       fail-fast: false
       matrix:
         host-machine: [
           { arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
+          # TODO: Enable when A100 machine queues are reasonably small enough for doing per-PR CI
+          # https://hud.pytorch.org/metrics
+          # { arch: x86, instance: "linux.gcp.a100" },
         ]
         python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
         cuda-version: [ "11.8.0", "12.1.1" ]

diff --git a/.github/workflows/fbgemm_gpu_ci_rocm.yml b/.github/workflows/fbgemm_gpu_ci_rocm.yml
@@ -134,7 +134,7 @@ jobs:
     env:
       PRELUDE: .github/scripts/setup_env.bash
       BUILD_ENV: build_binary
-      ENFORCE_AMD_GPU: 1
+      ENFORCE_ROCM_DEVICE: 1
     strategy:
       fail-fast: false
       matrix:

diff --git a/.github/workflows/fbgemm_gpu_docs.yml b/.github/workflows/fbgemm_gpu_docs.yml
@@ -24,6 +24,9 @@ on:
 
 jobs:
   build-docs:
+    permissions:
+      # Grant write permission here so that the generated docs can be pushed to `gh-pages` branch
+      contents: write
     runs-on: linux.2xlarge
     container:
       image: amazonlinux:2023