Skip to content

Commit

Permalink
Merge pull request #55 from ROCm/IFU-2024-02-01
Browse files Browse the repository at this point in the history
Ifu 2024 02 01
  • Loading branch information
liligwu authored Feb 1, 2024
2 parents f53b42e + e4ab5b6 commit 4b4a755
Show file tree
Hide file tree
Showing 196 changed files with 16,057 additions and 12,398 deletions.
134 changes: 92 additions & 42 deletions .github/scripts/fbgemm_gpu_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,76 @@ __configure_fbgemm_gpu_build () {
echo "[BUILD] FBGEMM_GPU build arguments have been set: ${build_args[@]}"
}

__build_fbgemm_gpu_set_package_name () {
  # Derives the Python package name from the release type and the build
  # variant, then exports it as `package_name`.
  #
  # Reads:    fbgemm_release_type, fbgemm_variant
  # Exports:  package_name
  # Outputs:  one [BUILD] log line with the resulting name
  local pkg="fbgemm_gpu"

  # Any release type other than "release" is appended as a qualifier
  if [[ "${fbgemm_release_type}" != "release" ]]; then
    pkg+="_${fbgemm_release_type}"
  fi

  # Only the cpu and rocm variants get a suffix; every other variant keeps
  # the name built so far
  case "${fbgemm_variant}" in
    cpu)  pkg+="-cpu" ;;
    rocm) pkg+="-rocm" ;;
  esac

  export package_name="${pkg}"
  echo "[BUILD] Determined and set Python package name to use: ${package_name}"
}

__build_fbgemm_gpu_set_python_tag () {
  # Queries the Python version through `conda run` and exports the matching
  # Python tag as `python_tag` (e.g. Python 3.12 -> py312).
  #
  # Reads:    env_prefix (passed through, word-split, to `conda run`)
  # Exports:  python_tag
  # Returns:  0 on success; 1 if the version query fails or its output cannot
  #           be parsed, in which case python_tag is left untouched
  local version_output

  # Declaration and assignment are kept separate so that a failing
  # `conda run` is not masked by the (always zero) exit status of `local`
  # shellcheck disable=SC2086
  if ! version_output=$(conda run --no-capture-output ${env_prefix} python --version); then
    echo "[BUILD] Failed to query the Python version from the Conda environment"
    return 1
  fi

  # The output is expected to look like `Python <major>.<minor>[.<patch>]`;
  # split it on whitespace so that the second word is the version string
  # shellcheck disable=SC2206
  local version_words=(${version_output})

  # Only the major and minor components take part in the tag
  local version_re='^([0-9]+)\.([0-9]+)'
  if [[ "${version_words[1]:-}" =~ ${version_re} ]]; then
    # Set the python tag (e.g. Python 3.12 -> py312)
    export python_tag="py${BASH_REMATCH[1]}${BASH_REMATCH[2]}"
  else
    echo "[BUILD] Unable to parse the Python version from output: ${version_output}"
    return 1
  fi

  echo "[BUILD] Extracted and set Python tag: ${python_tag}"
}

__build_fbgemm_gpu_set_python_plat_name () {
  # Maps the kernel name and machine architecture to a wheel platform name
  # and exports it as `python_plat_name`.
  #
  # Reads:    KERN_NAME, MACHINE_NAME
  # Exports:  python_plat_name
  # Returns:  1 (without touching python_plat_name) for any kernel other than
  #           Darwin or Linux
  local plat

  case "${KERN_NAME}" in
    Darwin)
      # This follows PyTorch package naming conventions
      # See https://pypi.org/project/torch/#files
      if [[ "${MACHINE_NAME}" == "arm64" ]]; then
        plat="macosx_11_0_${MACHINE_NAME}"
      else
        plat="macosx_10_9_${MACHINE_NAME}"
      fi
      ;;

    Linux)
      # manylinux2014 is specified, bc manylinux1 does not support aarch64
      # See https://github.com/pypa/manylinux
      plat="manylinux2014_${MACHINE_NAME}"
      ;;

    *)
      echo "[BUILD] Unsupported OS platform: ${KERN_NAME}"
      return 1
      ;;
  esac

  export python_plat_name="${plat}"
  echo "[BUILD] Extracted and set Python platform name: ${python_plat_name}"
}

__build_fbgemm_gpu_set_run_multicore () {
  # Derives a parallel-build option from the `lscpu` report and exports it as
  # `run_multicore`.
  #
  # Exports:  run_multicore - " -j <cores * sockets>" (leading space included)
  #           when both values reported by lscpu are plain integers, otherwise
  #           the empty string
  # Outputs:  the detected core / socket values (or a "not found" notice for
  #           each value that could not be extracted) and one [BUILD] log line
  local lscpu_output
  # Query lscpu once; a missing or failing lscpu simply leaves nothing to
  # parse, which results in an empty run_multicore below
  lscpu_output=$(lscpu) || true

  # Take the last field of the "Core(s) ..." and "Socket(s) ..." lines
  local core sockets
  core=$(awk '/Core\(s\)/ {print $NF}' <<< "${lscpu_output}")
  sockets=$(awk '/Socket\(s\)/ {print $NF}' <<< "${lscpu_output}")

  # Report on the extracted value itself.  Chaining `&& echo ... || echo ...`
  # onto a `local x=$(...)` declaration can never reach the fallback, because
  # `local` succeeds regardless of what the command substitution produced.
  if [[ -n "${core}" ]]; then
    echo "core = ${core}"
  else
    echo "core not found"
  fi
  if [[ -n "${sockets}" ]]; then
    echo "sockets = ${sockets}"
  else
    echo "sockets not found"
  fi

  local re='^[0-9]+$'

  export run_multicore=""
  if [[ $core =~ $re && $sockets =~ $re ]]; then
    local n_core=$((core * sockets))
    export run_multicore=" -j ${n_core}"
  fi

  echo "[BUILD] Set multicore run option for setup.py: ${run_multicore}"
}

__build_fbgemm_gpu_common_pre_steps () {
# Private function that uses variables instantiated by its caller

Expand All @@ -203,28 +273,23 @@ __build_fbgemm_gpu_common_pre_steps () {
(test_binpath "${env_name}" c++) || return 1
(test_binpath "${env_name}" g++) || return 1

# Determine the package name based on release type and variant
package_name="fbgemm_gpu"
if [ "$fbgemm_release_type" != "release" ]; then
package_name="${package_name}_${fbgemm_release_type}"
# Default the FBGEMM_GPU variant to CUDA when it is neither cpu nor rocm
if [ "$fbgemm_variant" != "cpu" ] && [ "$fbgemm_variant" != "rocm" ]; then
export fbgemm_variant="cuda"
fi
if [ "$fbgemm_variant" == "cpu" ]; then
package_name="${package_name}-cpu"
elif [ "$fbgemm_variant" == "rocm" ]; then
package_name="${package_name}-rocm"
else
# Set to the default variant
fbgemm_variant="cuda"
fi
echo "[BUILD] Determined Python package name to use: ${package_name}"

# Extract the Python tag
# shellcheck disable=SC2207,SC2086
python_version=($(conda run --no-capture-output ${env_prefix} python --version))
# shellcheck disable=SC2206
python_version_arr=(${python_version[1]//./ })
python_tag="py${python_version_arr[0]}${python_version_arr[1]}"
echo "[BUILD] Extracted Python tag: ${python_tag}"
# Extract and set the package name given the FBGEMM_GPU variant
__build_fbgemm_gpu_set_package_name

# Extract and set the Python tag
__build_fbgemm_gpu_set_python_tag

# Extract and set the platform name
__build_fbgemm_gpu_set_python_plat_name

# Set multicore run option for setup.py if the number of cores on the machine
# permits it
__build_fbgemm_gpu_set_run_multicore

echo "[BUILD] Running pre-build cleanups ..."
print_exec rm -rf dist
Expand Down Expand Up @@ -332,33 +397,14 @@ build_fbgemm_gpu_package () {
echo "################################################################################"
echo ""

# manylinux2014 is specified, bc manylinux1 does not support aarch64
# See https://github.com/pypa/manylinux
local plat_name="manylinux2014_${MACHINE_NAME}"

echo "[BUILD] Checking arch_list = ${arch_list}"
echo "[BUILD] Checking build_args:"
echo "${build_args[@]}"

# shellcheck disable=SC2155
local core=$(lscpu | grep "Core(s)" | awk '{print $NF}') && echo "core = ${core}" || echo "core not found"
# shellcheck disable=SC2155
local sockets=$(lscpu | grep "Socket(s)" | awk '{print $NF}') && echo "sockets = ${sockets}" || echo "sockets not found"
local re='^[0-9]+$'
local run_multicore=""
if [[ $core =~ $re && $sockets =~ $re ]] ; then
local n_core=$((core * sockets))
local run_multicore=" -j ${n_core}"
fi

# Distribute Python extensions as wheels on Linux
echo "[BUILD] Building FBGEMM-GPU wheel (VARIANT=${fbgemm_variant}) ..."
# shellcheck disable=SC2086
print_exec conda run --no-capture-output ${env_prefix} \
python setup.py "${run_multicore}" bdist_wheel \
--package_name="${package_name}" \
--python-tag="${python_tag}" \
--plat-name="${plat_name}" \
--plat-name="${python_plat_name}" \
--verbose \
"${build_args[@]}"

Expand Down Expand Up @@ -410,7 +456,9 @@ build_fbgemm_gpu_install () {
echo "[BUILD] Building + installing FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..."
# shellcheck disable=SC2086
print_exec conda run --no-capture-output ${env_prefix} \
python setup.py install "${build_args[@]}"
python setup.py "${run_multicore}" install \
--verbose \
"${build_args[@]}"

# Run checks on the built libraries
(run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1
Expand Down Expand Up @@ -460,7 +508,9 @@ build_fbgemm_gpu_develop () {
echo "[BUILD] Building (develop) FBGEMM-GPU (VARIANT=${fbgemm_variant}) ..."
# shellcheck disable=SC2086
print_exec conda run --no-capture-output ${env_prefix} \
python setup.py build develop "${build_args[@]}"
python setup.py "${run_multicore}" build develop \
--verbose \
"${build_args[@]}"

# Run checks on the built libraries
(run_fbgemm_gpu_postbuild_checks "${fbgemm_variant}") || return 1
Expand Down
32 changes: 26 additions & 6 deletions .github/scripts/fbgemm_gpu_docs.bash
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,16 @@ install_docs_tools () {
# shellcheck disable=SC2155
local env_prefix=$(env_name_or_prefix "${env_name}")

echo "[INSTALL] Installing Doxygen ..."
echo "[INSTALL] Installing documentation tools ..."

# shellcheck disable=SC2086
(exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \
doxygen \
graphviz \
make) || return 1

# Check binaries are visible in the PATH
(test_binpath "${env_name}" dot) || return 1
(test_binpath "${env_name}" doxygen) || return 1
(test_binpath "${env_name}" make) || return 1

Expand Down Expand Up @@ -76,13 +78,31 @@ build_fbgemm_gpu_docs () {
# shellcheck disable=SC2155
local env_prefix=$(env_name_or_prefix "${env_name}")

echo "[BUILD] Running Doxygen build ..."
echo "[DOCS] Running the first-pass build (i.e. documentation linting) ..."
# shellcheck disable=SC2086
(exec_with_retries 3 conda run ${env_prefix} doxygen Doxyfile.in) || return 1
print_exec conda env config vars set ${env_prefix} SPHINX_LINT=1

echo "[BUILD] Building HTML pages ..."
# Run the first build pass with linting enabled. The purpose of this pass
# is only to perform the lint checks, as the generated output will be broken
# when linting is enabled.
# shellcheck disable=SC2086
(exec_with_retries 3 conda run ${env_prefix} make html) || return 1
if print_exec conda run ${env_prefix} make clean doxygen html; then
echo "[DOCS] Docs linting passed"
else
echo "[DOCS] Docs linting failed; showing build output ..."
# Show the build logs on error
cat build/html/output.txt || true
return 1
fi

echo "[DOCS] Running the second-pass documentation build ..."
# shellcheck disable=SC2086
print_exec conda env config vars unset ${env_prefix} SPHINX_LINT

# Run the second build pass with linting disabled. The generated output will
# then be used for publication.
# shellcheck disable=SC2086
(print_exec conda run ${env_prefix} make clean doxygen html) || return 1

echo "[INSTALL] FBGEMM-GPU documentation build completed"
echo "[DOCS] FBGEMM-GPU documentation build completed"
}
28 changes: 19 additions & 9 deletions .github/scripts/fbgemm_gpu_test.bash
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,13 @@ run_python_test () {
if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
echo "[TEST] Python test suite PASSED: ${python_test_file}"
echo ""
echo ""
echo ""
else
echo "[TEST] Python test suite FAILED: ${python_test_file}"
echo ""
echo ""
echo ""
return 1
fi
}
Expand Down Expand Up @@ -80,20 +84,23 @@ run_fbgemm_gpu_tests () {

# These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU
local files_to_skip=(
test_utils.py
split_table_batched_embeddings_test.py
ssd_split_table_batched_embeddings_test.py
./ssd_split_table_batched_embeddings_test.py
)

if [ "$fbgemm_variant" == "cpu" ]; then
# These are tests that are currently broken in FBGEMM_GPU-CPU
# These tests have non-CPU operators referenced in @given
local ignored_tests=(
uvm_test.py
./uvm/copy_test.py
./uvm/uvm_test.py
)
elif [ "$fbgemm_variant" == "rocm" ]; then
# https://github.com/pytorch/FBGEMM/issues/1559
local ignored_tests=(
batched_unary_embeddings_test.py
# https://github.com/pytorch/FBGEMM/issues/1559
./batched_unary_embeddings_test.py
./tbe/backward_adagrad_test.py
./tbe/backward_dense_test.py
./tbe/backward_none_test.py
./tbe/backward_sgd_test.py
)
else
local ignored_tests=()
Expand All @@ -108,11 +115,14 @@ run_fbgemm_gpu_tests () {
(test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1

echo "[TEST] Enumerating test files ..."
print_exec ls -lth ./*.py
# shellcheck disable=SC2155
local all_test_files=$(find . -type f -name '*_test.py' -print | sort)
for f in $all_test_files; do echo "$f"; done
echo ""

# NOTE: Tests running on single CPU core with a less powerful testing GPU in
# GHA can take up to 5 hours.
for test_file in *.py; do
for test_file in $all_test_files; do
if echo "${files_to_skip[@]}" | grep "${test_file}"; then
echo "[TEST] Skipping test file known to be broken: ${test_file}"
elif echo "${ignored_tests[@]}" | grep "${test_file}"; then
Expand Down
8 changes: 4 additions & 4 deletions .github/scripts/utils_system.bash
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,10 @@ print_gpu_info () {

(lspci -v | grep -e 'controller.*NVIDIA') || true

if [[ "${ENFORCE_NVIDIA_GPU}" ]]; then
if [[ "${ENFORCE_CUDA_DEVICE}" ]]; then
# Ensure that nvidia-smi is available and returns GPU entries
if ! nvidia-smi; then
echo "[CHECK] NVIDIA driver is required, but does not appear to have been installed. This will cause FBGEMM_GPU installation to fail!"
echo "[CHECK] NVIDIA drivers and CUDA device are required for this workflow, but does not appear to be installed or available!"
return 1
fi
else
Expand All @@ -111,10 +111,10 @@ print_gpu_info () {

(lspci -v | grep -e 'Display controller: Advanced') || true

if [[ "${ENFORCE_AMD_GPU}" ]]; then
if [[ "${ENFORCE_ROCM_DEVICE}" ]]; then
# Ensure that rocm-smi is available and returns GPU entries
if ! rocm-smi; then
echo "[CHECK] AMD driver is required, but does not appear to have been installed. This will cause FBGEMM_GPU installation to fail!"
echo "[CHECK] ROCm drivers and ROCm device are required for this workflow, but does not appear to be installed or available!"
return 1
fi
else
Expand Down
7 changes: 4 additions & 3 deletions .github/workflows/build_wheels_linux_aarch64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

permissions:
id-token: write
contents: read

jobs:
generate-matrix:
uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
Expand Down Expand Up @@ -55,6 +59,3 @@ jobs:
trigger-event: ${{ github.event_name }}
architecture: aarch64
setup-miniconda: false
secrets:
AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
7 changes: 4 additions & 3 deletions .github/workflows/build_wheels_linux_x86.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

permissions:
id-token: write
contents: read

jobs:
generate-matrix:
uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
Expand Down Expand Up @@ -45,6 +49,3 @@ jobs:
test-infra-ref: main
build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
trigger-event: ${{ github.event_name }}
secrets:
AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
5 changes: 4 additions & 1 deletion .github/workflows/fbgemm_gpu_ci_cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,15 @@ jobs:
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
ENFORCE_NVIDIA_GPU: 1
ENFORCE_CUDA_DEVICE: 1
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "linux.g5.4xlarge.nvidia.gpu" },
# TODO: Enable when A100 machine queues are reasonably small enough for doing per-PR CI
# https://hud.pytorch.org/metrics
# { arch: x86, instance: "linux.gcp.a100" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1" ]
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_ci_rocm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ jobs:
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
ENFORCE_AMD_GPU: 1
ENFORCE_ROCM_DEVICE: 1
strategy:
fail-fast: false
matrix:
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/fbgemm_gpu_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ on:

jobs:
build-docs:
permissions:
# Grant write permission here so that the generated docs can be pushed to `gh-pages` branch
contents: write
runs-on: linux.2xlarge
container:
image: amazonlinux:2023
Expand Down
Loading

0 comments on commit 4b4a755

Please sign in to comment.