diff --git a/.github/actions/setup-build-cuda/action.yml b/.github/actions/setup-build-cuda/action.yml
index 824be1bd6b..7bdfff4360 100644
--- a/.github/actions/setup-build-cuda/action.yml
+++ b/.github/actions/setup-build-cuda/action.yml
@@ -23,15 +23,19 @@ runs:
         import sys
         print(sys.version)
         cushort = "${{ inputs.toolkit_short_version }}"
-        TORCH_CUDA_DEFAULT = "121"  # pytorch 2.4.1
+        # CUDA version whose wheels are uploaded to PyPI (the other versions are only uploaded to the PyTorch S3 index)
+        TORCH_CUDA_DEFAULT = "124"  # since pytorch 2.6.0
         # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/linux-links.ts
         full_version, install_script = {
+          "128": ("12.8.0", "https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run"),
+          "126": ("12.6.3", "https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run"),
           "124": ("12.4.1", "https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run"),
           "121": ("12.1.0", "https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run"),
           "118": ("11.8.0", "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"),
           "6.0": ("6.0.2", "https://repo.radeon.com/amdgpu-install/6.0.2/rhel/8.9/amdgpu-install-6.0.60002-1.el8.noarch.rpm"),
           "6.1": ("6.1.2", "https://repo.radeon.com/amdgpu-install/6.1.3/rhel/8.9/amdgpu-install-6.1.60103-1.el8.noarch.rpm"),
-          "6.2": ("6.2.3", "https://repo.radeon.com/amdgpu-install/6.2.3/rhel/8.9/amdgpu-install-6.2.60203-1.el8.noarch.rpm"),
+          "6.2.4": ("6.2.4", "https://repo.radeon.com/amdgpu-install/6.2.4/rhel/8.9/amdgpu-install-6.2.60204-1.el8.noarch.rpm"),
+          "6.3": ("6.3.1", "https://repo.radeon.com/amdgpu-install/6.3.1/rhel/8.9/amdgpu-install-6.3.60301-1.el8.noarch.rpm"),
         }[cushort]
         with open(os.environ['GITHUB_OUTPUT'], "r+") as fp:
           fp.write("CUDA_VERSION=" + full_version + "\n")
@@ -46,10 +50,18 @@ runs:
     # WINDOWS STEPS
     - name: Install cuda
       if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda'
-      uses: Jimver/cuda-toolkit@v0.2.16
+      id: cuda-toolkit
+      uses: Jimver/cuda-toolkit@v0.2.21
       with:
         cuda: ${{ steps.cuda_info.outputs.CUDA_VERSION }}
         method: network
+    - if: runner.os == 'Windows' && inputs.toolkit_type == 'cuda'
+      shell: bash
+      run: |
+        echo "Installed cuda version is: ${{ steps.cuda-toolkit.outputs.cuda }}"
+        echo "Cuda install location: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}"
+        echo "CUDA_HOME=${{ steps.cuda-toolkit.outputs.CUDA_PATH }}" >> ${GITHUB_ENV}
+        cat ${GITHUB_ENV}
 
     - name: Install python
       if: runner.os == 'Windows'
diff --git a/.github/selective_ci/selective_ci.py b/.github/selective_ci/selective_ci.py
index a24fc3e351..5c57ef694a 100644
--- a/.github/selective_ci/selective_ci.py
+++ b/.github/selective_ci/selective_ci.py
@@ -109,6 +109,9 @@ def list_files_in_commit(commit: git.Commit):
 
 
 def check_patterns_are_valid(patterns):
+    # Only validate patterns when running in the `fairinternal/xformers` repo; skip the check everywhere else
+    if os.environ.get("GITHUB_REPOSITORY", "") != "fairinternal/xformers":
+        return
     found_patterns = set()
     for f in all_files:
         for pattern in patterns:
diff --git a/.github/workflows/rocm_build.yml b/.github/workflows/rocm_build.yml
index 37fe17b4ec..0f638d1fb9 100644
--- a/.github/workflows/rocm_build.yml
+++ b/.github/workflows/rocm_build.yml
@@ -22,9 +22,9 @@ jobs:
       matrix:
         os: ['ubuntu-alola']
         python: ['3.11']
-        torch_version: ['2.5.1']
+        torch_version: ['2.6.0']
         toolkit_type: ['rocm']
-        toolkit_short_version: ['6.1', '6.2']
+        toolkit_short_version: ['6.1', '6.2.4']
 
     uses: ./.github/workflows/wheels_build.yml
     if: github.repository == 'rocm/xformers'
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index db6ba6572a..d92cce53e5 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -31,8 +31,8 @@ jobs:
         PY_VERSIONS = ['3.9', '3.10', '3.11', '3.12']
         # NOTE: Don't forget to update `upload_pt`'s matrix
         # when changing the CUDA/ROCM versions below!
-        CU_VERSIONS = ['118', '121', '124']
-        ROCM_VERSIONS = ["6.1"] # <- 6.0 broken in `manylinux_2_28`
+        CU_VERSIONS = ['118', '124', '126']
+        ROCM_VERSIONS = ["6.1", "6.2.4"] # <- 6.0 broken in `manylinux_2_28`
         PY_CU = list(itertools.product(PY_VERSIONS, CU_VERSIONS))
         PY_ROCM = list(itertools.product(PY_VERSIONS, ROCM_VERSIONS))
         print("Full matrix PY_CU", PY_CU)
@@ -42,11 +42,13 @@ jobs:
           for cu in CU_VERSIONS[1:]:
             PY_CU.append((PY_VERSIONS[-1], cu))
           print("Limited matrix PY_CU", PY_CU)
-          PY_ROCM = [(PY_VERSIONS[-1], ROCM_VERSIONS[-1])]
+          PY_ROCM = [(PY_VERSIONS[0], ROCM_VERSIONS[0])]
+          for rocm in ROCM_VERSIONS[1:]:
+            PY_ROCM.append((PY_VERSIONS[-1], rocm))
 
         include = []
         for os in ['8-core-ubuntu', 'windows-8-core']:
-          for torch_version in ['2.5.1']:
+          for torch_version in ['2.6.0']:
             # CUDA builds
             for python, cuda_short_version in PY_CU:
               if cuda_short_version != "124" and "windows" in os:
@@ -96,7 +98,7 @@ jobs:
     uses: ./.github/workflows/wheels_upload_pip.yml
     with:
       twine_username: __token__
-      filter: "*torch2.5.1+cu121*"
+      filter: "*torch2.6.0+cu124*"
       execute: ${{ github.repository == 'facebookresearch/xformers' && github.event_name != 'pull_request' }}
     secrets:
       twine_password: ${{ secrets.PYPI_TOKEN }}
@@ -108,14 +110,15 @@ jobs:
       matrix:
         suffix:
           - cu118
-          - cu121
           - cu124
+          - cu126
           - rocm6.1
+          - rocm6.2.4
     uses: ./.github/workflows/wheels_upload_s3.yml
     with:
       aws_role: "arn:aws:iam::749337293305:role/pytorch_bot_uploader_role"
       s3_path: s3://pytorch/whl/${{ matrix.suffix }}/
       aws_s3_cp_extra_args: --acl public-read
-      filter: "*torch2.5.1+${{ matrix.suffix }}*"
+      filter: "*torch2.6.0+${{ matrix.suffix }}*"
       execute: ${{ github.repository == 'facebookresearch/xformers' && github.ref_type == 'tag' }}
 
diff --git a/.github/workflows/wheels_build.yml b/.github/workflows/wheels_build.yml
index 4e9e1ccd50..4df007b805 100644
--- a/.github/workflows/wheels_build.yml
+++ b/.github/workflows/wheels_build.yml
@@ -65,6 +65,13 @@ jobs:
           submodules: recursive
           path: "."
           fetch-depth: 0 # for tags
+
+      - name: HACKFIX for cutlass compiler bug
+        if: runner.os == 'Windows'
+        run: |
+          # See https://github.com/NVIDIA/cutlass/issues/1732
+          rm -f third_party/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
+          touch third_party/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
       - name: Setup Runner
         uses: ./.github/actions/setup-build-cuda
         with:
@@ -98,6 +105,7 @@ jobs:
           echo "BUILD_VERSION=$version${{ steps.cuda_info.outputs.CUDA_VERSION_SUFFIX }}" >> ${GITHUB_ENV}
           echo "BUILD_VERSION=$version${{ steps.cuda_info.outputs.CUDA_VERSION_SUFFIX }}" >> ${GITHUB_OUTPUT}
           which ninja
+          ninja --version
           cat ${GITHUB_ENV}
       - run: echo "xformers-${BUILD_VERSION}"
       - run: echo "release version (will upload to PyTorch)"
diff --git a/.github/workflows/win-build.yml b/.github/workflows/win-build.yml
index 02fc26c2c8..f81b9ade71 100644
--- a/.github/workflows/win-build.yml
+++ b/.github/workflows/win-build.yml
@@ -15,6 +15,7 @@ env:
   MAX_JOBS: 6
   DISTUTILS_USE_SDK: 1 # otherwise distutils will complain on windows about multiple versions of msvc
   XFORMERS_BUILD_TYPE: "Release"
+  TMPDIR: "./x"
 
 jobs:
   win_build:
@@ -35,19 +36,33 @@ jobs:
       run:
         shell: bash
     steps:
-      - name: Support longpaths
-        run: git config --system core.longpaths true
+      - name: Workarounds for longpaths - git-config
+        run: |
+          git config --system core.longpaths true
       - name: Recursive checkout
         uses: actions/checkout@v3
         with:
           submodules: recursive
           path: "."
 
+      - name: Workarounds for longpaths - TMPDIR
+        run: |
+          mkdir x
+          python -c "import tempfile; print(tempfile.gettempdir())"
+          python -c "import tempfile; assert(len(tempfile.gettempdir()) < 30)"
+
+      - name: HACKFIX for cutlass compiler bug
+        if: runner.os == 'Windows'
+        run: |
+          # See https://github.com/NVIDIA/cutlass/issues/1732
+          rm -f third_party/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
+          touch third_party/cutlass/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
+
       - name: Setup Runner
         uses: ./.github/actions/setup-build-cuda
         with:
           toolkit_type: "cuda"
-          toolkit_short_version: "124"
+          toolkit_short_version: "126"
           python: "3.9"
 
       - name: Remove internal code
@@ -59,18 +74,26 @@ jobs:
 
       - name: Install build dependencies
         run: |
-          $PY -m pip install wheel setuptools ninja torch==2.5.1 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu121
+          $PY -m pip install wheel setuptools ninja torch==2.6.0 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu126
           git config --global --add safe.directory "*"
           $PY -c "import torch; print('torch', torch.__version__)"
           $PY -c "import torch; print('torch.cuda', torch.version.cuda)"
+          ninja --version
 
       - name: Create sdist
         run: $PY setup.py sdist
 
       - name: Build from sdist
-        run: $PY -m pip install -v dist/*
+        shell: bash -l {0}
+        run: |
+          $PY -m pip install -v dist/*
 
       - name: Info
         run: |
           cd ../../  # So we don't have a folder named `xformers`
           XFORMERS_MORE_DETAILS=1 $PY -m xformers.info
+
+      # Open an SSH session on failure to debug
+      # - name: Setup tmate session
+      #   if: ${{ failure() }}
+      #   uses: mxschmitt/action-tmate@v3
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0990bd2476..bbdf675c46 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.0.29.post2] - 2025-01-31
+Pre-built binary wheels are available for PyTorch 2.6.0. Following PyTorch, we build wheels for CUDA 11.8, 12.4, and 12.6 only (we no longer build for CUDA 12.1).
+xFormers now requires PyTorch 2.6 or later.
+
+
 ## [0.0.29] - 2024-12-27
 ### Improved:
 - [fMHA] Creating a `LowerTriangularMask` no longer creates a CUDA tensor
diff --git a/README.md b/README.md
index 9332159006..bd181d87cd 100644
--- a/README.md
+++ b/README.md
@@ -25,17 +25,17 @@ xFormers is:
 
 ## Installing xFormers
 
-* **(RECOMMENDED, linux & win) Install latest stable with pip**: Requires [PyTorch 2.5.1](https://pytorch.org/get-started/locally/)
+* **(RECOMMENDED, linux & win) Install latest stable with pip**: Requires [PyTorch 2.6.0](https://pytorch.org/get-started/locally/)
 
 ```bash
 # [linux only] cuda 11.8 version
 pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu118
-# [linux only] cuda 12.1 version
-pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121
 # [linux & win] cuda 12.4 version
 pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu124
-# [linux only] (EXPERIMENTAL) rocm 6.1 version
-pip3 install -U xformers --index-url https://download.pytorch.org/whl/rocm6.1
+# [linux & win] cuda 12.6 version
+pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu126
+# [linux only] (EXPERIMENTAL) rocm 6.2.4 version
+pip3 install -U xformers --index-url https://download.pytorch.org/whl/rocm6.2.4
 ```
 
 * **Development binaries**:
diff --git a/requirements.txt b/requirements.txt
index f1fe423f23..fb1e7934ab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 # Example requirement, can be anything that pip knows
 # install with `pip install -r requirements.txt`, and make sure that CI does the same
-torch >= 2.4
+torch >= 2.6
 numpy
diff --git a/xformers/components/attention/core.py b/xformers/components/attention/core.py
index 3a201fb512..3e80e917dc 100644
--- a/xformers/components/attention/core.py
+++ b/xformers/components/attention/core.py
@@ -103,7 +103,7 @@ def _matmul_with_mask(
             repeat_factor = att.shape[0] // mask.shape[0]
             mask = mask.repeat([repeat_factor, 1, 1])
             logger.info("Mismatched batch dimensions for mask, repeating mask.")
-        att += mask
+        att += mask  # type: ignore
     return att