diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
index a35c6d42c4..ac9eaf6822 100644
--- a/.github/workflows/daily.yaml
+++ b/.github/workflows/daily.yaml
@@ -20,11 +20,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: cpu-3.11-2.2
-            container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
-            markers: not daily and (remote or not remote) and not gpu and not doctest
-            pytest_command: coverage run -m pytest
-            composer_package_name: mosaicml
           - name: cpu-3.11-2.3
             container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
             markers: not daily and (remote or not remote) and not gpu and not doctest
             pytest_command: coverage run -m pytest
             composer_package_name: mosaicml
@@ -35,8 +30,13 @@ jobs:
             markers: not daily and (remote or not remote) and not gpu and not doctest
             pytest_command: coverage run -m pytest
             composer_package_name: mosaicml
-          - name: cpu-3.11-2.4-composer
-            container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
+          - name: cpu-3.11-2.5
+            container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
+            markers: not daily and (remote or not remote) and not gpu and not doctest
+            pytest_command: coverage run -m pytest
+            composer_package_name: mosaicml
+          - name: cpu-3.11-2.5-composer
+            container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
             markers: not daily and (remote or not remote) and not gpu and not doctest
             pytest_command: coverage run -m pytest
             composer_package_name: composer
@@ -45,11 +45,6 @@ jobs:
             markers: not daily and (remote or not remote) and not gpu and doctest
             pytest_command: coverage run -m pytest tests/test_docs.py
             composer_package_name: mosaicml
-          - name: daily-cpu-3.11-2.2
-            container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
-            markers: daily and (remote or not remote) and not gpu and not doctest
-            pytest_command: coverage run -m pytest
-            composer_package_name: mosaicml
           - name: daily-cpu-3.11-2.3
             container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
             markers: daily and (remote or not remote) and not gpu and not doctest
             pytest_command: coverage run -m pytest
             composer_package_name: mosaicml
@@ -60,13 +55,18 @@ jobs:
             markers: daily and (remote or not remote) and not gpu and not doctest
             pytest_command: coverage run -m pytest
             composer_package_name: mosaicml
-          - name: daily-cpu-3.11-2.4-composer
-            container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
+          - name: daily-cpu-3.11-2.5
+            container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
+            markers: daily and (remote or not remote) and not gpu and not doctest
+            pytest_command: coverage run -m pytest
+            composer_package_name: mosaicml
+          - name: daily-cpu-3.11-2.5-composer
+            container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
             markers: daily and (remote or not remote) and not gpu and not doctest
             pytest_command: coverage run -m pytest
             composer_package_name: composer
           - name: daily-cpu-doctest
-            container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
+            container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
             markers: daily and (remote or not remote) and not gpu and doctest
             pytest_command: coverage run -m pytest tests/test_docs.py
             composer_package_name: mosaicml
@@ -107,12 +107,6 @@ jobs:
         include:
           # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
           # on MCLOUD and not eat up all GPUs at once
-          - name: "gpu-3.11-2.2-1-gpu"
-            container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
-            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-            pytest_command: "coverage run -m pytest"
-            composer_package_name: "mosaicml"
-            gpu_num: 1
           - name: "gpu-3.11-2.3-1-gpu"
            container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -125,12 +119,12 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 1 - - name: "gpu-3.11-2.2-2-gpu" - container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 + - name: "gpu-3.11-2.5-1-gpu" + container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" - gpu_num: 2 + gpu_num: 1 - name: "gpu-3.11-2.3-2-gpu" container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -143,12 +137,12 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 2 - - name: "gpu-3.11-2.2-4-gpu" - container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 + - name: "gpu-3.11-2.5-2-gpu" + container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" - gpu_num: 4 + gpu_num: 2 - name: "gpu-3.11-2.3-4-gpu" container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -161,6 +155,12 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 4 + - name: "gpu-3.11-2.5-4-gpu" + container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04 + markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" + pytest_command: "coverage run -m pytest" + composer_package_name: "mosaicml" + gpu_num: 4 steps: - name: Checkout code uses: actions/checkout@v3 diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 38ebe9d2c7..755e85ad00 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -16,10 +16,6 @@ jobs: strategy: matrix: include: - - name: cpu-3.11-2.2 - container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 - markers: not daily and not remote and not gpu and not doctest - pytest_command: coverage run -m pytest - name: cpu-3.11-2.3 container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest @@ -28,8 +24,12 @@ jobs: container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest pytest_command: coverage run -m pytest + - name: cpu-3.11-2.5 + container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04 + markers: not daily and not remote and not gpu and not doctest + pytest_command: coverage run -m pytest - name: cpu-doctest - container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py steps: diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 447f824e67..d3f1e8e90e 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -1,6 +1,6 @@ name: PR GPU tests on: - pull_request_target: + pull_request: workflow_dispatch: # Cancel old runs when a new commit is pushed to the same branch if not on main # or dev @@ -15,8 +15,8 @@ jobs: strategy: 
      matrix:
        include:
-          - name: gpu-3.11-2.4-1
-            container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
+          - name: gpu-3.11-2.5-1
+            container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
            markers: not daily and not remote and gpu and (doctest or not doctest)
            pytest_command: coverage run -m pytest
            composer_package_name: mosaicml
@@ -44,8 +44,8 @@ jobs:
    strategy:
      matrix:
        include:
-          - name: gpu-3.11-2.4-2
-            container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
+          - name: gpu-3.11-2.5-2
+            container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
            markers: not daily and not remote and gpu and (doctest or not doctest)
            pytest_command: coverage run -m pytest
            composer_package_name: mosaicml
@@ -74,8 +74,8 @@ jobs:
    strategy:
      matrix:
        include:
-          - name: gpu-3.11-2.4-4
-            container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
+          - name: gpu-3.11-2.5-4
+            container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
            markers: not daily and not remote and gpu and (doctest or not doctest)
            pytest_command: coverage run -m pytest
            composer_package_name: mosaicml
diff --git a/composer/checkpoint/state_dict.py b/composer/checkpoint/state_dict.py
index 0c9e1606d2..9a843b14cc 100644
--- a/composer/checkpoint/state_dict.py
+++ b/composer/checkpoint/state_dict.py
@@ -88,7 +88,7 @@ def get_model_state_dict(
         log.debug('Calling model.state_dict() for non-FSDP model...')
         model_state_dict = model.state_dict()
         if isinstance(model, DistributedDataParallel):
-            nn.modules.utils.consume_prefix_in_state_dict_if_present(model_state_dict, 'module.')
+            nn.modules.utils.consume_prefix_in_state_dict_if_present(model_state_dict, 'module.')  # type: ignore
     if include_keys is not None:
         model_state_dict = _extract_keys_from_state_dict(model_state_dict, include_keys)
diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py
index acf7f7d10f..72550c12e2 100644
--- a/composer/models/huggingface.py
+++ b/composer/models/huggingface.py
@@ -916,7 +916,7 @@ def write_huggingface_pretrained_from_composer_checkpoint(
             peft_config.save_pretrained(str(output_folder))
     weights_state_dict = composer_state_dict['state']['model']
-    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(weights_state_dict, prefix='model.')
+    torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(weights_state_dict, prefix='model.')  # type: ignore
     # NOTE: This only works for default adapter name, not multiple adapters
     if peft_config is not None:
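Both hunks above only add `# type: ignore` to calls of the same torch helper. For context, a minimal standalone sketch of what `consume_prefix_in_state_dict_if_present` does to a DDP-style state dict (the toy tensors and key names are illustrative, not taken from the Composer code):

```python
import torch
from torch import nn

# Toy state dict with the 'module.' prefix that DistributedDataParallel adds.
wrapped = {'module.fc.weight': torch.zeros(2, 2), 'module.fc.bias': torch.zeros(2)}

# In-place torch helper: returns None and strips the prefix when present.
nn.modules.utils.consume_prefix_in_state_dict_if_present(wrapped, 'module.')

print(sorted(wrapped))  # ['fc.bias', 'fc.weight']
```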
diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py
index 77c4d733f7..fd7a6c9df8 100644
--- a/composer/trainer/_patch_pytorch.py
+++ b/composer/trainer/_patch_pytorch.py
@@ -106,7 +106,13 @@ def patch_pytorch():
     elif version.parse(torch.__version__) < version.parse('2.4.1'):
         # Monkey patch for torch < 2.4.1 ie torch == 2.4.0
-        # No monkeypatches!
+        # No monkeypatches besides unshard (below)!
+        pass
+
+    elif version.parse(torch.__version__) < version.parse('2.5.1'):
+        # Monkey patch for torch < 2.5.1 ie torch == 2.5.0
+
+        # No monkeypatches besides unshard (below)!
         pass
@@ -1046,3 +1052,52 @@ def unshard_with_sync(self):
             raise RuntimeError('CUDA out of memory encountered on a different rank')
         padded_unsharded_flat_param = self._all_gather_flat_param(unsharded_flat_param)
         self._use_unsharded_flat_param(padded_unsharded_flat_param)
+
+if version.parse(torch.__version__) >= version.parse('2.5.0') and version.parse(
+    torch.__version__,
+) < version.parse('2.5.1'):
+
+    # Save original FlatParamHandle.unshard to revert back to when dropping automicrobatching hooks
+    from torch.distributed.fsdp._flat_param import FlatParamHandle
+    original_unshard = FlatParamHandle.unshard
+
+    @no_type_check
+    def unshard_with_sync(self):
+        """Run the unshard logic, but with a sync after a :meth:`_alloc_padded_unsharded_flat_param`.
+
+        This prevents deadlocks when some ranks OOM after the alloc call and others do not.
+        This is a patched method from pytorch, meant to be called when automicrobatching
+        turns on hooks in its search process for the optimal non-OOMing microbatch size.
+        This includes all-gathering the flat parameter
+        and switching to using the unsharded flat parameter. If the handle does
+        not need unsharding, then this only switches to using the unsharded
+        flat parameter. For ``NO_SHARD``, this is a no-op.
+        If FSDP is in :meth:`summon_full_params` and the handle uses parameter
+        mixed precision, then the parameter is forced to full precision.
+        """
+        if not self.needs_unshard():
+            # Even when not needing an unshard, we should switch to using
+            # the unsharded flat parameter
+            unsharded_flat_param = (
+                self._get_padded_unsharded_flat_param()
+                if self.uses_sharded_strategy
+                else self.flat_param
+            )
+            self._use_unsharded_flat_param(unsharded_flat_param)
+            return
+        unsharded_flat_param = self._alloc_padded_unsharded_flat_param()
+
+        # Check if any other rank hit an OOM
+        found_cuda_oom_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True)
+
+        dist.all_reduce(found_cuda_oom_tensor, reduce_operation='MAX')
+        found_cuda_oom = found_cuda_oom_tensor.item()
+        # Signal current rank is still in batch
+        all_ranks_finished_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True)
+
+        dist.all_reduce(all_ranks_finished_tensor, reduce_operation='MIN')
+
+        if found_cuda_oom == 1:
+            raise RuntimeError('CUDA out of memory encountered on a different rank')
+        padded_unsharded_flat_param = self._all_gather_flat_param(unsharded_flat_param)
+        self._use_unsharded_flat_param(padded_unsharded_flat_param)
diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index e17f5cf7a6..db7752f879 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -2317,9 +2317,11 @@ def fit(
         self.state.max_duration = duration + self.state.timestamp.get(duration.unit)
         # Raise error if callig fit with SGD
-        if type(
-            self.state.optimizers[0],
-        ) == torch.optim.SGD and version.parse(torch.__version__) >= version.parse('2.4.0'):
+        if (
+            type(self.state.optimizers[0]) == torch.optim.SGD and
+            version.parse(torch.__version__) >= version.parse('2.4.0') and
+            version.parse(torch.__version__) < version.parse('2.5.0')
+        ):
             raise ValueError(
                 'PyTorch 2.4 breaks (distributed) checkpointing with SGD. '
                 'Please use a different optimizer, e.g. composer.optim.DecoupledSGDW, '
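The narrowed guard above still points users at `composer.optim.DecoupledSGDW` on torch 2.4.x. A minimal sketch of that workaround, assuming a `ComposerModel` and a dataloader already exist (the `model` and `train_dataloader` names here are placeholders, not defined in this diff):

```python
from composer import Trainer
from composer.optim import DecoupledSGDW

# Swap torch.optim.SGD for DecoupledSGDW, as the error message above recommends.
optimizer = DecoupledSGDW(model.parameters(), lr=0.1, momentum=0.9)  # model: placeholder ComposerModel

trainer = Trainer(
    model=model,
    train_dataloader=train_dataloader,  # placeholder DataLoader
    optimizers=optimizer,
    max_duration='1ep',
)
trainer.fit()  # no SGD-specific ValueError is raised on torch 2.4.x
```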
diff --git a/docker/README.md b/docker/README.md
index fd68d04951..ca047829ad 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -30,15 +30,15 @@ To install composer, once inside the image, run `pip install mosaicml`.
 | Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags |
 |----------------|----------|-------------------|---------------------|------------------|-------------------------------------------------------------------------------------------|
-| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04` |
-| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws` |
-| Ubuntu 20.04 | Base | 2.4.1 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04` |
+| Ubuntu 20.04 | Base | 2.5.0 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04` |
+| Ubuntu 20.04 | Base | 2.5.0 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04-aws` |
+| Ubuntu 20.04 | Base | 2.5.0 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04` |
+| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04` |
+| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws` |
+| Ubuntu 20.04 | Base | 2.4.1 | cpu | 3.11 | `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04` |
 | Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04` |
 | Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws` |
 | Ubuntu 20.04 | Base | 2.3.1 | cpu | 3.11 | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04` |
-| Ubuntu 20.04 | Base | 2.2.2 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04` |
-| Ubuntu 20.04 | Base | 2.2.2 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws` |
-| Ubuntu 20.04 | Base | 2.2.2 | cpu | 3.11 | `mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04` |
 **Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws`
diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml
index 65b8e747a1..b3676f2012 100644
--- a/docker/build_matrix.yaml
+++ b/docker/build_matrix.yaml
@@ -2,129 +2,103 @@
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
   CUDA_VERSION: 12.4.1
-  IMAGE_NAME: torch-2-4-1-cu124
+  IMAGE_NAME: torch-2-5-0-cu124
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.4.1
+  PYTORCH_VERSION: 2.5.0
   TAGS:
-  - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
-  - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
+  - ghcr.io/databricks-mosaic/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest
   - ghcr.io/databricks-mosaic/pytorch:latest
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.19.1
+  TORCHVISION_VERSION: 0.20.0
 - AWS_OFI_NCCL_VERSION: v1.11.0-aws
   BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
   CUDA_VERSION: 12.4.1
-  IMAGE_NAME: torch-2-4-1-cu124-aws
+  IMAGE_NAME: torch-2-5-0-cu124-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.4.1
+  PYTORCH_VERSION: 2.5.0
   TAGS:
-  - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws
-  - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04-aws
+  - ghcr.io/databricks-mosaic/pytorch:2.5.0_cu124-python3.11-ubuntu20.04-aws
   - mosaicml/pytorch:latest-aws
   - ghcr.io/databricks-mosaic/pytorch:latest-aws
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.19.1
+  TORCHVISION_VERSION: 0.20.0
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   CUDA_VERSION: ''
-  IMAGE_NAME: torch-2-4-1-cpu
+  IMAGE_NAME: torch-2-5-0-cpu
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.4.1
+  PYTORCH_VERSION: 2.5.0
   TAGS:
-  - mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
-  - ghcr.io/databricks-mosaic/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
+  - ghcr.io/databricks-mosaic/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest_cpu
   - ghcr.io/databricks-mosaic/pytorch:latest_cpu
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.19.1
+  TORCHVISION_VERSION: 0.20.0
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-3-1-cu121
+  BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
+  CUDA_VERSION: 12.4.1
+  IMAGE_NAME: torch-2-4-1-cu124
   MOFED_VERSION: latest-23.10
-  NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
-    brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
-    brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
-    brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
-    brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
-    brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
-    brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
-    brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
-    brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
-    brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
-    brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
-    brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
-    brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
-    brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
+  NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.1
   TAGS:
-  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
-  - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
+  - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.1
 - AWS_OFI_NCCL_VERSION: v1.11.0-aws
-  BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-3-1-cu121-aws
+  BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
+  CUDA_VERSION: 12.4.1
+  IMAGE_NAME: torch-2-4-1-cu124-aws
   MOFED_VERSION: ''
-  NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
-    brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
-    brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
-    brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
-    brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
-    brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
-    brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
-    brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
-    brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
-    brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
-    brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
-    brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
-    brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
-    brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
+  NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.1
   TAGS:
-  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
-  - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws
+  - ghcr.io/databricks-mosaic/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.17.2
+  TORCHVISION_VERSION: 0.18.1
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   CUDA_VERSION: ''
-  IMAGE_NAME: torch-2-3-1-cpu
+  IMAGE_NAME: torch-2-4-1-cpu
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.1
   TAGS:
-  - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
-  - ghcr.io/databricks-mosaic/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
+  - ghcr.io/databricks-mosaic/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.1
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-2-2-cu121
+  IMAGE_NAME: torch-2-3-1-cu121
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -143,16 +117,16 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.2.2
+  PYTORCH_VERSION: 2.3.1
   TAGS:
-  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04
-  - ghcr.io/databricks-mosaic/pytorch:2.2.2_cu121-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+  - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.17.2
+  TORCHVISION_VERSION: 0.18.1
 - AWS_OFI_NCCL_VERSION: v1.11.0-aws
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-2-2-cu121-aws
+  IMAGE_NAME: torch-2-3-1-cu121-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -171,27 +145,27 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.2.2
+  PYTORCH_VERSION: 2.3.1
   TAGS:
-  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws
-  - ghcr.io/databricks-mosaic/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
+  - ghcr.io/databricks-mosaic/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.17.2
+  TORCHVISION_VERSION: 0.18.1
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   CUDA_VERSION: ''
-  IMAGE_NAME: torch-2-2-2-cpu
+  IMAGE_NAME: torch-2-3-1-cpu
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.2.2
+  PYTORCH_VERSION: 2.3.1
   TAGS:
-  - mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04
-  - ghcr.io/databricks-mosaic/pytorch:2.2.2_cpu-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
+  - ghcr.io/databricks-mosaic/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.17.2
+  TORCHVISION_VERSION: 0.18.1
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.25.0
@@ -202,14 +176,14 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.4.1
+  PYTORCH_VERSION: 2.5.0
   TAGS:
   - mosaicml/composer:0.25.0
   - ghcr.io/databricks-mosaic/composer:0.25.0
   - mosaicml/composer:latest
   - ghcr.io/databricks-mosaic/composer:latest
   TARGET: composer_stage
-  TORCHVISION_VERSION: 0.19.1
+  TORCHVISION_VERSION: 0.20.0
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.25.0
@@ -220,11 +194,11 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.4.1
+  PYTORCH_VERSION: 2.5.0
   TAGS:
   - mosaicml/composer:0.25.0_cpu
   - ghcr.io/databricks-mosaic/composer:0.25.0_cpu
   - mosaicml/composer:latest_cpu
   - ghcr.io/databricks-mosaic/composer:latest_cpu
   TARGET: composer_stage
-  TORCHVISION_VERSION: 0.19.1
+  TORCHVISION_VERSION: 0.20.0
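build_matrix.yaml appears to be generated by docker/generate_build_matrix.py rather than edited by hand, so the entries above should line up with the version maps changed in the next file. A hedged sanity check, assuming the private helpers keep the signatures shown below and that the script is run from the docker/ directory so it is importable as a plain module:

```python
# Illustrative check of the new 2.5.0 mappings; module/helper names are taken from the diff below.
from generate_build_matrix import _get_cuda_version, _get_torchvision_version

assert _get_torchvision_version('2.5.0') == '0.20.0'
assert _get_cuda_version('2.5.0', use_cuda=True) == '12.4.1'
assert _get_cuda_version('2.5.0', use_cuda=False) == ''
```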
diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py
index a3336a3d19..4a931ec2e7 100644
--- a/docker/generate_build_matrix.py
+++ b/docker/generate_build_matrix.py
@@ -20,16 +20,16 @@
 import yaml
 PRODUCTION_PYTHON_VERSION = '3.11'
-PRODUCTION_PYTORCH_VERSION = '2.4.1'
+PRODUCTION_PYTORCH_VERSION = '2.5.0'
 def _get_torchvision_version(pytorch_version: str):
+    if pytorch_version == '2.5.0':
+        return '0.20.0'
     if pytorch_version == '2.4.1':
         return '0.19.1'
     if pytorch_version == '2.3.1':
         return '0.18.1'
-    if pytorch_version == '2.2.2':
-        return '0.17.2'
     raise ValueError(f'Invalid pytorch_version: {pytorch_version}')
@@ -45,12 +45,12 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool):
     # From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/
     if not use_cuda:
         return ''
+    if pytorch_version == '2.5.0':
+        return '12.4.1'
     if pytorch_version == '2.4.1':
         return '12.4.1'
     if pytorch_version == '2.3.1':
         return '12.1.1'
-    if pytorch_version == '2.2.2':
-        return '12.1.1'
     raise ValueError(f'Invalid pytorch_version: {pytorch_version}')
@@ -180,7 +180,7 @@ def _write_table(table_tag: str, table_contents: str):
 def _main():
-    python_pytorch_versions = [('3.11', '2.4.1'), ('3.11', '2.3.1'), ('3.11', '2.2.2')]
+    python_pytorch_versions = [('3.11', '2.5.0'), ('3.11', '2.4.1'), ('3.11', '2.3.1')]
     cuda_options = [True, False]
     stages = ['pytorch_stage']
     interconnects = ['mellanox', 'EFA']  # mellanox is default, EFA needed for AWS
diff --git a/setup.py b/setup.py
index 1913efeb36..4782238eca 100644
--- a/setup.py
+++ b/setup.py
@@ -80,8 +80,8 @@ def package_files(prefix: str, directory: str, extension: str):
     'tqdm>=4.62.3,<5',
     'torchmetrics>=1.4.0.post0,<1.4.1',
     'torch_optimizer>=0.3.0,<0.4',
-    'torchvision>=0.14.0,<0.19.2',
-    'torch>=2.2.0,<2.4.2',
+    'torchvision>=0.18.0,<0.20.1',
+    'torch>=2.3.0,<2.5.1',
     'requests>=2.26.0,<3',
     'numpy>=1.21.5,<2.2.0',
     'psutil>=5.8.0,<7',
diff --git a/tests/checkpoint/helpers.py b/tests/checkpoint/helpers.py
index b77741ae46..52838c9aa5 100644
--- a/tests/checkpoint/helpers.py
+++ b/tests/checkpoint/helpers.py
@@ -8,7 +8,7 @@
 from packaging import version
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp.api import CPUOffload
-from torch.optim import adam
+from torch.optim import Adam
 from torch.optim.lr_scheduler import StepLR
 from torch.utils.data import DataLoader
@@ -183,7 +183,7 @@ def init_optimizer(
     inputs = torch.randn(batch_size, num_features, device=device)
     targets = torch.randint(low=0, high=num_classes, size=(batch_size,), device=device, dtype=torch.long)
     batch = (inputs, targets) if use_composer_model else inputs
-    optimizer = adam.Adam(model.parameters())
+    optimizer = Adam(model.parameters())
     outputs = model(batch)
     loss = loss_fn(outputs, targets)
     loss.backward()
diff --git a/tests/trainer/test_checkpoint.py b/tests/trainer/test_checkpoint.py
index d2679c2868..3e93ce56b3 100644
--- a/tests/trainer/test_checkpoint.py
+++ b/tests/trainer/test_checkpoint.py
@@ -417,8 +417,11 @@ def test_checkpoint_saver_properly_constructed(
     # See https://github.com/pytorch/pytorch/issues/133415
     @pytest.mark.xfail
     @pytest.mark.skipif(
-        version.parse(torch.__version__) < version.parse('2.4.0'),
-        reason='Test only applies to PyTorch 2.4+',
+        (
+            version.parse(torch.__version__) < version.parse('2.4.0') or
+            version.parse(torch.__version__) >= version.parse('2.5.0')
+        ),
+        reason='Test only applies to PyTorch 2.4.x',
     )
     def test_sgd_checkpoint(
         self,