From 3ff6292119209868eeeba590a661b6a4e92b2af3 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 12:14:53 -0800 Subject: [PATCH 01/68] Added doc for nvdec --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 545ddf9c..2407915d 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -11,7 +11,7 @@ defaults: jobs: build: - runs-on: ubuntu-latest + runs-on: linux.g5.4xlarge.nvidia.gpu strategy: fail-fast: false steps: From 1fd5a10d1ec32eae0a5f44d2c063760cb6bc65ef Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 12:18:16 -0800 Subject: [PATCH 02/68] . --- .github/workflows/docs.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 2407915d..701bc54a 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -30,8 +30,8 @@ jobs: run: | # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 - python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu - conda install "ffmpeg=7.0.1" pkg-config -c conda-forge + python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 + conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge ffmpeg -version - name: Build and install torchcodec run: | From fa3e3b962f381b4497831e760d66bed1ae42f721 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 12:20:47 -0800 Subject: [PATCH 03/68] . --- .github/workflows/cpp_tests.yaml | 2 +- .github/workflows/linux_cuda_wheel.yaml | 2 +- .github/workflows/linux_wheel.yaml | 2 +- .github/workflows/macos_wheel.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cpp_tests.yaml b/.github/workflows/cpp_tests.yaml index b2b19a78..453f5bc2 100644 --- a/.github/workflows/cpp_tests.yaml +++ b/.github/workflows/cpp_tests.yaml @@ -3,7 +3,7 @@ name: CPP tests on: push: branches: [ main ] - pull_request: + # pull_request: concurrency: group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml index 915c5236..7bb57f76 100644 --- a/.github/workflows/linux_cuda_wheel.yaml +++ b/.github/workflows/linux_cuda_wheel.yaml @@ -1,7 +1,7 @@ name: Build and test Linux CUDA wheels on: - pull_request: + #pull_request: push: branches: - nightly diff --git a/.github/workflows/linux_wheel.yaml b/.github/workflows/linux_wheel.yaml index 38f25733..5cc75c9a 100644 --- a/.github/workflows/linux_wheel.yaml +++ b/.github/workflows/linux_wheel.yaml @@ -1,7 +1,7 @@ name: Build and test Linux wheel on: - pull_request: + #pull_request: push: branches: - nightly diff --git a/.github/workflows/macos_wheel.yaml b/.github/workflows/macos_wheel.yaml index ef637194..45ccdb4d 100644 --- a/.github/workflows/macos_wheel.yaml +++ b/.github/workflows/macos_wheel.yaml @@ -1,7 +1,7 @@ name: Build and test MacOS wheel on: - pull_request: + #pull_request: push: branches: - nightly From 36a54209d7b6223e9366dba1312cdb38e1bd2090 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 12:24:41 -0800 Subject: [PATCH 04/68] . 
--- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 701bc54a..1e3ced08 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,7 +31,7 @@ jobs: # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 - conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge + conda install "ffmpeg=7.0.1" pkg-config cmake nvidia/label/cuda-12.4.0::cuda-toolkit -c conda-forge ffmpeg -version - name: Build and install torchcodec run: | From f49baca49a5bea34781d81f2da178ccb89f4393f Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 12:25:30 -0800 Subject: [PATCH 05/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 1e3ced08..6424d508 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,7 +31,7 @@ jobs: # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 - conda install "ffmpeg=7.0.1" pkg-config cmake nvidia/label/cuda-12.4.0::cuda-toolkit -c conda-forge + conda install "ffmpeg=7.0.1" pkg-config cmake nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-runtime -c conda-forge ffmpeg -version - name: Build and install torchcodec run: | From f087a913830abc9eac762a7d6760fbdcdaf7a10a Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 12:26:15 -0800 Subject: [PATCH 06/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 6424d508..83f462e3 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,7 +31,7 @@ jobs: # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 - conda install "ffmpeg=7.0.1" pkg-config cmake nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-runtime -c conda-forge + conda install "ffmpeg=7.0.1" pkg-config cmake "nvidia/label/cuda-12.4.0::cuda-toolkit" "nvidia/label/cuda-12.4.0::cuda-runtime" -c conda-forge ffmpeg -version - name: Build and install torchcodec run: | From 5092418ebbdeefd17b086fa5d9480187b34f770f Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 13:59:14 -0800 Subject: [PATCH 07/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 83f462e3..65fb4e7b 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,7 +31,7 @@ jobs: # TODO: torchvision and torchaudio shouldn't be needed. 
They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 - conda install "ffmpeg=7.0.1" pkg-config cmake "nvidia/label/cuda-12.4.0::cuda-toolkit" "nvidia/label/cuda-12.4.0::cuda-runtime" -c conda-forge + conda install "ffmpeg=7.0.1" pkg-config cmake cudatoolkit==12.4.0 -c nvidia -c conda-forge ffmpeg -version - name: Build and install torchcodec run: | From 243e2ca09005f5d986019e7b746df256d41a796b Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 14:10:16 -0800 Subject: [PATCH 08/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 65fb4e7b..1e3ced08 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,7 +31,7 @@ jobs: # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 - conda install "ffmpeg=7.0.1" pkg-config cmake cudatoolkit==12.4.0 -c nvidia -c conda-forge + conda install "ffmpeg=7.0.1" pkg-config cmake nvidia/label/cuda-12.4.0::cuda-toolkit -c conda-forge ffmpeg -version - name: Build and install torchcodec run: | From 7c6c033a9d2d0d6b6b9dcd85f8518b88f4102537 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 14:15:37 -0800 Subject: [PATCH 09/68] . --- .github/workflows/docs.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 1e3ced08..2db1d19b 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,7 +31,8 @@ jobs: # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 - conda install "ffmpeg=7.0.1" pkg-config cmake nvidia/label/cuda-12.4.0::cuda-toolkit -c conda-forge + conda install nvidia/label/cuda-12.4.0::cuda-toolkit + conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge ffmpeg -version - name: Build and install torchcodec run: | From e40ec7a9e973bc5ed16f7f71171a5dafd408110f Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 14:28:57 -0800 Subject: [PATCH 10/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 2db1d19b..e84d36a0 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,7 +31,7 @@ jobs: # TODO: torchvision and torchaudio shouldn't be needed. 
They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 - conda install nvidia/label/cuda-12.4.0::cuda-toolkit + conda install nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-runtime conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge ffmpeg -version - name: Build and install torchcodec From bb4bff96d7ccc203d228193319db61681e8c31fd Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 14:35:33 -0800 Subject: [PATCH 11/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index e84d36a0..24d768eb 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,7 +31,7 @@ jobs: # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 - conda install nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-runtime + conda install nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-runtime nvidia/label/cuda-12.4.0::cuda-cudart nvidia/label/cuda-12.4.0::cuda-nvrtc-dev conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge ffmpeg -version - name: Build and install torchcodec From e8a5b07488871be8ae10bf908b5554e0585a7a2a Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 14:42:05 -0800 Subject: [PATCH 12/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 24d768eb..ed72440e 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,7 +31,7 @@ jobs: # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 - conda install nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-runtime nvidia/label/cuda-12.4.0::cuda-cudart nvidia/label/cuda-12.4.0::cuda-nvrtc-dev + conda install nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-cudart nvidia/label/cuda-12.4.0::libnpp conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge ffmpeg -version - name: Build and install torchcodec From c9d54a4a9cdc3b37756b5f100b47cb925958caef Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 14:48:47 -0800 Subject: [PATCH 13/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index ed72440e..57c3006d 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -30,9 +30,9 @@ jobs: run: | # TODO: torchvision and torchaudio shouldn't be needed. 
They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 - python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 conda install nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-cudart nvidia/label/cuda-12.4.0::libnpp conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge + python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 ffmpeg -version - name: Build and install torchcodec run: | From fb633e439601a4d953d236550a8243a077a0c48b Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 16:31:17 -0800 Subject: [PATCH 14/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 57c3006d..f9e37858 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -30,7 +30,7 @@ jobs: run: | # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 - conda install nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-cudart nvidia/label/cuda-12.4.0::libnpp + conda install nvidia/label/cuda-12.4.0::cuda nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-cudart nvidia/label/cuda-12.4.0::libnpp conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 ffmpeg -version From 9e334cd96783b24ae1ae1630a5ac63894a8ba8b2 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 16:37:24 -0800 Subject: [PATCH 15/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index f9e37858..db2e0717 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -30,7 +30,7 @@ jobs: run: | # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 - conda install nvidia/label/cuda-12.4.0::cuda nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-cudart nvidia/label/cuda-12.4.0::libnpp + conda install nvidia/label/cuda-12.4.1::cuda nvidia/label/cuda-12.4.1::cuda-toolkit nvidia/label/cuda-12.4.1::cuda-cudart nvidia/label/cuda-12.4.1::libnpp conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 ffmpeg -version From c107e02c9c7b41b357df6e23b9639f1df98a4bf0 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 16:44:46 -0800 Subject: [PATCH 16/68] . 
--- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index db2e0717..84d3de8f 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -23,7 +23,7 @@ jobs: auto-update-conda: true miniconda-version: "latest" activate-environment: test - python-version: '3.12' + python-version: '3.9' - name: Update pip run: python -m pip install --upgrade pip - name: Install dependencies and FFmpeg From 885c43fc3f41a6aeb42b0c2fc8344697cc147d67 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 16:45:18 -0800 Subject: [PATCH 17/68] . --- .github/workflows/docs.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 84d3de8f..4556c419 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -36,6 +36,8 @@ jobs: ffmpeg -version - name: Build and install torchcodec run: | + find $CONDA_PREFIX/lib + find $CONDA_PREFIX/lib64 python -m pip install -e ".[dev]" --no-build-isolation -vvv - name: Install doc dependencies run: | From dd937c68b9ab1097f30b6138aab444b04b2dda31 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 16:45:37 -0800 Subject: [PATCH 18/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 4556c419..0d413025 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -38,7 +38,7 @@ jobs: run: | find $CONDA_PREFIX/lib find $CONDA_PREFIX/lib64 - python -m pip install -e ".[dev]" --no-build-isolation -vvv + ENABLE_CUDA=1 python -m pip install -e ".[dev]" --no-build-isolation -vvv - name: Install doc dependencies run: | cd docs From bab07dbc651013803b72bef7eefcb2aa11be062c Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 16:53:46 -0800 Subject: [PATCH 19/68] . --- .github/workflows/docs.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 0d413025..f1ef63c0 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -36,8 +36,8 @@ jobs: ffmpeg -version - name: Build and install torchcodec run: | - find $CONDA_PREFIX/lib - find $CONDA_PREFIX/lib64 + find $CONDA_PREFIX/lib -type f -iname libnvtoolsext\*.so | xargs ldd || true + find $CONDA_PREFIX/lib64 -type f -iname libnvtoolsext\*.so | xargs ldd || true ENABLE_CUDA=1 python -m pip install -e ".[dev]" --no-build-isolation -vvv - name: Install doc dependencies run: | From 60b06e1f0c9a2b484a7e11d97f8c901300a4899c Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 17:01:58 -0800 Subject: [PATCH 20/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index f1ef63c0..cdd7cd9d 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -30,7 +30,7 @@ jobs: run: | # TODO: torchvision and torchaudio shouldn't be needed. 
They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 - conda install nvidia/label/cuda-12.4.1::cuda nvidia/label/cuda-12.4.1::cuda-toolkit nvidia/label/cuda-12.4.1::cuda-cudart nvidia/label/cuda-12.4.1::libnpp + conda install nvidia/label/cuda-12.4.1::cuda nvidia/label/cuda-12.4.1::cuda-toolkit nvidia/label/cuda-12.4.1::cuda-cudart conda install nvidia/label/cuda-12.4.1::cuda-nvtx nvidia/label/cuda-12.4.1::libnpp conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 ffmpeg -version From 904bfa33437cb5f934302937f18993f619dccb8f Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 17:28:17 -0800 Subject: [PATCH 21/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index cdd7cd9d..1aa556ff 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -30,7 +30,7 @@ jobs: run: | # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 - conda install nvidia/label/cuda-12.4.1::cuda nvidia/label/cuda-12.4.1::cuda-toolkit nvidia/label/cuda-12.4.1::cuda-cudart conda install nvidia/label/cuda-12.4.1::cuda-nvtx nvidia/label/cuda-12.4.1::libnpp + conda install nvidia/label/cuda-12.4.1::cuda nvidia/label/cuda-12.4.1::cuda-toolkit nvidia/label/cuda-12.4.1::cuda-cudart nvidia/label/cuda-12.4.1::cuda-nvtx nvidia/label/cuda-12.4.1::libnpp conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 ffmpeg -version From 75e76ee559c7c762d777a71037c0cebdf86bd1e7 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 17:37:31 -0800 Subject: [PATCH 22/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 1aa556ff..87d16b80 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -32,7 +32,7 @@ jobs: # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 conda install nvidia/label/cuda-12.4.1::cuda nvidia/label/cuda-12.4.1::cuda-toolkit nvidia/label/cuda-12.4.1::cuda-cudart nvidia/label/cuda-12.4.1::cuda-nvtx nvidia/label/cuda-12.4.1::libnpp conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge - python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124 + conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch-nightly -c nvidia ffmpeg -version - name: Build and install torchcodec run: | From 16218ac5be71abff55007cad2ef4b54df0a42a95 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 17:46:12 -0800 Subject: [PATCH 23/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 87d16b80..b038352d 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,8 +31,8 @@ jobs: # TODO: torchvision and torchaudio shouldn't be needed. 
They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 conda install nvidia/label/cuda-12.4.1::cuda nvidia/label/cuda-12.4.1::cuda-toolkit nvidia/label/cuda-12.4.1::cuda-cudart nvidia/label/cuda-12.4.1::cuda-nvtx nvidia/label/cuda-12.4.1::libnpp - conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch-nightly -c nvidia + conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge ffmpeg -version - name: Build and install torchcodec run: | From e8f0128c56e82d9b77c10fec2c4641aacefe6128 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 17:55:21 -0800 Subject: [PATCH 24/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index b038352d..7017c121 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -30,8 +30,8 @@ jobs: run: | # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 - conda install nvidia/label/cuda-12.4.1::cuda nvidia/label/cuda-12.4.1::cuda-toolkit nvidia/label/cuda-12.4.1::cuda-cudart nvidia/label/cuda-12.4.1::cuda-nvtx nvidia/label/cuda-12.4.1::libnpp conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch-nightly -c nvidia + conda install nvidia/label/cuda-12.4.1::cuda nvidia/label/cuda-12.4.1::cuda-toolkit nvidia/label/cuda-12.4.1::cuda-cudart nvidia/label/cuda-12.4.1::cuda-nvtx nvidia/label/cuda-12.4.1::libnpp conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge ffmpeg -version - name: Build and install torchcodec From 9c36f4ecf095aa9640194010cc156fb63a64d29d Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 18:37:57 -0800 Subject: [PATCH 25/68] . --- .github/workflows/docs.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 7017c121..ce20b436 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -38,6 +38,7 @@ jobs: run: | find $CONDA_PREFIX/lib -type f -iname libnvtoolsext\*.so | xargs ldd || true find $CONDA_PREFIX/lib64 -type f -iname libnvtoolsext\*.so | xargs ldd || true + find $CONDA_PREFIX -name cuda_cmake_macros.h ENABLE_CUDA=1 python -m pip install -e ".[dev]" --no-build-isolation -vvv - name: Install doc dependencies run: | From 24064356227fb8f7909d6c78f288e8a03d8d82c3 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 18:46:39 -0800 Subject: [PATCH 26/68] . 
--- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index ce20b436..74017863 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -32,7 +32,7 @@ jobs: # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch-nightly -c nvidia conda install nvidia/label/cuda-12.4.1::cuda nvidia/label/cuda-12.4.1::cuda-toolkit nvidia/label/cuda-12.4.1::cuda-cudart nvidia/label/cuda-12.4.1::cuda-nvtx nvidia/label/cuda-12.4.1::libnpp - conda install "ffmpeg=7.0.1" pkg-config cmake -c conda-forge + conda install "ffmpeg=7.0.1" pkg-config cmake compilers -c conda-forge ffmpeg -version - name: Build and install torchcodec run: | From 7b78be320de9d769e799555f228fc2b374b8dc5e Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 18:49:37 -0800 Subject: [PATCH 27/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 74017863..6264b85c 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,7 +31,7 @@ jobs: # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch-nightly -c nvidia - conda install nvidia/label/cuda-12.4.1::cuda nvidia/label/cuda-12.4.1::cuda-toolkit nvidia/label/cuda-12.4.1::cuda-cudart nvidia/label/cuda-12.4.1::cuda-nvtx nvidia/label/cuda-12.4.1::libnpp + conda install nvidia/label/cuda-12.4.0::cuda nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-cudart nvidia/label/cuda-12.4.0::cuda-nvtx nvidia/label/cuda-12.4.0::libnpp conda install "ffmpeg=7.0.1" pkg-config cmake compilers -c conda-forge ffmpeg -version - name: Build and install torchcodec From 20c6fba6e96cce7e6e79d8d7b193e7f21afda32e Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 19:04:47 -0800 Subject: [PATCH 28/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 6264b85c..10c599d8 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,7 +31,7 @@ jobs: # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch-nightly -c nvidia - conda install nvidia/label/cuda-12.4.0::cuda nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-cudart nvidia/label/cuda-12.4.0::cuda-nvtx nvidia/label/cuda-12.4.0::libnpp + # conda install nvidia/label/cuda-12.4.0::cuda nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-cudart nvidia/label/cuda-12.4.0::cuda-nvtx nvidia/label/cuda-12.4.0::libnpp conda install "ffmpeg=7.0.1" pkg-config cmake compilers -c conda-forge ffmpeg -version - name: Build and install torchcodec From 7630fddf07a8f7b64b7efd548094fccce6296019 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 19:13:15 -0800 Subject: [PATCH 29/68] . 
--- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 10c599d8..32ac23cf 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -30,7 +30,7 @@ jobs: run: | # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 - conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch-nightly -c nvidia + conda install pytorch torchvision torchaudio pytorch-cuda=12.4 cudatoolkit=12.4 -c pytorch-nightly -c nvidia # conda install nvidia/label/cuda-12.4.0::cuda nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-cudart nvidia/label/cuda-12.4.0::cuda-nvtx nvidia/label/cuda-12.4.0::libnpp conda install "ffmpeg=7.0.1" pkg-config cmake compilers -c conda-forge ffmpeg -version From 37bfa5c3935cc512bcd42bcb389db4592380eb2b Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Tue, 5 Nov 2024 19:17:00 -0800 Subject: [PATCH 30/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 32ac23cf..dd384025 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -30,7 +30,7 @@ jobs: run: | # TODO: torchvision and torchaudio shouldn't be needed. They were only added # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 - conda install pytorch torchvision torchaudio pytorch-cuda=12.4 cudatoolkit=12.4 -c pytorch-nightly -c nvidia + conda install pytorch torchvision torchaudio pytorch-cuda=12.4 cuda-toolkit=12.4 -c pytorch-nightly -c nvidia # conda install nvidia/label/cuda-12.4.0::cuda nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-cudart nvidia/label/cuda-12.4.0::cuda-nvtx nvidia/label/cuda-12.4.0::libnpp conda install "ffmpeg=7.0.1" pkg-config cmake compilers -c conda-forge ffmpeg -version From 24f28432c589060e4c489d890f7e7fa06e2ef562 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Wed, 6 Nov 2024 12:58:33 -0800 Subject: [PATCH 31/68] . --- .github/workflows/docs.yaml | 11 +- packaging/cuda12.4.yaml | 289 ++++++++++++++++++++++++++++++++++++ 2 files changed, 291 insertions(+), 9 deletions(-) create mode 100644 packaging/cuda12.4.yaml diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index dd384025..b4eadcac 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -23,22 +23,15 @@ jobs: auto-update-conda: true miniconda-version: "latest" activate-environment: test - python-version: '3.9' + python-version: '3.12' - name: Update pip run: python -m pip install --upgrade pip - name: Install dependencies and FFmpeg run: | - # TODO: torchvision and torchaudio shouldn't be needed. 
They were only added - # to silence an error as seen in https://github.com/pytorch/torchcodec/issues/203 - conda install pytorch torchvision torchaudio pytorch-cuda=12.4 cuda-toolkit=12.4 -c pytorch-nightly -c nvidia - # conda install nvidia/label/cuda-12.4.0::cuda nvidia/label/cuda-12.4.0::cuda-toolkit nvidia/label/cuda-12.4.0::cuda-cudart nvidia/label/cuda-12.4.0::cuda-nvtx nvidia/label/cuda-12.4.0::libnpp - conda install "ffmpeg=7.0.1" pkg-config cmake compilers -c conda-forge + conda env update --file packaging/cuda12.4.yaml ffmpeg -version - name: Build and install torchcodec run: | - find $CONDA_PREFIX/lib -type f -iname libnvtoolsext\*.so | xargs ldd || true - find $CONDA_PREFIX/lib64 -type f -iname libnvtoolsext\*.so | xargs ldd || true - find $CONDA_PREFIX -name cuda_cmake_macros.h ENABLE_CUDA=1 python -m pip install -e ".[dev]" --no-build-isolation -vvv - name: Install doc dependencies run: | diff --git a/packaging/cuda12.4.yaml b/packaging/cuda12.4.yaml new file mode 100644 index 00000000..7666f0cb --- /dev/null +++ b/packaging/cuda12.4.yaml @@ -0,0 +1,289 @@ +name: cuda4 +channels: + - pytorch-nightly + - nvidia + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_kmp_llvm + - aom=3.9.1=hac33072_0 + - archspec=0.2.3=pyhd3eb1b0_0 + - blas=1.0=mkl + - boltons=23.0.0=py312h06a4308_0 + - brotli-python=1.0.9=py312h6a678d5_8 + - bzip2=1.0.8=h5eee18b_6 + - c-ares=1.34.2=heb4867d_0 + - ca-certificates=2024.9.24=h06a4308_0 + - cairo=1.18.0=hebfffa5_3 + - certifi=2024.8.30=py312h06a4308_0 + - cffi=1.17.1=py312h1fdaa30_0 + - charset-normalizer=3.3.2=pyhd3eb1b0_0 + - cmake=3.30.5=hf9cb763_0 + - conda=24.9.2=py312h7900ff3_0 + - conda-libmamba-solver=24.9.0=pyhd3eb1b0_0 + - conda-package-handling=2.3.0=py312h06a4308_0 + - conda-package-streaming=0.10.0=py312h06a4308_0 + - cuda=12.4.0=0 + - cuda-cccl=12.4.127=0 + - cuda-command-line-tools=12.4.1=0 + - cuda-compiler=12.6.2=0 + - cuda-cudart=12.4.127=0 + - cuda-cudart-dev=12.4.127=0 + - cuda-cudart-static=12.4.127=0 + - cuda-cuobjdump=12.4.127=0 + - cuda-cupti=12.4.127=0 + - cuda-cupti-static=12.4.127=0 + - cuda-cuxxfilt=12.4.127=0 + - cuda-demo-suite=12.4.127=0 + - cuda-documentation=12.4.127=0 + - cuda-driver-dev=12.4.127=0 + - cuda-gdb=12.4.127=0 + - cuda-libraries=12.4.1=0 + - cuda-libraries-dev=12.6.0=0 + - cuda-libraries-static=12.4.1=0 + - cuda-nsight=12.4.127=0 + - cuda-nvcc=12.4.131=0 + - cuda-nvdisasm=12.4.127=0 + - cuda-nvml-dev=12.4.127=0 + - cuda-nvprof=12.4.127=0 + - cuda-nvprune=12.4.127=0 + - cuda-nvrtc=12.4.127=0 + - cuda-nvrtc-dev=12.4.127=0 + - cuda-nvrtc-static=12.4.127=0 + - cuda-nvtx=12.4.127=0 + - cuda-nvvp=12.4.127=0 + - cuda-opencl=12.4.127=0 + - cuda-opencl-dev=12.4.127=0 + - cuda-profiler-api=12.4.127=0 + - cuda-runtime=12.4.0=0 + - cuda-sanitizer-api=12.4.127=0 + - cuda-toolkit=12.4.0=0 + - cuda-tools=12.4.1=0 + - cuda-version=11.8=h70ddcb2_3 + - cuda-visual-tools=12.6.0=0 + - cudatoolkit=11.8.0=h4ba93d1_13 + - cudnn=9.3.0.75=hc149ed2_0 + - dav1d=1.2.1=hd590300_0 + - distro=1.9.0=py312h06a4308_0 + - expat=2.6.3=h6a678d5_0 + - ffmpeg=7.1.0=gpl_h2e64a5a_503 + - filelock=3.13.1=py312h06a4308_0 + - fmt=9.1.0=hdb19cb5_1 + - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 + - font-ttf-inconsolata=3.000=h77eed37_0 + - font-ttf-source-code-pro=2.038=h77eed37_0 + - font-ttf-ubuntu=0.83=h77eed37_3 + - fontconfig=2.15.0=h7e30c49_1 + - fonts-conda-ecosystem=1=0 + - fonts-conda-forge=1=0 + - freetype=2.12.1=h267a509_2 + - fribidi=1.0.10=h36c2ea0_0 + - frozendict=2.4.2=py312h06a4308_0 + - 
fsspec=2024.10.0=pyhff2d567_0 + - gdk-pixbuf=2.42.12=hb9ae30d_0 + - gds-tools=1.9.1.3=0 + - giflib=5.2.2=h5eee18b_0 + - gmp=6.3.0=hac33072_2 + - gnutls=3.6.15=he1e5248_0 + - graphite2=1.3.14=h295c915_1 + - harfbuzz=9.0.0=hda332d3_1 + - icu=75.1=he02047a_0 + - idna=3.7=py312h06a4308_0 + - intel-openmp=2022.0.1=h06a4308_3633 + - jinja2=3.1.4=py312h06a4308_1 + - jsonpatch=1.33=py312h06a4308_1 + - jsonpointer=2.1=pyhd3eb1b0_0 + - kaldi=5.5.1112=cpu_hd7b63f8_5 + - kernel-headers_linux-64=3.10.0=he073ed8_18 + - keyutils=1.6.1=h166bdaf_0 + - krb5=1.21.3=h659f571_0 + - lame=3.100=h7b6447c_0 + - lcms2=2.16=hb7c19ff_0 + - ld_impl_linux-64=2.40=h12ee557_0 + - lerc=4.0.0=h27087fc_0 + - libabseil=20240722.0=cxx17_h5888daf_1 + - libarchive=3.7.4=hfca40fe_0 + - libass=0.17.3=h1dc1e6a_0 + - libblas=3.9.0=20_linux64_mkl + - libcblas=3.9.0=20_linux64_mkl + - libcublas=12.4.5.8=0 + - libcublas-dev=12.4.5.8=0 + - libcublas-static=12.4.5.8=0 + - libcufft=11.2.1.3=0 + - libcufft-dev=11.2.1.3=0 + - libcufft-static=11.2.1.3=0 + - libcufile=1.9.1.3=0 + - libcufile-dev=1.9.1.3=0 + - libcufile-static=1.9.1.3=0 + - libcurand=10.3.5.147=0 + - libcurand-dev=10.3.5.147=0 + - libcurand-static=10.3.5.147=0 + - libcurl=8.11.0=hbbe4b11_0 + - libcusolver=11.6.1.9=0 + - libcusolver-dev=11.6.1.9=0 + - libcusolver-static=11.6.1.9=0 + - libcusparse=12.3.1.170=0 + - libcusparse-dev=12.3.1.170=0 + - libcusparse-static=12.3.1.170=0 + - libdeflate=1.22=hb9d3cd8_0 + - libdrm=2.4.123=hb9d3cd8_0 + - libedit=3.1.20230828=h5eee18b_0 + - libegl=1.7.0=ha4b6fd6_1 + - libev=4.33=h7f8727e_1 + - libexpat=2.6.3=h5888daf_0 + - libffi=3.4.4=h6a678d5_1 + - libgcc=14.2.0=h77fa898_1 + - libgcc-ng=14.2.0=h69a702a_1 + - libgfortran=14.2.0=h69a702a_1 + - libgfortran5=14.2.0=hd5240d6_1 + - libgl=1.7.0=ha4b6fd6_1 + - libglib=2.82.2=h2ff4ddf_0 + - libglvnd=1.7.0=ha4b6fd6_1 + - libglx=1.7.0=ha4b6fd6_1 + - libgomp=14.2.0=h77fa898_1 + - libhwloc=2.11.1=default_hecaa2ac_1000 + - libiconv=1.17=hd590300_2 + - libidn2=2.3.4=h5eee18b_0 + - libjpeg-turbo=3.0.3=h5eee18b_0 + - liblapack=3.9.0=20_linux64_mkl + - liblapacke=3.9.0=20_linux64_mkl + - libmagma=2.8.0=hfdb99dd_0 + - libmagma_sparse=2.8.0=h9ddd185_0 + - libmamba=1.5.8=hfe524e5_3 + - libmambapy=1.5.8=py312h2dafd23_3 + - libnghttp2=1.64.0=h161d5f1_0 + - libnpp=12.2.5.30=0 + - libnpp-dev=12.2.5.30=0 + - libnpp-static=12.2.5.30=0 + - libnsl=2.0.1=hd590300_0 + - libnvfatbin=12.4.127=0 + - libnvfatbin-dev=12.4.127=0 + - libnvjitlink=12.4.127=0 + - libnvjitlink-dev=12.4.127=0 + - libnvjpeg=12.3.1.117=0 + - libnvjpeg-dev=12.3.1.117=0 + - libnvjpeg-static=12.3.1.117=0 + - libopenvino=2024.4.0=hac27bb2_2 + - libopenvino-auto-batch-plugin=2024.4.0=h4d9b6c2_2 + - libopenvino-auto-plugin=2024.4.0=h4d9b6c2_2 + - libopenvino-hetero-plugin=2024.4.0=h3f63f65_2 + - libopenvino-intel-cpu-plugin=2024.4.0=hac27bb2_2 + - libopenvino-intel-gpu-plugin=2024.4.0=hac27bb2_2 + - libopenvino-intel-npu-plugin=2024.4.0=hac27bb2_2 + - libopenvino-ir-frontend=2024.4.0=h3f63f65_2 + - libopenvino-onnx-frontend=2024.4.0=h5c8f2c3_2 + - libopenvino-paddle-frontend=2024.4.0=h5c8f2c3_2 + - libopenvino-pytorch-frontend=2024.4.0=h5888daf_2 + - libopenvino-tensorflow-frontend=2024.4.0=h6481b9d_2 + - libopenvino-tensorflow-lite-frontend=2024.4.0=h5888daf_2 + - libopus=1.3.1=h5eee18b_1 + - libpciaccess=0.18=hd590300_0 + - libpng=1.6.44=hadc24fc_0 + - libprotobuf=5.28.2=h5b01275_0 + - librsvg=2.58.4=hc0ffecb_0 + - libsolv=0.7.30=h3509ff9_0 + - libsqlite=3.47.0=hadc24fc_1 + - libssh2=1.11.0=h0841786_0 + - libstdcxx=14.2.0=hc0a3c3a_1 + - 
libstdcxx-ng=14.2.0=h4852527_1 + - libtasn1=4.19.0=h5eee18b_0 + - libtiff=4.7.0=he137b08_1 + - libtorch=2.4.1=cuda118_h232d35b_303 + - libunistring=0.9.10=h27cfd23_0 + - libuuid=2.38.1=h0b41bf4_0 + - libuv=1.49.2=hb9d3cd8_0 + - libva=2.22.0=h8a09558_1 + - libvpx=1.14.1=hac33072_0 + - libwebp=1.4.0=h2c329e2_0 + - libwebp-base=1.4.0=hd590300_0 + - libxcb=1.17.0=h8a09558_0 + - libxcrypt=4.4.36=hd590300_1 + - libxml2=2.13.4=hb346dea_2 + - libzlib=1.3.1=hb9d3cd8_2 + - llvm-openmp=19.1.3=h024ca30_0 + - lz4-c=1.9.4=h6a678d5_1 + - lzo=2.10=hd590300_1001 + - markupsafe=2.1.3=py312h5eee18b_0 + - menuinst=2.1.2=py312h06a4308_0 + - mkl=2023.2.0=h84fe81f_50496 + - mkl-service=2.4.0=py312h5eee18b_1 + - mkl_fft=1.3.11=py312h5eee18b_0 + - mkl_random=1.2.8=py312h526ad5a_0 + - mpmath=1.3.0=py312h06a4308_0 + - nccl=2.23.4.1=h03a54cd_2 + - ncurses=6.5=he02047a_1 + - nettle=3.7.3=hbbd107a_1 + - networkx=3.2.1=py312h06a4308_0 + - nsight-compute=2024.1.1.4=0 + - numpy=2.1.3=py312hc5e2394_0 + - numpy-base=2.1.3=py312h0da6c21_0 + - ocl-icd=2.3.2=hd590300_1 + - openfst=1.8.3=h84d6215_3 + - openh264=2.4.1=h59595ed_0 + - openjpeg=2.5.2=he7f1fd0_0 + - openssl=3.3.2=hb9d3cd8_0 + - packaging=24.1=py312h06a4308_0 + - pango=1.54.0=h4c5309f_1 + - pcre2=10.44=hba22ea6_2 + - pillow=11.0.0=py312h7b63e92_0 + - pip=24.2=py312h06a4308_0 + - pixman=0.43.2=h59595ed_0 + - pkg-config=0.29.2=h1bed415_8 + - platformdirs=3.10.0=py312h06a4308_0 + - pluggy=1.0.0=py312h06a4308_1 + - pthread-stubs=0.4=hb9d3cd8_1002 + - pugixml=1.14=h59595ed_0 + - pybind11-abi=5=hd3eb1b0_0 + - pycosat=0.6.6=py312h5eee18b_1 + - pycparser=2.21=pyhd3eb1b0_0 + - pysocks=1.7.1=py312h06a4308_0 + - python=3.12.7=hc5c86c4_0_cpython + - python_abi=3.12=5_cp312 + - pytorch=2.4.1=cuda118_py312h02e3f75_303 + - pytorch-cuda=12.4=hc786d27_7 + - pytorch-mutex=1.0=cpu + - pyyaml=6.0.2=py312h5eee18b_0 + - readline=8.2=h5eee18b_0 + - reproc=14.2.4=h6a678d5_2 + - reproc-cpp=14.2.4=h6a678d5_2 + - requests=2.32.3=py312h06a4308_0 + - rhash=1.4.5=hb9d3cd8_0 + - ruamel.yaml=0.18.6=py312h5eee18b_0 + - ruamel.yaml.clib=0.2.8=py312h5eee18b_0 + - setuptools=72.1.0=py312h06a4308_0 + - sleef=3.7=h1b44611_0 + - snappy=1.2.1=ha2e4443_0 + - sqlite=3.47.0=h9eae976_1 + - svt-av1=2.3.0=h5888daf_0 + - sympy=1.13.2=py312h06a4308_0 + - sysroot_linux-64=2.17=h4a8ded7_18 + - tbb=2021.13.0=h84d6215_0 + - tk=8.6.13=noxft_h4845f30_101 + - torchaudio=2.4.1=cuda_118py312h3b1587d_1 + - torchvision=0.19.1=cuda118py312h9250042_1 + - tqdm=4.66.5=py312he106c6f_0 + - truststore=0.8.0=py312h06a4308_0 + - typing_extensions=4.11.0=py312h06a4308_0 + - tzdata=2024b=h04d1e81_0 + - urllib3=2.2.3=py312h06a4308_0 + - wayland=1.23.1=h3e06ad9_0 + - wayland-protocols=1.37=hd8ed1ab_0 + - wheel=0.44.0=py312h06a4308_0 + - x264=1!164.3095=h166bdaf_2 + - x265=3.5=h924138e_3 + - xorg-libice=1.1.1=hb9d3cd8_1 + - xorg-libsm=1.2.4=he73a12e_1 + - xorg-libx11=1.8.10=h4f16b4b_0 + - xorg-libxau=1.0.11=hb9d3cd8_1 + - xorg-libxdmcp=1.1.5=hb9d3cd8_0 + - xorg-libxext=1.3.6=hb9d3cd8_0 + - xorg-libxfixes=6.0.1=hb9d3cd8_0 + - xorg-libxrender=0.9.11=hb9d3cd8_1 + - xorg-xorgproto=2024.1=hb9d3cd8_1 + - xz=5.4.6=h5eee18b_1 + - yaml=0.2.5=h7b6447c_0 + - yaml-cpp=0.8.0=h6a678d5_1 + - zlib=1.3.1=hb9d3cd8_2 + - zstandard=0.23.0=py312h2c38b39_0 + - zstd=1.5.6=ha6fb4c9_0 From 4cb95a24fba6e42a36f2131bdcacd3b2db95dbec Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Wed, 6 Nov 2024 12:59:35 -0800 Subject: [PATCH 32/68] . 
--- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index b4eadcac..e3bd02c9 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -24,11 +24,11 @@ jobs: miniconda-version: "latest" activate-environment: test python-version: '3.12' + environment-file: packaging/cuda12.4.yaml - name: Update pip run: python -m pip install --upgrade pip - name: Install dependencies and FFmpeg run: | - conda env update --file packaging/cuda12.4.yaml ffmpeg -version - name: Build and install torchcodec run: | From 4055346774de7f0f149b0f5dcb04ae4e4b789571 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Wed, 6 Nov 2024 13:09:41 -0800 Subject: [PATCH 33/68] . --- examples/cuda_example.py | 175 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 examples/cuda_example.py diff --git a/examples/cuda_example.py b/examples/cuda_example.py new file mode 100644 index 00000000..4fd72e72 --- /dev/null +++ b/examples/cuda_example.py @@ -0,0 +1,175 @@ +""" +Accelerated video decoding with NVDEC +===================================== + +.. _nvdec_tutorial: + +**Author**: `Ahmad Sharif `__ + +This tutorial shows how to use NVIDIA’s hardware video decoder (NVDEC) +with TorchCodec, and how it improves the performance of video decoding. +""" + +###################################################################### +# +# .. note:: +# +# This tutorial requires FFmpeg libraries compiled with HW +# acceleration enabled. +# +# Please refer to +# :ref:`Enabling GPU video decoder/encoder ` +# for how to build FFmpeg with HW acceleration. +# + +import torch + +print(torch.__version__) +print(torchaudio.__version__) + +###################################################################### +# + +import matplotlib.pyplot as plt +from torchcodec import VideoDecoder + +print("Avaialbe GPU:") +print(torch.cuda.get_device_properties(0)) + +###################################################################### +# +# We will use the following video which has the following properties; +# +# - Codec: H.264 +# - Resolution: 960x540 +# - FPS: 29.97 +# - Pixel format: YUV420P +# +# .. raw:: html +# +# + +###################################################################### +# + +src = torchaudio.utils.download_asset( + "tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4" +) + +###################################################################### +# Decoding videos with NVDEC +# -------------------------- +# +# To use HW video decoder, you need to specify the HW decoder when +# defining the output video stream by passing ``decoder`` option to +# :py:meth:`~torchaudio.io.StreamReader.add_video_stream` method. +# + +vd = VideoDecoder(src) +vd.add_video_stream(0, device="cuda:0") +frame = vd[0] + +###################################################################### +# +# The video frames are decoded and returned as tensor of NCHW format. + +print(frame.data.shape, frame.data.dtype) + +###################################################################### +# +# By default, the decoded frames are sent back to CPU memory, and +# CPU tensors are created. + +print(frame.data.device) + + +###################################################################### +# .. note:: +# +# When there are multiple of GPUs available, ``StreamReader`` by +# default uses the first GPU. You can change this by providing +# ``"gpu"`` option. +# +# .. 
code:: +# +# # Video data is sent to CUDA device 0, decoded and +# # converted on the same device. +# s.add_video_stream( +# ..., +# decoder="h264_cuvid", +# decoder_option={"gpu": "0"}, +# hw_accel="cuda:0", +# ) +# +# .. note:: +# +# ``"gpu"`` option and ``hw_accel`` option can be specified +# independently. If they do not match, decoded frames are +# transfered to the device specified by ``hw_accell`` +# automatically. +# +# .. code:: +# +# # Video data is sent to CUDA device 0, and decoded there. +# # Then it is transfered to CUDA device 1, and converted to +# # CUDA tensor. +# s.add_video_stream( +# ..., +# decoder="h264_cuvid", +# decoder_option={"gpu": "0"}, +# hw_accel="cuda:1", +# ) + +###################################################################### +# Visualization +# ------------- +# +# Let's look at the frames decoded by HW decoder and compare them +# against equivalent results from software decoders. +# +# The following function seeks into the given timestamp and decode one +# frame with the specificed decoder. + + +def test_decode(decoder: str, seek: float): + vd = VideoDecoder(src) + return vd.get_frame_played_at(seek) + + +###################################################################### +# + +timestamps = [12, 19, 45, 131, 180] + +cpu_frames = [test_decode(decoder="h264", seek=ts) for ts in timestamps] +cuda_frames = [test_decode(decoder="h264_cuvid", seek=ts) for ts in timestamps] + + +###################################################################### +# +# Now we visualize the resutls. +# + + +def plot_cpu_and_cuda(): + n_rows = len(timestamps) + fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0]) + for i in range(n_rows): + axes[i][0].imshow(cpu_frames[i]) + axes[i][1].imshow(cuda_frames[i]) + + axes[0][0].set_title("Software decoder") + axes[0][1].set_title("HW decoder") + plt.setp(axes, xticks=[], yticks=[]) + plt.tight_layout() + + +plot_cpu_and_cuda() + +###################################################################### +# +# They are indistinguishable to the eyes of the author. +# Feel free to let us know if you spot something. :) +# From 63bbb9e581eb27b2953e9b4b39eae909ece31762 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Wed, 6 Nov 2024 14:30:49 -0800 Subject: [PATCH 34/68] . --- examples/cuda_example.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/cuda_example.py b/examples/cuda_example.py index 4fd72e72..7a339bbd 100644 --- a/examples/cuda_example.py +++ b/examples/cuda_example.py @@ -17,9 +17,6 @@ # This tutorial requires FFmpeg libraries compiled with HW # acceleration enabled. # -# Please refer to -# :ref:`Enabling GPU video decoder/encoder ` -# for how to build FFmpeg with HW acceleration. # import torch From 51e2308861abca4bbf0bfaa3a495d04faba7202a Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Wed, 6 Nov 2024 14:47:30 -0800 Subject: [PATCH 35/68] . --- examples/cuda_example.py | 46 ++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/examples/cuda_example.py b/examples/cuda_example.py index 7a339bbd..9d2b2aed 100644 --- a/examples/cuda_example.py +++ b/examples/cuda_example.py @@ -10,6 +10,7 @@ with TorchCodec, and how it improves the performance of video decoding. """ +# %% ###################################################################### # # .. note:: @@ -18,21 +19,14 @@ # acceleration enabled. 
# # - import torch print(torch.__version__) -print(torchaudio.__version__) - -###################################################################### -# - -import matplotlib.pyplot as plt -from torchcodec import VideoDecoder - -print("Avaialbe GPU:") +print(torch.cuda.is_available()) print(torch.cuda.get_device_properties(0)) + +# %% ###################################################################### # # We will use the following video which has the following properties; @@ -50,11 +44,16 @@ ###################################################################### # +import urllib.request -src = torchaudio.utils.download_asset( - "tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4" +video_file = "video.mp4" +urllib.request.urlretrieve( + "https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4", + video_file, ) + +# %% ###################################################################### # Decoding videos with NVDEC # -------------------------- @@ -63,25 +62,28 @@ # defining the output video stream by passing ``decoder`` option to # :py:meth:`~torchaudio.io.StreamReader.add_video_stream` method. # +from torchcodec import VideoDecoder -vd = VideoDecoder(src) +vd = VideoDecoder(video_file) vd.add_video_stream(0, device="cuda:0") frame = vd[0] +# %% ###################################################################### # # The video frames are decoded and returned as tensor of NCHW format. print(frame.data.shape, frame.data.dtype) +# %% ###################################################################### # -# By default, the decoded frames are sent back to CPU memory, and -# CPU tensors are created. +# The video frames are left on the GPU memory. print(frame.data.device) +# %% ###################################################################### # .. note:: # @@ -119,6 +121,7 @@ # hw_accel="cuda:1", # ) + ###################################################################### # Visualization # ------------- @@ -128,28 +131,20 @@ # # The following function seeks into the given timestamp and decode one # frame with the specificed decoder. +import matplotlib.pyplot as plt def test_decode(decoder: str, seek: float): - vd = VideoDecoder(src) + vd = VideoDecoder(video_file) return vd.get_frame_played_at(seek) -###################################################################### -# - timestamps = [12, 19, 45, 131, 180] cpu_frames = [test_decode(decoder="h264", seek=ts) for ts in timestamps] cuda_frames = [test_decode(decoder="h264_cuvid", seek=ts) for ts in timestamps] -###################################################################### -# -# Now we visualize the resutls. -# - - def plot_cpu_and_cuda(): n_rows = len(timestamps) fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0]) @@ -165,6 +160,7 @@ def plot_cpu_and_cuda(): plot_cpu_and_cuda() +# %% ###################################################################### # # They are indistinguishable to the eyes of the author. From a9269341e8ec71543533fc0869a944e160d1fdfd Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Wed, 6 Nov 2024 15:12:12 -0800 Subject: [PATCH 36/68] . 
--- docs/source/index.rst | 8 ++ examples/basic_example.py | 20 +++-- examples/cuda_example.py | 168 -------------------------------------- 3 files changed, 19 insertions(+), 177 deletions(-) delete mode 100644 examples/cuda_example.py diff --git a/docs/source/index.rst b/docs/source/index.rst index 1ce569f3..22024888 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -42,6 +42,14 @@ We achieve these capabilities through: A simple video decoding example + .. grid-item-card:: :octicon:`file-code;1em` + GPU decoding using TorchCodec + :img-top: _static/img/card-background.svg + :link: generated_examples/basic_cuda_example.html + :link-type: url + + A GPU decoding example + .. grid-item-card:: :octicon:`file-code;1em` API Reference :img-top: _static/img/card-background.svg diff --git a/examples/basic_example.py b/examples/basic_example.py index ba85b32f..645df5b0 100644 --- a/examples/basic_example.py +++ b/examples/basic_example.py @@ -19,8 +19,9 @@ # :ref:`creating_decoder`. from typing import Optional -import torch + import requests +import torch # Video source: https://www.pexels.com/video/dog-eating-854132/ @@ -33,16 +34,16 @@ raw_video_bytes = response.content -def plot(frames: torch.Tensor, title : Optional[str] = None): +def plot(frames: torch.Tensor, title: Optional[str] = None): try: - from torchvision.utils import make_grid - from torchvision.transforms.v2.functional import to_pil_image import matplotlib.pyplot as plt + from torchvision.transforms.v2.functional import to_pil_image + from torchvision.utils import make_grid except ImportError: print("Cannot plot, please run `pip install torchvision matplotlib`") return - plt.rcParams["savefig.bbox"] = 'tight' + plt.rcParams["savefig.bbox"] = "tight" fig, ax = plt.subplots() ax.imshow(to_pil_image(make_grid(frames))) ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) @@ -76,7 +77,7 @@ def plot(frames: torch.Tensor, title : Optional[str] = None): # --------------------------------------- first_frame = decoder[0] # using a single int index -every_twenty_frame = decoder[0 : -1 : 20] # using slices +every_twenty_frame = decoder[0:-1:20] # using slices print(f"{first_frame.shape = }") print(f"{first_frame.dtype = }") @@ -106,9 +107,10 @@ def plot(frames: torch.Tensor, title : Optional[str] = None): # The decoder is a normal iterable object and can be iterated over like so: for frame in decoder: - assert ( - isinstance(frame, torch.Tensor) - and frame.shape == (3, decoder.metadata.height, decoder.metadata.width) + assert isinstance(frame, torch.Tensor) and frame.shape == ( + 3, + decoder.metadata.height, + decoder.metadata.width, ) # %% diff --git a/examples/cuda_example.py b/examples/cuda_example.py deleted file mode 100644 index 9d2b2aed..00000000 --- a/examples/cuda_example.py +++ /dev/null @@ -1,168 +0,0 @@ -""" -Accelerated video decoding with NVDEC -===================================== - -.. _nvdec_tutorial: - -**Author**: `Ahmad Sharif `__ - -This tutorial shows how to use NVIDIA’s hardware video decoder (NVDEC) -with TorchCodec, and how it improves the performance of video decoding. -""" - -# %% -###################################################################### -# -# .. note:: -# -# This tutorial requires FFmpeg libraries compiled with HW -# acceleration enabled. 
-# -# -import torch - -print(torch.__version__) -print(torch.cuda.is_available()) -print(torch.cuda.get_device_properties(0)) - - -# %% -###################################################################### -# -# We will use the following video which has the following properties; -# -# - Codec: H.264 -# - Resolution: 960x540 -# - FPS: 29.97 -# - Pixel format: YUV420P -# -# .. raw:: html -# -# - -###################################################################### -# -import urllib.request - -video_file = "video.mp4" -urllib.request.urlretrieve( - "https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4", - video_file, -) - - -# %% -###################################################################### -# Decoding videos with NVDEC -# -------------------------- -# -# To use HW video decoder, you need to specify the HW decoder when -# defining the output video stream by passing ``decoder`` option to -# :py:meth:`~torchaudio.io.StreamReader.add_video_stream` method. -# -from torchcodec import VideoDecoder - -vd = VideoDecoder(video_file) -vd.add_video_stream(0, device="cuda:0") -frame = vd[0] - -# %% -###################################################################### -# -# The video frames are decoded and returned as tensor of NCHW format. - -print(frame.data.shape, frame.data.dtype) - -# %% -###################################################################### -# -# The video frames are left on the GPU memory. - -print(frame.data.device) - - -# %% -###################################################################### -# .. note:: -# -# When there are multiple of GPUs available, ``StreamReader`` by -# default uses the first GPU. You can change this by providing -# ``"gpu"`` option. -# -# .. code:: -# -# # Video data is sent to CUDA device 0, decoded and -# # converted on the same device. -# s.add_video_stream( -# ..., -# decoder="h264_cuvid", -# decoder_option={"gpu": "0"}, -# hw_accel="cuda:0", -# ) -# -# .. note:: -# -# ``"gpu"`` option and ``hw_accel`` option can be specified -# independently. If they do not match, decoded frames are -# transfered to the device specified by ``hw_accell`` -# automatically. -# -# .. code:: -# -# # Video data is sent to CUDA device 0, and decoded there. -# # Then it is transfered to CUDA device 1, and converted to -# # CUDA tensor. -# s.add_video_stream( -# ..., -# decoder="h264_cuvid", -# decoder_option={"gpu": "0"}, -# hw_accel="cuda:1", -# ) - - -###################################################################### -# Visualization -# ------------- -# -# Let's look at the frames decoded by HW decoder and compare them -# against equivalent results from software decoders. -# -# The following function seeks into the given timestamp and decode one -# frame with the specificed decoder. 
-import matplotlib.pyplot as plt - - -def test_decode(decoder: str, seek: float): - vd = VideoDecoder(video_file) - return vd.get_frame_played_at(seek) - - -timestamps = [12, 19, 45, 131, 180] - -cpu_frames = [test_decode(decoder="h264", seek=ts) for ts in timestamps] -cuda_frames = [test_decode(decoder="h264_cuvid", seek=ts) for ts in timestamps] - - -def plot_cpu_and_cuda(): - n_rows = len(timestamps) - fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0]) - for i in range(n_rows): - axes[i][0].imshow(cpu_frames[i]) - axes[i][1].imshow(cuda_frames[i]) - - axes[0][0].set_title("Software decoder") - axes[0][1].set_title("HW decoder") - plt.setp(axes, xticks=[], yticks=[]) - plt.tight_layout() - - -plot_cpu_and_cuda() - -# %% -###################################################################### -# -# They are indistinguishable to the eyes of the author. -# Feel free to let us know if you spot something. :) -# From 400001a4ff10878d2e0b36cb5dc4a4b3b32ca9df Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Wed, 6 Nov 2024 15:46:17 -0800 Subject: [PATCH 37/68] . --- examples/basic_cuda_example.py | 152 +++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 examples/basic_cuda_example.py diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py new file mode 100644 index 00000000..c84372d1 --- /dev/null +++ b/examples/basic_cuda_example.py @@ -0,0 +1,152 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +""" +Accelerated video decoding with NVDEC +===================================== + +.. _nvdec_tutorial: + +**Author**: `Ahmad Sharif `__ + +This tutorial shows how to use NVIDIA’s hardware video decoder (NVDEC) +with TorchCodec. This decoder is called CUDA decoder in the documentation +and APIs. + +To use the CUDA decoder, you have to have the following installed in your +environment: +* NVDEC-enabled FFMPEG +* libnpp +* CUDA-enabled pytorch + +FFMPEG versions 5, 6 and 7 from conda-forge are built with NVDEC support and +you can install them by running (for example to install ffmpeg version 7): + +.. code-block:: bash + + conda install ffmpeg=7 -c conda-forge + conda install libnpp -c nvidia +""" + +# %% +# +# .. note:: +# +# This tutorial requires FFmpeg libraries compiled with CUDA support. +# +# +import torch + +print(f"{torch.__version__=}") +print(f"{torch.cuda.is_available()=}") +print(f"{torch.cuda.get_device_properties(0)=}") + + +# %% +###################################################################### +# Downloading the video +###################################################################### +# +# We will use the following video which has the following properties; +# +# - Codec: H.264 +# - Resolution: 960x540 +# - FPS: 29.97 +# - Pixel format: YUV420P +# +# .. raw:: html +# +# +import urllib.request + +video_file = "video.mp4" +urllib.request.urlretrieve( + "https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4", + video_file, +) + + +# %% +###################################################################### +# Decoding with CUDA +###################################################################### +# +# To use CUDA decoder, you need to pass in a cuda device to the decoder. 
+# +from torchcodec.decoders import VideoDecoder + +vd = VideoDecoder(video_file, device="cuda:0") +frame = vd[0] + +# %% +# +# The video frames are decoded and returned as tensor of NCHW format. + +print(frame.data.shape, frame.data.dtype) + +# %% +# +# The video frames are left on the GPU memory. + +print(frame.data.device) + + +# %% +###################################################################### +# Visualizing Frames +###################################################################### +# +# Let's look at the frames decoded by CUDA decoder and compare them +# against equivalent results from the CPU decoders. +import matplotlib.pyplot as plt + + +def get_frames(timestamps: list[float], device: str): + decoder = VideoDecoder(video_file, device=device) + return [decoder.get_frame_played_at(ts) for ts in timestamps] + + +def get_numpy_images(frames): + numpy_images = [] + for frame in frames: + # We transfer to the CPU so they can be visualized by matplotlib. + numpy_image = frame.data.to("cpu").permute(1, 2, 0).numpy() + numpy_images.append(numpy_image) + return numpy_images + + +timestamps = [12, 19, 45, 131, 180] +cpu_frames = get_frames(timestamps, device="cpu") +cuda_frames = get_frames(timestamps, device="cuda:0") +cpu_numpy_images = get_numpy_images(cpu_frames) +cuda_numpy_images = get_numpy_images(cuda_frames) + + +def plot_cpu_and_cuda(): + n_rows = len(timestamps) + fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0]) + for i in range(n_rows): + axes[i][0].imshow(cpu_numpy_images[i]) + axes[i][1].imshow(cuda_numpy_images[i]) + + axes[0][0].set_title("CPU decoder") + axes[0][1].set_title("CUDA decoder") + plt.setp(axes, xticks=[], yticks=[]) + plt.tight_layout() + + +plot_cpu_and_cuda() + +# %% +# +# They look visually similar to the human eye but there may be subtle +# differences because CUDA math is not bit-exact to CPU math. +# +first_cpu_frame = cpu_frames[0].data.to("cpu") +first_cuda_frame = cuda_frames[0].data.to("cpu") +frames_equal = torch.equal(first_cpu_frame, first_cuda_frame) +print(f"{frames_equal=}") From ccf95daabb89375fe55947273d412a94afea3c4b Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 08:21:03 -0800 Subject: [PATCH 38/68] . --- examples/basic_cuda_example.py | 38 +++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index c84372d1..9088d069 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -11,15 +11,33 @@ **Author**: `Ahmad Sharif `__ -This tutorial shows how to use NVIDIA’s hardware video decoder (NVDEC) -with TorchCodec. This decoder is called CUDA decoder in the documentation -and APIs. +TorchCodec can use Nvidia hardware to speed-up video decoding. An additional benefit +of doing decoding on the GPU is that the decoded tensor is left on GPU memory to +benefit from subsequent GPU transforms like scaling or cropping. In this tutorial this +Nvidia-GPU-accelerated decoding is called "CUDA Decoding". + +CUDA Decoding can offer speed-up over CPU Decoding in a few scenarios: + +#. You are deocding a batch of videos that is saturating the CPU +#. You want to do heavy transforms on the decoded tensors after decoding +#. You want to free up the CPU to do other work + +In some scenarios CUDA Decoding can be slower than CPU Decoding, example: + +#. If your GPU is already busy and CPU is not +#. If you have small resolution videos and the PCI-e transfer latency is large +#. 
You want bit-exact results compared to CPU Decoding + +It's best to experiment with CUDA Decoding to see if it improves your use-case. With +TorchCodec you can simply pass in a device parameter to the VideoDecoder class to +use CUDA Decoding. + +In order use CUDA Decoding will need the following installed in your environment: + +#. CUDA-enabled pytorch +#. FFMPEG binaries that support NVDEC-enabled codecs +#. libnpp -To use the CUDA decoder, you have to have the following installed in your -environment: -* NVDEC-enabled FFMPEG -* libnpp -* CUDA-enabled pytorch FFMPEG versions 5, 6 and 7 from conda-forge are built with NVDEC support and you can install them by running (for example to install ffmpeg version 7): @@ -126,7 +144,7 @@ def get_numpy_images(frames): cuda_numpy_images = get_numpy_images(cuda_frames) -def plot_cpu_and_cuda(): +def plot_cpu_and_cuda_images(): n_rows = len(timestamps) fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0]) for i in range(n_rows): @@ -139,7 +157,7 @@ def plot_cpu_and_cuda(): plt.tight_layout() -plot_cpu_and_cuda() +plot_cpu_and_cuda_images() # %% # From 209e746b6c8daaa2d349984a9bd71927aae881e6 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 08:28:05 -0800 Subject: [PATCH 39/68] . --- examples/basic_cuda_example.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index 9088d069..81b28adb 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -11,16 +11,20 @@ **Author**: `Ahmad Sharif `__ -TorchCodec can use Nvidia hardware to speed-up video decoding. An additional benefit -of doing decoding on the GPU is that the decoded tensor is left on GPU memory to -benefit from subsequent GPU transforms like scaling or cropping. In this tutorial this -Nvidia-GPU-accelerated decoding is called "CUDA Decoding". +TorchCodec can use Nvidia hardware to speed-up video decoding. This is called "CUDA Decoding". +CUDA Decoding can be faster than CPU Decoding for the actual decoding step and for +subsequent transform steps like scaling, cropping or rotating. This is because the decode step leaves +the decoded tensor in GPU memory so the GPU doesn't have to fetch from main memory before +running the transform steps. Encoded packets are often much smaller than decoded frames so +CUDA decoding also uses less PCI-e bandwidth. CUDA Decoding can offer speed-up over CPU Decoding in a few scenarios: -#. You are deocding a batch of videos that is saturating the CPU -#. You want to do heavy transforms on the decoded tensors after decoding -#. You want to free up the CPU to do other work +#. You are decoding a large resolution video +#. You are decoding a large batch of videos that's saturting the CPU +#. You want to do whole-image transforms like scaling or convolutions on the decoded tensors + after decoding +#. Your CPU is saturated and you want to free it up for other work In some scenarios CUDA Decoding can be slower than CPU Decoding, example: From 0a8ae5fcd538897336b6692811dea760a89ffefd Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 10:05:37 -0800 Subject: [PATCH 40/68] . 
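A minimal sketch of the usage described above, passing a device to ``VideoDecoder`` so frames are decoded on the GPU. It assumes the tutorial's local ``video.mp4`` and a CUDA-capable environment; the CPU fallback is added here only for illustration and is not taken from the example file:

    # Sketch: pass a CUDA device to VideoDecoder so frames are decoded on,
    # and stay in, GPU memory. Assumes a local "video.mp4".
    import torch

    from torchcodec.decoders import VideoDecoder

    device = "cuda" if torch.cuda.is_available() else "cpu"
    decoder = VideoDecoder("video.mp4", device=device)

    frame = decoder[0]  # first frame as a CHW tensor
    print(frame.data.shape, frame.data.dtype, frame.data.device)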
--- .github/workflows/docs.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index e3bd02c9..16295292 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -24,15 +24,15 @@ jobs: miniconda-version: "latest" activate-environment: test python-version: '3.12' - environment-file: packaging/cuda12.4.yaml - name: Update pip run: python -m pip install --upgrade pip - - name: Install dependencies and FFmpeg + - name: Install torchcodec from nightly run: | - ffmpeg -version - - name: Build and install torchcodec + pip3 install --pre torch torchvision torchaudio torchcodec --index-url https://download.pytorch.org/whl/nightly/cu124 + - name: Install FFMPEG run: | - ENABLE_CUDA=1 python -m pip install -e ".[dev]" --no-build-isolation -vvv + conda install ffmpeg=7 -c conda-forge + ffmpeg -version - name: Install doc dependencies run: | cd docs From 8864b30e8a66dc93f2189d4d81202feb08418272 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 10:13:16 -0800 Subject: [PATCH 41/68] . --- .github/workflows/docs.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 16295292..a4884456 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -29,8 +29,9 @@ jobs: - name: Install torchcodec from nightly run: | pip3 install --pre torch torchvision torchaudio torchcodec --index-url https://download.pytorch.org/whl/nightly/cu124 - - name: Install FFMPEG + - name: Install FFMPEG and other deps run: | + conda install cuda-nvrtc=12.4 -c nvidia conda install ffmpeg=7 -c conda-forge ffmpeg -version - name: Install doc dependencies From 936cbd10d71237f5587eb9a66ba0ce98c99c43d4 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 10:16:35 -0800 Subject: [PATCH 42/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index a4884456..4a98fe8f 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -31,7 +31,7 @@ jobs: pip3 install --pre torch torchvision torchaudio torchcodec --index-url https://download.pytorch.org/whl/nightly/cu124 - name: Install FFMPEG and other deps run: | - conda install cuda-nvrtc=12.4 -c nvidia + conda install cuda-nvrtc=12.4 libnpp -c nvidia conda install ffmpeg=7 -c conda-forge ffmpeg -version - name: Install doc dependencies From 49197b5324a41da5dbf06912bac1dcfce7dacdec Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 10:23:28 -0800 Subject: [PATCH 43/68] . --- docs/source/index.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index d26827ce..3e8ed8e7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -50,11 +50,11 @@ We achieve these capabilities through: How to sample video clips - .. grid-item-card:: :octicon:`file-code;1em` - GPU decoding using TorchCodec - :img-top: _static/img/card-background.svg - :link: generated_examples/basic_cuda_example.html - :link-type: url + .. 
grid-item-card:: :octicon:`file-code;1em` + GPU decoding using TorchCodec + :img-top: _static/img/card-background.svg + :link: generated_examples/basic_cuda_example.html + :link-type: url A simple example demonstrating Nvidia GPU decoding From 8291aa6e18689a77a5a706662886ec7374f20b3b Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 10:23:46 -0800 Subject: [PATCH 44/68] . --- packaging/cuda12.4.yaml | 289 ---------------------------------------- 1 file changed, 289 deletions(-) delete mode 100644 packaging/cuda12.4.yaml diff --git a/packaging/cuda12.4.yaml b/packaging/cuda12.4.yaml deleted file mode 100644 index 7666f0cb..00000000 --- a/packaging/cuda12.4.yaml +++ /dev/null @@ -1,289 +0,0 @@ -name: cuda4 -channels: - - pytorch-nightly - - nvidia - - conda-forge - - defaults -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_kmp_llvm - - aom=3.9.1=hac33072_0 - - archspec=0.2.3=pyhd3eb1b0_0 - - blas=1.0=mkl - - boltons=23.0.0=py312h06a4308_0 - - brotli-python=1.0.9=py312h6a678d5_8 - - bzip2=1.0.8=h5eee18b_6 - - c-ares=1.34.2=heb4867d_0 - - ca-certificates=2024.9.24=h06a4308_0 - - cairo=1.18.0=hebfffa5_3 - - certifi=2024.8.30=py312h06a4308_0 - - cffi=1.17.1=py312h1fdaa30_0 - - charset-normalizer=3.3.2=pyhd3eb1b0_0 - - cmake=3.30.5=hf9cb763_0 - - conda=24.9.2=py312h7900ff3_0 - - conda-libmamba-solver=24.9.0=pyhd3eb1b0_0 - - conda-package-handling=2.3.0=py312h06a4308_0 - - conda-package-streaming=0.10.0=py312h06a4308_0 - - cuda=12.4.0=0 - - cuda-cccl=12.4.127=0 - - cuda-command-line-tools=12.4.1=0 - - cuda-compiler=12.6.2=0 - - cuda-cudart=12.4.127=0 - - cuda-cudart-dev=12.4.127=0 - - cuda-cudart-static=12.4.127=0 - - cuda-cuobjdump=12.4.127=0 - - cuda-cupti=12.4.127=0 - - cuda-cupti-static=12.4.127=0 - - cuda-cuxxfilt=12.4.127=0 - - cuda-demo-suite=12.4.127=0 - - cuda-documentation=12.4.127=0 - - cuda-driver-dev=12.4.127=0 - - cuda-gdb=12.4.127=0 - - cuda-libraries=12.4.1=0 - - cuda-libraries-dev=12.6.0=0 - - cuda-libraries-static=12.4.1=0 - - cuda-nsight=12.4.127=0 - - cuda-nvcc=12.4.131=0 - - cuda-nvdisasm=12.4.127=0 - - cuda-nvml-dev=12.4.127=0 - - cuda-nvprof=12.4.127=0 - - cuda-nvprune=12.4.127=0 - - cuda-nvrtc=12.4.127=0 - - cuda-nvrtc-dev=12.4.127=0 - - cuda-nvrtc-static=12.4.127=0 - - cuda-nvtx=12.4.127=0 - - cuda-nvvp=12.4.127=0 - - cuda-opencl=12.4.127=0 - - cuda-opencl-dev=12.4.127=0 - - cuda-profiler-api=12.4.127=0 - - cuda-runtime=12.4.0=0 - - cuda-sanitizer-api=12.4.127=0 - - cuda-toolkit=12.4.0=0 - - cuda-tools=12.4.1=0 - - cuda-version=11.8=h70ddcb2_3 - - cuda-visual-tools=12.6.0=0 - - cudatoolkit=11.8.0=h4ba93d1_13 - - cudnn=9.3.0.75=hc149ed2_0 - - dav1d=1.2.1=hd590300_0 - - distro=1.9.0=py312h06a4308_0 - - expat=2.6.3=h6a678d5_0 - - ffmpeg=7.1.0=gpl_h2e64a5a_503 - - filelock=3.13.1=py312h06a4308_0 - - fmt=9.1.0=hdb19cb5_1 - - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 - - font-ttf-inconsolata=3.000=h77eed37_0 - - font-ttf-source-code-pro=2.038=h77eed37_0 - - font-ttf-ubuntu=0.83=h77eed37_3 - - fontconfig=2.15.0=h7e30c49_1 - - fonts-conda-ecosystem=1=0 - - fonts-conda-forge=1=0 - - freetype=2.12.1=h267a509_2 - - fribidi=1.0.10=h36c2ea0_0 - - frozendict=2.4.2=py312h06a4308_0 - - fsspec=2024.10.0=pyhff2d567_0 - - gdk-pixbuf=2.42.12=hb9ae30d_0 - - gds-tools=1.9.1.3=0 - - giflib=5.2.2=h5eee18b_0 - - gmp=6.3.0=hac33072_2 - - gnutls=3.6.15=he1e5248_0 - - graphite2=1.3.14=h295c915_1 - - harfbuzz=9.0.0=hda332d3_1 - - icu=75.1=he02047a_0 - - idna=3.7=py312h06a4308_0 - - intel-openmp=2022.0.1=h06a4308_3633 - - jinja2=3.1.4=py312h06a4308_1 - - 
jsonpatch=1.33=py312h06a4308_1 - - jsonpointer=2.1=pyhd3eb1b0_0 - - kaldi=5.5.1112=cpu_hd7b63f8_5 - - kernel-headers_linux-64=3.10.0=he073ed8_18 - - keyutils=1.6.1=h166bdaf_0 - - krb5=1.21.3=h659f571_0 - - lame=3.100=h7b6447c_0 - - lcms2=2.16=hb7c19ff_0 - - ld_impl_linux-64=2.40=h12ee557_0 - - lerc=4.0.0=h27087fc_0 - - libabseil=20240722.0=cxx17_h5888daf_1 - - libarchive=3.7.4=hfca40fe_0 - - libass=0.17.3=h1dc1e6a_0 - - libblas=3.9.0=20_linux64_mkl - - libcblas=3.9.0=20_linux64_mkl - - libcublas=12.4.5.8=0 - - libcublas-dev=12.4.5.8=0 - - libcublas-static=12.4.5.8=0 - - libcufft=11.2.1.3=0 - - libcufft-dev=11.2.1.3=0 - - libcufft-static=11.2.1.3=0 - - libcufile=1.9.1.3=0 - - libcufile-dev=1.9.1.3=0 - - libcufile-static=1.9.1.3=0 - - libcurand=10.3.5.147=0 - - libcurand-dev=10.3.5.147=0 - - libcurand-static=10.3.5.147=0 - - libcurl=8.11.0=hbbe4b11_0 - - libcusolver=11.6.1.9=0 - - libcusolver-dev=11.6.1.9=0 - - libcusolver-static=11.6.1.9=0 - - libcusparse=12.3.1.170=0 - - libcusparse-dev=12.3.1.170=0 - - libcusparse-static=12.3.1.170=0 - - libdeflate=1.22=hb9d3cd8_0 - - libdrm=2.4.123=hb9d3cd8_0 - - libedit=3.1.20230828=h5eee18b_0 - - libegl=1.7.0=ha4b6fd6_1 - - libev=4.33=h7f8727e_1 - - libexpat=2.6.3=h5888daf_0 - - libffi=3.4.4=h6a678d5_1 - - libgcc=14.2.0=h77fa898_1 - - libgcc-ng=14.2.0=h69a702a_1 - - libgfortran=14.2.0=h69a702a_1 - - libgfortran5=14.2.0=hd5240d6_1 - - libgl=1.7.0=ha4b6fd6_1 - - libglib=2.82.2=h2ff4ddf_0 - - libglvnd=1.7.0=ha4b6fd6_1 - - libglx=1.7.0=ha4b6fd6_1 - - libgomp=14.2.0=h77fa898_1 - - libhwloc=2.11.1=default_hecaa2ac_1000 - - libiconv=1.17=hd590300_2 - - libidn2=2.3.4=h5eee18b_0 - - libjpeg-turbo=3.0.3=h5eee18b_0 - - liblapack=3.9.0=20_linux64_mkl - - liblapacke=3.9.0=20_linux64_mkl - - libmagma=2.8.0=hfdb99dd_0 - - libmagma_sparse=2.8.0=h9ddd185_0 - - libmamba=1.5.8=hfe524e5_3 - - libmambapy=1.5.8=py312h2dafd23_3 - - libnghttp2=1.64.0=h161d5f1_0 - - libnpp=12.2.5.30=0 - - libnpp-dev=12.2.5.30=0 - - libnpp-static=12.2.5.30=0 - - libnsl=2.0.1=hd590300_0 - - libnvfatbin=12.4.127=0 - - libnvfatbin-dev=12.4.127=0 - - libnvjitlink=12.4.127=0 - - libnvjitlink-dev=12.4.127=0 - - libnvjpeg=12.3.1.117=0 - - libnvjpeg-dev=12.3.1.117=0 - - libnvjpeg-static=12.3.1.117=0 - - libopenvino=2024.4.0=hac27bb2_2 - - libopenvino-auto-batch-plugin=2024.4.0=h4d9b6c2_2 - - libopenvino-auto-plugin=2024.4.0=h4d9b6c2_2 - - libopenvino-hetero-plugin=2024.4.0=h3f63f65_2 - - libopenvino-intel-cpu-plugin=2024.4.0=hac27bb2_2 - - libopenvino-intel-gpu-plugin=2024.4.0=hac27bb2_2 - - libopenvino-intel-npu-plugin=2024.4.0=hac27bb2_2 - - libopenvino-ir-frontend=2024.4.0=h3f63f65_2 - - libopenvino-onnx-frontend=2024.4.0=h5c8f2c3_2 - - libopenvino-paddle-frontend=2024.4.0=h5c8f2c3_2 - - libopenvino-pytorch-frontend=2024.4.0=h5888daf_2 - - libopenvino-tensorflow-frontend=2024.4.0=h6481b9d_2 - - libopenvino-tensorflow-lite-frontend=2024.4.0=h5888daf_2 - - libopus=1.3.1=h5eee18b_1 - - libpciaccess=0.18=hd590300_0 - - libpng=1.6.44=hadc24fc_0 - - libprotobuf=5.28.2=h5b01275_0 - - librsvg=2.58.4=hc0ffecb_0 - - libsolv=0.7.30=h3509ff9_0 - - libsqlite=3.47.0=hadc24fc_1 - - libssh2=1.11.0=h0841786_0 - - libstdcxx=14.2.0=hc0a3c3a_1 - - libstdcxx-ng=14.2.0=h4852527_1 - - libtasn1=4.19.0=h5eee18b_0 - - libtiff=4.7.0=he137b08_1 - - libtorch=2.4.1=cuda118_h232d35b_303 - - libunistring=0.9.10=h27cfd23_0 - - libuuid=2.38.1=h0b41bf4_0 - - libuv=1.49.2=hb9d3cd8_0 - - libva=2.22.0=h8a09558_1 - - libvpx=1.14.1=hac33072_0 - - libwebp=1.4.0=h2c329e2_0 - - libwebp-base=1.4.0=hd590300_0 - - libxcb=1.17.0=h8a09558_0 - - 
libxcrypt=4.4.36=hd590300_1 - - libxml2=2.13.4=hb346dea_2 - - libzlib=1.3.1=hb9d3cd8_2 - - llvm-openmp=19.1.3=h024ca30_0 - - lz4-c=1.9.4=h6a678d5_1 - - lzo=2.10=hd590300_1001 - - markupsafe=2.1.3=py312h5eee18b_0 - - menuinst=2.1.2=py312h06a4308_0 - - mkl=2023.2.0=h84fe81f_50496 - - mkl-service=2.4.0=py312h5eee18b_1 - - mkl_fft=1.3.11=py312h5eee18b_0 - - mkl_random=1.2.8=py312h526ad5a_0 - - mpmath=1.3.0=py312h06a4308_0 - - nccl=2.23.4.1=h03a54cd_2 - - ncurses=6.5=he02047a_1 - - nettle=3.7.3=hbbd107a_1 - - networkx=3.2.1=py312h06a4308_0 - - nsight-compute=2024.1.1.4=0 - - numpy=2.1.3=py312hc5e2394_0 - - numpy-base=2.1.3=py312h0da6c21_0 - - ocl-icd=2.3.2=hd590300_1 - - openfst=1.8.3=h84d6215_3 - - openh264=2.4.1=h59595ed_0 - - openjpeg=2.5.2=he7f1fd0_0 - - openssl=3.3.2=hb9d3cd8_0 - - packaging=24.1=py312h06a4308_0 - - pango=1.54.0=h4c5309f_1 - - pcre2=10.44=hba22ea6_2 - - pillow=11.0.0=py312h7b63e92_0 - - pip=24.2=py312h06a4308_0 - - pixman=0.43.2=h59595ed_0 - - pkg-config=0.29.2=h1bed415_8 - - platformdirs=3.10.0=py312h06a4308_0 - - pluggy=1.0.0=py312h06a4308_1 - - pthread-stubs=0.4=hb9d3cd8_1002 - - pugixml=1.14=h59595ed_0 - - pybind11-abi=5=hd3eb1b0_0 - - pycosat=0.6.6=py312h5eee18b_1 - - pycparser=2.21=pyhd3eb1b0_0 - - pysocks=1.7.1=py312h06a4308_0 - - python=3.12.7=hc5c86c4_0_cpython - - python_abi=3.12=5_cp312 - - pytorch=2.4.1=cuda118_py312h02e3f75_303 - - pytorch-cuda=12.4=hc786d27_7 - - pytorch-mutex=1.0=cpu - - pyyaml=6.0.2=py312h5eee18b_0 - - readline=8.2=h5eee18b_0 - - reproc=14.2.4=h6a678d5_2 - - reproc-cpp=14.2.4=h6a678d5_2 - - requests=2.32.3=py312h06a4308_0 - - rhash=1.4.5=hb9d3cd8_0 - - ruamel.yaml=0.18.6=py312h5eee18b_0 - - ruamel.yaml.clib=0.2.8=py312h5eee18b_0 - - setuptools=72.1.0=py312h06a4308_0 - - sleef=3.7=h1b44611_0 - - snappy=1.2.1=ha2e4443_0 - - sqlite=3.47.0=h9eae976_1 - - svt-av1=2.3.0=h5888daf_0 - - sympy=1.13.2=py312h06a4308_0 - - sysroot_linux-64=2.17=h4a8ded7_18 - - tbb=2021.13.0=h84d6215_0 - - tk=8.6.13=noxft_h4845f30_101 - - torchaudio=2.4.1=cuda_118py312h3b1587d_1 - - torchvision=0.19.1=cuda118py312h9250042_1 - - tqdm=4.66.5=py312he106c6f_0 - - truststore=0.8.0=py312h06a4308_0 - - typing_extensions=4.11.0=py312h06a4308_0 - - tzdata=2024b=h04d1e81_0 - - urllib3=2.2.3=py312h06a4308_0 - - wayland=1.23.1=h3e06ad9_0 - - wayland-protocols=1.37=hd8ed1ab_0 - - wheel=0.44.0=py312h06a4308_0 - - x264=1!164.3095=h166bdaf_2 - - x265=3.5=h924138e_3 - - xorg-libice=1.1.1=hb9d3cd8_1 - - xorg-libsm=1.2.4=he73a12e_1 - - xorg-libx11=1.8.10=h4f16b4b_0 - - xorg-libxau=1.0.11=hb9d3cd8_1 - - xorg-libxdmcp=1.1.5=hb9d3cd8_0 - - xorg-libxext=1.3.6=hb9d3cd8_0 - - xorg-libxfixes=6.0.1=hb9d3cd8_0 - - xorg-libxrender=0.9.11=hb9d3cd8_1 - - xorg-xorgproto=2024.1=hb9d3cd8_1 - - xz=5.4.6=h5eee18b_1 - - yaml=0.2.5=h7b6447c_0 - - yaml-cpp=0.8.0=h6a678d5_1 - - zlib=1.3.1=hb9d3cd8_2 - - zstandard=0.23.0=py312h2c38b39_0 - - zstd=1.5.6=ha6fb4c9_0 From 4e10d0b1e2b0799a204cbdbd52fb959516d21b9b Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 10:26:04 -0800 Subject: [PATCH 45/68] . --- examples/basic_cuda_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index 81b28adb..ba2f375d 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -40,7 +40,7 @@ #. CUDA-enabled pytorch #. FFMPEG binaries that support NVDEC-enabled codecs -#. libnpp +#. 
libnpp and nvrtc (these are usually installed when you install the full cuda-toolkit) FFMPEG versions 5, 6 and 7 from conda-forge are built with NVDEC support and @@ -49,7 +49,7 @@ .. code-block:: bash conda install ffmpeg=7 -c conda-forge - conda install libnpp -c nvidia + conda install libnpp cuda-nvrtc -c nvidia """ # %% From b90bc7f0d7c9caa663cbf6a2f80ac4ead1b4f5f0 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 10:27:45 -0800 Subject: [PATCH 46/68] . --- .github/workflows/cpp_tests.yaml | 2 +- .github/workflows/linux_cuda_wheel.yaml | 2 +- .github/workflows/macos_wheel.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cpp_tests.yaml b/.github/workflows/cpp_tests.yaml index 453f5bc2..b2b19a78 100644 --- a/.github/workflows/cpp_tests.yaml +++ b/.github/workflows/cpp_tests.yaml @@ -3,7 +3,7 @@ name: CPP tests on: push: branches: [ main ] - # pull_request: + pull_request: concurrency: group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml index 7bb57f76..915c5236 100644 --- a/.github/workflows/linux_cuda_wheel.yaml +++ b/.github/workflows/linux_cuda_wheel.yaml @@ -1,7 +1,7 @@ name: Build and test Linux CUDA wheels on: - #pull_request: + pull_request: push: branches: - nightly diff --git a/.github/workflows/macos_wheel.yaml b/.github/workflows/macos_wheel.yaml index 45ccdb4d..ef637194 100644 --- a/.github/workflows/macos_wheel.yaml +++ b/.github/workflows/macos_wheel.yaml @@ -1,7 +1,7 @@ name: Build and test MacOS wheel on: - #pull_request: + pull_request: push: branches: - nightly From 2ae49ac0670c5fd0afe7b95239ae256cb467a83e Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 10:29:55 -0800 Subject: [PATCH 47/68] . --- .github/workflows/linux_wheel.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux_wheel.yaml b/.github/workflows/linux_wheel.yaml index 5cc75c9a..38f25733 100644 --- a/.github/workflows/linux_wheel.yaml +++ b/.github/workflows/linux_wheel.yaml @@ -1,7 +1,7 @@ name: Build and test Linux wheel on: - #pull_request: + pull_request: push: branches: - nightly From f0444d48a35b515c936c211eb248bdfd51537030 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 11:52:20 -0800 Subject: [PATCH 48/68] . --- examples/basic_example.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/examples/basic_example.py b/examples/basic_example.py index 645df5b0..ba85b32f 100644 --- a/examples/basic_example.py +++ b/examples/basic_example.py @@ -19,9 +19,8 @@ # :ref:`creating_decoder`. 
from typing import Optional - -import requests import torch +import requests # Video source: https://www.pexels.com/video/dog-eating-854132/ @@ -34,16 +33,16 @@ raw_video_bytes = response.content -def plot(frames: torch.Tensor, title: Optional[str] = None): +def plot(frames: torch.Tensor, title : Optional[str] = None): try: - import matplotlib.pyplot as plt - from torchvision.transforms.v2.functional import to_pil_image from torchvision.utils import make_grid + from torchvision.transforms.v2.functional import to_pil_image + import matplotlib.pyplot as plt except ImportError: print("Cannot plot, please run `pip install torchvision matplotlib`") return - plt.rcParams["savefig.bbox"] = "tight" + plt.rcParams["savefig.bbox"] = 'tight' fig, ax = plt.subplots() ax.imshow(to_pil_image(make_grid(frames))) ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) @@ -77,7 +76,7 @@ def plot(frames: torch.Tensor, title: Optional[str] = None): # --------------------------------------- first_frame = decoder[0] # using a single int index -every_twenty_frame = decoder[0:-1:20] # using slices +every_twenty_frame = decoder[0 : -1 : 20] # using slices print(f"{first_frame.shape = }") print(f"{first_frame.dtype = }") @@ -107,10 +106,9 @@ def plot(frames: torch.Tensor, title: Optional[str] = None): # The decoder is a normal iterable object and can be iterated over like so: for frame in decoder: - assert isinstance(frame, torch.Tensor) and frame.shape == ( - 3, - decoder.metadata.height, - decoder.metadata.width, + assert ( + isinstance(frame, torch.Tensor) + and frame.shape == (3, decoder.metadata.height, decoder.metadata.width) ) # %% From 3d95977e5be4ba620be9f28a036957a67f3313ac Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 14:09:18 -0800 Subject: [PATCH 49/68] . --- examples/basic_cuda_example.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index ba2f375d..51a1b1f8 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -21,30 +21,30 @@ CUDA Decoding can offer speed-up over CPU Decoding in a few scenarios: #. You are decoding a large resolution video -#. You are decoding a large batch of videos that's saturting the CPU +#. You are decoding a large batch of videos that's saturating the CPU #. You want to do whole-image transforms like scaling or convolutions on the decoded tensors after decoding #. Your CPU is saturated and you want to free it up for other work -In some scenarios CUDA Decoding can be slower than CPU Decoding, example: +Here are situations where CUDA Decoding may not make sense: -#. If your GPU is already busy and CPU is not -#. If you have small resolution videos and the PCI-e transfer latency is large #. You want bit-exact results compared to CPU Decoding +#. If you have small resolution videos and the PCI-e transfer latency is large +#. If your GPU is already busy and CPU is not It's best to experiment with CUDA Decoding to see if it improves your use-case. With -TorchCodec you can simply pass in a device parameter to the VideoDecoder class to -use CUDA Decoding. +TorchCodec you can simply pass in a device parameter to the +:class:`~torchcodec.decoders.VideoDecoder` class to use CUDA Decoding. In order use CUDA Decoding will need the following installed in your environment: #. CUDA-enabled pytorch -#. FFMPEG binaries that support NVDEC-enabled codecs +#. FFmpeg binaries that support NVDEC-enabled codecs #. 
libnpp and nvrtc (these are usually installed when you install the full cuda-toolkit) -FFMPEG versions 5, 6 and 7 from conda-forge are built with NVDEC support and -you can install them by running (for example to install ffmpeg version 7): +FFmpeg versions 5, 6 and 7 from conda-forge are built with NVDEC support and you can +install them with conda. For example, to install FFmpeg version 7: .. code-block:: bash From 5cbccd0e6013fcbb0a12ca5934af49ff18d42d46 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 17:20:05 -0800 Subject: [PATCH 50/68] . --- examples/basic_cuda_example.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index 51a1b1f8..b4431155 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -53,6 +53,9 @@ """ # %% +###################################################################### +# Checking if Pytorch has CUDA enabled +###################################################################### # # .. note:: # From bf81cbe1b025e31111efb336522a2e62a38bbe85 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 17:20:26 -0800 Subject: [PATCH 51/68] . --- examples/basic_cuda_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index b4431155..c66aa008 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -97,7 +97,7 @@ # %% ###################################################################### -# Decoding with CUDA +# CUDA Decoding using VideoDecoder ###################################################################### # # To use CUDA decoder, you need to pass in a cuda device to the decoder. From 0ca9469dec17995e867b01d3c584f9ec402d100d Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 17:21:48 -0800 Subject: [PATCH 52/68] . --- examples/basic_cuda_example.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index c66aa008..179d870a 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -26,6 +26,7 @@ after decoding #. Your CPU is saturated and you want to free it up for other work + Here are situations where CUDA Decoding may not make sense: #. You want bit-exact results compared to CPU Decoding @@ -36,6 +37,7 @@ TorchCodec you can simply pass in a device parameter to the :class:`~torchcodec.decoders.VideoDecoder` class to use CUDA Decoding. + In order use CUDA Decoding will need the following installed in your environment: #. CUDA-enabled pytorch @@ -47,7 +49,7 @@ install them with conda. For example, to install FFmpeg version 7: .. code-block:: bash - + # These libraries are needed for CUDA decoding with TorchCodec conda install ffmpeg=7 -c conda-forge conda install libnpp cuda-nvrtc -c nvidia """ @@ -169,7 +171,7 @@ def plot_cpu_and_cuda_images(): # %% # # They look visually similar to the human eye but there may be subtle -# differences because CUDA math is not bit-exact to CPU math. +# differences because CUDA math is not bit-exact with respect to CPU math. # first_cpu_frame = cpu_frames[0].data.to("cpu") first_cuda_frame = cuda_frames[0].data.to("cpu") From 64a9ebd9d360bbf4627e1537b4d5e3ced16fb990 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Thu, 7 Nov 2024 17:22:12 -0800 Subject: [PATCH 53/68] . 
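Because the CUDA results are close to, but not bit-exact with, the CPU results, a tolerance-based comparison is usually more informative than ``torch.equal``. A sketch of such a check, assuming the tutorial's ``video.mp4``; the statistics shown are illustrative only:

    # Compare the first frame decoded on CPU and on CUDA. Exact equality is
    # not expected, so inspect the per-pixel differences instead.
    import torch

    from torchcodec.decoders import VideoDecoder

    cpu_frame = VideoDecoder("video.mp4", device="cpu")[0]
    cuda_frame = VideoDecoder("video.mp4", device="cuda")[0]

    diff = (cpu_frame.data.float() - cuda_frame.data.to("cpu").float()).abs()
    print(f"max abs diff:  {diff.max().item()}")
    print(f"mean abs diff: {diff.mean().item()}")
    # What counts as "close enough" is application-dependent.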
--- examples/basic_cuda_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index 179d870a..0915f377 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -12,7 +12,7 @@ **Author**: `Ahmad Sharif `__ TorchCodec can use Nvidia hardware to speed-up video decoding. This is called "CUDA Decoding". -CUDA Decoding can be faster than CPU Decoding for the actual decoding step and for +CUDA Decoding can be faster than CPU Decoding for the actual decoding step and also for subsequent transform steps like scaling, cropping or rotating. This is because the decode step leaves the decoded tensor in GPU memory so the GPU doesn't have to fetch from main memory before running the transform steps. Encoded packets are often much smaller than decoded frames so From 30d9be7d57e46de9b22841916422ae1c215f5e9c Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Fri, 8 Nov 2024 07:47:36 -0800 Subject: [PATCH 54/68] . --- examples/basic_cuda_example.py | 82 ++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index 0915f377..d3c9fa76 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -4,14 +4,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. """ -Accelerated video decoding with NVDEC +CUDA Decoding on Nvidia GPUs ===================================== -.. _nvdec_tutorial: +.. _ndecoderec_tutorial: -**Author**: `Ahmad Sharif `__ - -TorchCodec can use Nvidia hardware to speed-up video decoding. This is called "CUDA Decoding". +TorchCodec can use supported Nvidia hardware (see support matrix here +) to speed-up +video decoding. This is called "CUDA Decoding". CUDA Decoding can be faster than CPU Decoding for the actual decoding step and also for subsequent transform steps like scaling, cropping or rotating. This is because the decode step leaves the decoded tensor in GPU memory so the GPU doesn't have to fetch from main memory before @@ -30,34 +30,34 @@ Here are situations where CUDA Decoding may not make sense: #. You want bit-exact results compared to CPU Decoding -#. If you have small resolution videos and the PCI-e transfer latency is large -#. If your GPU is already busy and CPU is not +#. You have small resolution videos and the PCI-e transfer latency is large +#. Your GPU is already busy and CPU is not It's best to experiment with CUDA Decoding to see if it improves your use-case. With TorchCodec you can simply pass in a device parameter to the :class:`~torchcodec.decoders.VideoDecoder` class to use CUDA Decoding. -In order use CUDA Decoding will need the following installed in your environment: +In order to use CUDA Decoding will need the following installed in your environment: -#. CUDA-enabled pytorch -#. FFmpeg binaries that support NVDEC-enabled codecs +#. An Nvidia GPU that supports decoding the video format you want to decode. See + the support matrix here +#. `CUDA-enabled pytorch `_ +#. FFmpeg binaries that support NdecoderEC-enabled codecs #. libnpp and nvrtc (these are usually installed when you install the full cuda-toolkit) -FFmpeg versions 5, 6 and 7 from conda-forge are built with NVDEC support and you can +FFmpeg versions 5, 6 and 7 from conda-forge are built with NdecoderEC support and you can install them with conda. 
For example, to install FFmpeg version 7: .. code-block:: bash - # These libraries are needed for CUDA decoding with TorchCodec conda install ffmpeg=7 -c conda-forge conda install libnpp cuda-nvrtc -c nvidia """ # %% -###################################################################### # Checking if Pytorch has CUDA enabled -###################################################################### +# ------------------------------------- # # .. note:: # @@ -72,9 +72,8 @@ # %% -###################################################################### # Downloading the video -###################################################################### +# ------------------------------------- # # We will use the following video which has the following properties; # @@ -98,16 +97,15 @@ # %% -###################################################################### # CUDA Decoding using VideoDecoder -###################################################################### +# ------------------------------------- # # To use CUDA decoder, you need to pass in a cuda device to the decoder. # from torchcodec.decoders import VideoDecoder -vd = VideoDecoder(video_file, device="cuda:0") -frame = vd[0] +decoder = VideoDecoder(video_file, device="cuda") +frame = decoder[0] # %% # @@ -123,13 +121,11 @@ # %% -###################################################################### # Visualizing Frames -###################################################################### +# ------------------------------------- # # Let's look at the frames decoded by CUDA decoder and compare them # against equivalent results from the CPU decoders. -import matplotlib.pyplot as plt def get_frames(timestamps: list[float], device: str): @@ -148,25 +144,43 @@ def get_numpy_images(frames): timestamps = [12, 19, 45, 131, 180] cpu_frames = get_frames(timestamps, device="cpu") -cuda_frames = get_frames(timestamps, device="cuda:0") +cuda_frames = get_frames(timestamps, device="cuda") cpu_numpy_images = get_numpy_images(cpu_frames) cuda_numpy_images = get_numpy_images(cuda_frames) -def plot_cpu_and_cuda_images(): - n_rows = len(timestamps) - fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0]) - for i in range(n_rows): - axes[i][0].imshow(cpu_numpy_images[i]) - axes[i][1].imshow(cuda_numpy_images[i]) +def plot( + frames1: List[torch.Tensor], + frames2: List[torch.Tensor], + title1: Optional[str] = None, + title2: Optional[str] = None, +): + try: + import matplotlib.pyplot as plt + from torchvision.transforms.v2.functional import to_pil_image + from torchvision.utils import make_grid + except ImportError: + print("Cannot plot, please run `pip install torchvision matplotlib`") + return + + plt.rcParams["savefig.bbox"] = "tight" + + fig, ax = plt.subplots(1, 2) + + ax[0].imshow(to_pil_image(make_grid(frames1))) + ax[0].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) + if title1 is not None: + ax[0].set_title(title1) + + ax[1].imshow(to_pil_image(make_grid(frames2))) + ax[1].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) + if title2 is not None: + ax[1].set_title(title2) - axes[0][0].set_title("CPU decoder") - axes[0][1].set_title("CUDA decoder") - plt.setp(axes, xticks=[], yticks=[]) plt.tight_layout() -plot_cpu_and_cuda_images() +plot(cpu_frames, cuda_frames, "CPU decoder", "CUDA decoder") # %% # From c91e73cada98f31a1d3fd7ecca8a7b8d8941d542 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Fri, 8 Nov 2024 07:59:30 -0800 Subject: [PATCH 55/68] . 
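The point above about frames staying in GPU memory for subsequent transforms can be illustrated with torchvision (not required by the tutorial itself). This sketch resizes a CUDA-decoded frame without copying it back to the CPU, again assuming ``video.mp4``:

    # Resize a CUDA-decoded frame directly on the GPU; the decoded tensor is
    # never copied to host memory before the transform runs.
    from torchvision.transforms.v2 import functional as F

    from torchcodec.decoders import VideoDecoder

    decoder = VideoDecoder("video.mp4", device="cuda")
    frame = decoder[0]

    resized = F.resize(frame.data, size=[270, 480])  # half resolution, still on the GPU
    print(resized.shape, resized.device)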
--- examples/basic_cuda_example.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index d3c9fa76..12ce9b79 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -126,6 +126,7 @@ # # Let's look at the frames decoded by CUDA decoder and compare them # against equivalent results from the CPU decoders. +from typing import List, Optional def get_frames(timestamps: list[float], device: str): From 0f502105213ac70624918341b6d6712b04bc4ef8 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Fri, 8 Nov 2024 09:01:59 -0800 Subject: [PATCH 56/68] . --- examples/basic_cuda_example.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index 12ce9b79..f0af8742 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -50,9 +50,13 @@ FFmpeg versions 5, 6 and 7 from conda-forge are built with NdecoderEC support and you can install them with conda. For example, to install FFmpeg version 7: + .. code-block:: bash - conda install ffmpeg=7 -c conda-forge - conda install libnpp cuda-nvrtc -c nvidia + + conda install ffmpeg=7 -c conda-forge + conda install libnpp cuda-nvrtc -c nvidia + + """ # %% @@ -146,6 +150,8 @@ def get_numpy_images(frames): timestamps = [12, 19, 45, 131, 180] cpu_frames = get_frames(timestamps, device="cpu") cuda_frames = get_frames(timestamps, device="cuda") +cpu_tensors = [frame.data for frame in cpu_frames] +cuda_tensors = [frame.data for frame in cuda_frames] cpu_numpy_images = get_numpy_images(cpu_frames) cuda_numpy_images = get_numpy_images(cuda_frames) @@ -168,12 +174,12 @@ def plot( fig, ax = plt.subplots(1, 2) - ax[0].imshow(to_pil_image(make_grid(frames1))) + ax[0].imshow(to_pil_image(make_grid(frames1, nrow=1))) ax[0].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) if title1 is not None: ax[0].set_title(title1) - ax[1].imshow(to_pil_image(make_grid(frames2))) + ax[1].imshow(to_pil_image(make_grid(frames2, nrow=1))) ax[1].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) if title2 is not None: ax[1].set_title(title2) @@ -181,7 +187,7 @@ def plot( plt.tight_layout() -plot(cpu_frames, cuda_frames, "CPU decoder", "CUDA decoder") +plot(cpu_tensors, cuda_tensors, "CPU decoder", "CUDA decoder") # %% # From f8d5e691559f5b9a592e1b71caa579ae9976c6a6 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Fri, 8 Nov 2024 09:28:18 -0800 Subject: [PATCH 57/68] . --- examples/basic_cuda_example.py | 44 ++++++++++------------------------ 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index f0af8742..95848478 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -130,7 +130,7 @@ # # Let's look at the frames decoded by CUDA decoder and compare them # against equivalent results from the CPU decoders. 
-from typing import List, Optional +import matplotlib.pyplot as plt def get_frames(timestamps: list[float], device: str): @@ -149,45 +149,25 @@ def get_numpy_images(frames): timestamps = [12, 19, 45, 131, 180] cpu_frames = get_frames(timestamps, device="cpu") -cuda_frames = get_frames(timestamps, device="cuda") -cpu_tensors = [frame.data for frame in cpu_frames] -cuda_tensors = [frame.data for frame in cuda_frames] +cuda_frames = get_frames(timestamps, device="cuda:0") cpu_numpy_images = get_numpy_images(cpu_frames) cuda_numpy_images = get_numpy_images(cuda_frames) -def plot( - frames1: List[torch.Tensor], - frames2: List[torch.Tensor], - title1: Optional[str] = None, - title2: Optional[str] = None, -): - try: - import matplotlib.pyplot as plt - from torchvision.transforms.v2.functional import to_pil_image - from torchvision.utils import make_grid - except ImportError: - print("Cannot plot, please run `pip install torchvision matplotlib`") - return - - plt.rcParams["savefig.bbox"] = "tight" - - fig, ax = plt.subplots(1, 2) - - ax[0].imshow(to_pil_image(make_grid(frames1, nrow=1))) - ax[0].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) - if title1 is not None: - ax[0].set_title(title1) - - ax[1].imshow(to_pil_image(make_grid(frames2, nrow=1))) - ax[1].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) - if title2 is not None: - ax[1].set_title(title2) +def plot_cpu_and_cuda_images(): + n_rows = len(timestamps) + fig, axes = plt.subplots(n_rows, 2, figsize=[12.8, 16.0]) + for i in range(n_rows): + axes[i][0].imshow(cpu_numpy_images[i]) + axes[i][1].imshow(cuda_numpy_images[i]) + axes[0][0].set_title("CPU decoder") + axes[0][1].set_title("CUDA decoder") + plt.setp(axes, xticks=[], yticks=[]) plt.tight_layout() -plot(cpu_tensors, cuda_tensors, "CPU decoder", "CUDA decoder") +plot_cpu_and_cuda_images() # %% # From 5a4291af45485ea775e1a4e3084a633d7e1485b4 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Fri, 8 Nov 2024 13:21:44 -0800 Subject: [PATCH 58/68] . --- .github/workflows/docs.yaml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 4a98fe8f..d8a6499c 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -1,9 +1,10 @@ name: Docs on: - push: - branches: [ main ] - pull_request: + workflow_run: + workflows: ["Build and test Linux CUDA wheels"] + types: + - completed defaults: run: @@ -26,9 +27,11 @@ jobs: python-version: '3.12' - name: Update pip run: python -m pip install --upgrade pip - - name: Install torchcodec from nightly + - name: Install torchcodec from the wheel run: | - pip3 install --pre torch torchvision torchaudio torchcodec --index-url https://download.pytorch.org/whl/nightly/cu124 + wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"` + echo Installing $wheel_path + ${CONDA_RUN} python -m pip install $wheel_path -vvv - name: Install FFMPEG and other deps run: | conda install cuda-nvrtc=12.4 libnpp -c nvidia From af3f684bef5ced7bcd22346011879ca4a61c2461 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Fri, 8 Nov 2024 13:32:32 -0800 Subject: [PATCH 59/68] . 
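Besides plotting, frames fetched at those timestamps can be stacked into a single batch that never leaves the GPU, which is convenient when they feed straight into a CUDA model. A sketch using the tutorial's timestamps and ``video.mp4``; the batching step is illustrative and not part of the example file:

    # Stack frames decoded at several presentation timestamps into one
    # NCHW batch that remains in GPU memory.
    import torch

    from torchcodec.decoders import VideoDecoder

    decoder = VideoDecoder("video.mp4", device="cuda")

    timestamps = [12, 19, 45, 131, 180]  # seconds, as in the tutorial
    frames = [decoder.get_frame_played_at(ts).data for ts in timestamps]

    batch = torch.stack(frames)  # (5, C, H, W), on the CUDA device
    print(batch.shape, batch.dtype, batch.device)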
--- examples/basic_cuda_example.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index 95848478..4936f2d8 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -4,14 +4,15 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. """ -CUDA Decoding on Nvidia GPUs -===================================== +Accelerated video decoding on GPUs with CUDA and NVDEC +================================================================ .. _ndecoderec_tutorial: TorchCodec can use supported Nvidia hardware (see support matrix here ) to speed-up -video decoding. This is called "CUDA Decoding". +video decoding. This is called "CUDA Decoding" and it uses Nvidia's NVDEC hardware +decoder and CUDA kernels to respectively decompress and convert to RGB. CUDA Decoding can be faster than CPU Decoding for the actual decoding step and also for subsequent transform steps like scaling, cropping or rotating. This is because the decode step leaves the decoded tensor in GPU memory so the GPU doesn't have to fetch from main memory before From 891125bcba26fd40575e51641b574ce3b148e007 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Fri, 8 Nov 2024 13:35:06 -0800 Subject: [PATCH 60/68] . --- examples/basic_cuda_example.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index 4936f2d8..4679251f 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -9,10 +9,11 @@ .. _ndecoderec_tutorial: -TorchCodec can use supported Nvidia hardware (see support matrix here -) to speed-up -video decoding. This is called "CUDA Decoding" and it uses Nvidia's NVDEC hardware -decoder and CUDA kernels to respectively decompress and convert to RGB. +TorchCodec can use supported Nvidia hardware (see support matrix +`here `) to speed-up +video decoding. This is called "CUDA Decoding" and it uses Nvidia's +`NVDEC hardware decoder ` +and CUDA kernels to respectively decompress and convert to RGB. CUDA Decoding can be faster than CPU Decoding for the actual decoding step and also for subsequent transform steps like scaling, cropping or rotating. This is because the decode step leaves the decoded tensor in GPU memory so the GPU doesn't have to fetch from main memory before From 9809feb58a77923f6456d80ad7fe3de9f047ea49 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Fri, 8 Nov 2024 13:38:18 -0800 Subject: [PATCH 61/68] . --- examples/basic_cuda_example.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/basic_cuda_example.py b/examples/basic_cuda_example.py index 4679251f..f1bd103a 100644 --- a/examples/basic_cuda_example.py +++ b/examples/basic_cuda_example.py @@ -10,9 +10,9 @@ .. _ndecoderec_tutorial: TorchCodec can use supported Nvidia hardware (see support matrix -`here `) to speed-up +`here `_) to speed-up video decoding. This is called "CUDA Decoding" and it uses Nvidia's -`NVDEC hardware decoder ` +`NVDEC hardware decoder `_ and CUDA kernels to respectively decompress and convert to RGB. CUDA Decoding can be faster than CPU Decoding for the actual decoding step and also for subsequent transform steps like scaling, cropping or rotating. This is because the decode step leaves @@ -43,7 +43,7 @@ In order to use CUDA Decoding will need the following installed in your environment: #. 
An Nvidia GPU that supports decoding the video format you want to decode. See - the support matrix here + the support matrix `here `_ #. `CUDA-enabled pytorch `_ #. FFmpeg binaries that support NdecoderEC-enabled codecs #. libnpp and nvrtc (these are usually installed when you install the full cuda-toolkit) From 92e2aef12ff6581da7d35dffd3ed48f130a7e260 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Fri, 8 Nov 2024 13:52:05 -0800 Subject: [PATCH 62/68] . --- .github/workflows/docs.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index d8a6499c..8b57adee 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -24,9 +24,14 @@ jobs: auto-update-conda: true miniconda-version: "latest" activate-environment: test - python-version: '3.12' + python-version: '3.9' - name: Update pip run: python -m pip install --upgrade pip + - name: Download wheel + uses: actions/download-artifact@v3 + with: + name: pytorch_torchcodec__3.9_cu124_x86_64 + path: pytorch/torchcodec/dist/ - name: Install torchcodec from the wheel run: | wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"` From 893c49081968d1d8317533cd0fe7554ba0643b1a Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Fri, 8 Nov 2024 14:14:50 -0800 Subject: [PATCH 63/68] . --- .github/workflows/docs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 8b57adee..5bc833d7 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -2,7 +2,7 @@ name: Docs on: workflow_run: - workflows: ["Build and test Linux CUDA wheels"] + workflows: [Build and test Linux CUDA wheels] types: - completed From 2a106ca871e3428eec49a792f19a167980e1dcaf Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Fri, 8 Nov 2024 18:11:29 -0800 Subject: [PATCH 64/68] . --- .github/workflows/docs.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 5bc833d7..ad50ba1d 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -1,10 +1,9 @@ name: Docs on: - workflow_run: - workflows: [Build and test Linux CUDA wheels] - types: - - completed + push: + branches: [ main ] + pull_request: defaults: run: From 39f460689b9aec5496f424f393df3eea8c667532 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Fri, 8 Nov 2024 18:26:35 -0800 Subject: [PATCH 65/68] . 
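Whether a particular GPU, driver and codec combination is supported (see the support matrix linked above) is easiest to establish by simply trying it. The fallback pattern below is an illustrative sketch rather than an API of the library; the broad ``except`` is deliberate and ``video.mp4`` is a placeholder:

    # Try CUDA decoding first, and fall back to the CPU if the GPU, driver or
    # codec is not supported. Decoding one frame surfaces such errors early.
    import torch

    from torchcodec.decoders import VideoDecoder


    def open_decoder(path: str) -> VideoDecoder:
        if torch.cuda.is_available():
            try:
                decoder = VideoDecoder(path, device="cuda")
                decoder[0]  # force one decode to catch unsupported setups
                return decoder
            except Exception as error:
                print(f"CUDA decoding not usable ({error}); using the CPU")
        return VideoDecoder(path, device="cpu")


    decoder = open_decoder("video.mp4")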
--- .github/workflows/docs.yaml | 99 ++++++++++++++++++++----- .github/workflows/linux_cuda_wheel.yaml | 2 +- 2 files changed, 80 insertions(+), 21 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index ad50ba1d..c2e682c8 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -10,37 +10,96 @@ defaults: shell: bash -l -eo pipefail {0} jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: linux + test-infra-repository: pytorch/test-infra + test-infra-ref: main + with-cpu: disable + with-xpu: disable + with-rocm: disable + with-cuda: enable + build-python-only: "disable" build: - runs-on: linux.g5.4xlarge.nvidia.gpu + needs: generate-matrix strategy: fail-fast: false + name: Build and Upload wheel + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main + with: + repository: pytorch/torchcodec + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + post-script: packaging/post_build_script.sh + smoke-test-script: packaging/fake_smoke_test.py + package-name: torchcodec + trigger-event: ${{ github.event_name }} + build-platform: "python-build-package" + build-command: "BUILD_AGAINST_ALL_FFMPEG_FROM_S3=1 ENABLE_CUDA=1 python -m build --wheel -vvv --no-isolation" + + build-docs: + runs-on: linux.4xlarge.nvidia.gpu + strategy: + fail-fast: false + matrix: + # 3.9 corresponds to the minimum python version for which we build + # the wheel unless the label cliflow/binaries/all is present in the + # PR. + # For the actual release we should add that label and change this to + # include more python versions. + python-version: ['3.9'] + cuda-version: ['12.4'] + ffmpeg-version-for-tests: ['7'] + container: + image: "pytorch/manylinux-builder:cuda${{ matrix.cuda-version }}" + options: "--gpus all -e NVIDIA_DRIVER_CAPABILITIES=video,compute,utility" + needs: build steps: - - name: Check out repo - uses: actions/checkout@v3 - - name: Setup conda env - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - miniconda-version: "latest" - activate-environment: test - python-version: '3.9' - - name: Update pip - run: python -m pip install --upgrade pip - - name: Download wheel - uses: actions/download-artifact@v3 + - name: Setup env vars + run: | + cuda_version_without_periods=$(echo "${{ matrix.cuda-version }}" | sed 's/\.//g') + echo cuda_version_without_periods=${cuda_version_without_periods} >> $GITHUB_ENV + - uses: actions/download-artifact@v3 with: - name: pytorch_torchcodec__3.9_cu124_x86_64 + name: pytorch_torchcodec__3.9_cu${{ env.cuda_version_without_periods }}_x86_64 path: pytorch/torchcodec/dist/ + - name: Setup miniconda using test-infra + uses: pytorch/test-infra/.github/actions/setup-miniconda@main + with: + python-version: ${{ matrix.python-version }} + # + # For some reason nvidia::libnpp=12.4 doesn't install but nvidia/label/cuda-12.4.0::libnpp does. + # So we use the latter convention for libnpp. + # We install conda packages at the start because otherwise conda may have conflicts with dependencies. 
+ default-packages: "nvidia/label/cuda-${{ matrix.cuda-version }}.0::libnpp nvidia::cuda-nvrtc=${{ matrix.cuda-version }} nvidia::cuda-toolkit=${{ matrix.cuda-version }} nvidia::cuda-cudart=${{ matrix.cuda-version }} nvidia::cuda-driver-dev=${{ matrix.cuda-version }} conda-forge::ffmpeg=${{ matrix.ffmpeg-version-for-tests }}" + - name: Check env + run: | + ${CONDA_RUN} env + ${CONDA_RUN} conda info + ${CONDA_RUN} nvidia-smi + ${CONDA_RUN} conda list + - name: Assert ffmpeg exists + run: | + ${CONDA_RUN} ffmpeg -buildconf + - name: Update pip + run: ${CONDA_RUN} python -m pip install --upgrade pip + - name: Install PyTorch + run: | + ${CONDA_RUN} python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu${{ env.cuda_version_without_periods }} + ${CONDA_RUN} python -c 'import torch; print(f"{torch.__version__}"); print(f"{torch.__file__}"); print(f"{torch.cuda.is_available()=}")' - name: Install torchcodec from the wheel run: | wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"` echo Installing $wheel_path ${CONDA_RUN} python -m pip install $wheel_path -vvv - - name: Install FFMPEG and other deps - run: | - conda install cuda-nvrtc=12.4 libnpp -c nvidia - conda install ffmpeg=7 -c conda-forge - ffmpeg -version + + - name: Check out repo + uses: actions/checkout@v3 + - name: Install doc dependencies run: | cd docs diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml index 915c5236..3f4ec262 100644 --- a/.github/workflows/linux_cuda_wheel.yaml +++ b/.github/workflows/linux_cuda_wheel.yaml @@ -1,4 +1,4 @@ -name: Build and test Linux CUDA wheels +name: linux_cuda_wheel on: pull_request: From a51dfbd3dcdc17022a6f8cb6e3bf61b87861dcd1 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Sun, 10 Nov 2024 15:46:16 -0800 Subject: [PATCH 66/68] . --- .github/workflows/docs.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index c2e682c8..bed896b7 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -5,6 +5,10 @@ on: branches: [ main ] pull_request: +permissions: + id-token: write + contents: write + defaults: run: shell: bash -l -eo pipefail {0} From f29b05cf6ee13d7ae3115d543633524fca673bd6 Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Sun, 10 Nov 2024 15:48:39 -0800 Subject: [PATCH 67/68] . --- .github/workflows/linux_cuda_wheel.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml index 3f4ec262..915c5236 100644 --- a/.github/workflows/linux_cuda_wheel.yaml +++ b/.github/workflows/linux_cuda_wheel.yaml @@ -1,4 +1,4 @@ -name: linux_cuda_wheel +name: Build and test Linux CUDA wheels on: pull_request: From 3f85afa6cfc5f5273294dfacfcd4f595aaa1801d Mon Sep 17 00:00:00 2001 From: Ahmad Sharif Date: Sun, 10 Nov 2024 15:57:43 -0800 Subject: [PATCH 68/68] . 
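The environment checks in the workflow above have a straightforward local counterpart. This sketch uses only standard-library calls plus the checks already shown in the tutorial; the clip name is a placeholder:

    # Local sanity check: FFmpeg on PATH, CUDA-enabled PyTorch, and one
    # successful CUDA decode.
    import shutil
    import subprocess

    import torch

    print(f"{torch.__version__=}")
    print(f"{torch.cuda.is_available()=}")

    ffmpeg = shutil.which("ffmpeg")
    print(f"ffmpeg binary: {ffmpeg}")
    if ffmpeg:
        subprocess.run([ffmpeg, "-version"], check=True)

    if torch.cuda.is_available():
        from torchcodec.decoders import VideoDecoder

        frame = VideoDecoder("video.mp4", device="cuda")[0]
        print("CUDA decode OK:", frame.data.shape, frame.data.device)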
From 3f85afa6cfc5f5273294dfacfcd4f595aaa1801d Mon Sep 17 00:00:00 2001
From: Ahmad Sharif
Date: Sun, 10 Nov 2024 15:57:43 -0800
Subject: [PATCH 68/68] .

---
 .github/workflows/docs.yaml             | 110 ++++++------------------
 .github/workflows/linux_cuda_wheel.yaml |   2 +-
 2 files changed, 25 insertions(+), 87 deletions(-)

diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
index bed896b7..0bafb718 100644
--- a/.github/workflows/docs.yaml
+++ b/.github/workflows/docs.yaml
@@ -1,109 +1,47 @@
 name: Docs

 on:
-  push:
-    branches: [ main ]
-  pull_request:
-
-permissions:
-  id-token: write
-  contents: write
+  workflow_run:
+    workflows: [linux_cuda_wheels]
+    types:
+      - completed

 defaults:
   run:
     shell: bash -l -eo pipefail {0}

 jobs:
-  generate-matrix:
-    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
-    with:
-      package-type: wheel
-      os: linux
-      test-infra-repository: pytorch/test-infra
-      test-infra-ref: main
-      with-cpu: disable
-      with-xpu: disable
-      with-rocm: disable
-      with-cuda: enable
-      build-python-only: "disable"
   build:
-    needs: generate-matrix
+    runs-on: linux.g5.4xlarge.nvidia.gpu
     strategy:
       fail-fast: false
-    name: Build and Upload wheel
-    uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
-    with:
-      repository: pytorch/torchcodec
-      ref: ""
-      test-infra-repository: pytorch/test-infra
-      test-infra-ref: main
-      build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
-      post-script: packaging/post_build_script.sh
-      smoke-test-script: packaging/fake_smoke_test.py
-      package-name: torchcodec
-      trigger-event: ${{ github.event_name }}
-      build-platform: "python-build-package"
-      build-command: "BUILD_AGAINST_ALL_FFMPEG_FROM_S3=1 ENABLE_CUDA=1 python -m build --wheel -vvv --no-isolation"
-
-  build-docs:
-    runs-on: linux.4xlarge.nvidia.gpu
-    strategy:
-      fail-fast: false
-      matrix:
-        # 3.9 corresponds to the minimum python version for which we build
-        # the wheel unless the label cliflow/binaries/all is present in the
-        # PR.
-        # For the actual release we should add that label and change this to
-        # include more python versions.
-        python-version: ['3.9']
-        cuda-version: ['12.4']
-        ffmpeg-version-for-tests: ['7']
-    container:
-      image: "pytorch/manylinux-builder:cuda${{ matrix.cuda-version }}"
-      options: "--gpus all -e NVIDIA_DRIVER_CAPABILITIES=video,compute,utility"
-    needs: build
     steps:
-      - name: Setup env vars
-        run: |
-          cuda_version_without_periods=$(echo "${{ matrix.cuda-version }}" | sed 's/\.//g')
-          echo cuda_version_without_periods=${cuda_version_without_periods} >> $GITHUB_ENV
-      - uses: actions/download-artifact@v3
-        with:
-          name: pytorch_torchcodec__3.9_cu${{ env.cuda_version_without_periods }}_x86_64
-          path: pytorch/torchcodec/dist/
-      - name: Setup miniconda using test-infra
-        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+      - name: Check out repo
+        uses: actions/checkout@v3
+      - name: Setup conda env
+        uses: conda-incubator/setup-miniconda@v2
         with:
-          python-version: ${{ matrix.python-version }}
-          #
-          # For some reason nvidia::libnpp=12.4 doesn't install but nvidia/label/cuda-12.4.0::libnpp does.
-          # So we use the latter convention for libnpp.
-          # We install conda packages at the start because otherwise conda may have conflicts with dependencies.
-          default-packages: "nvidia/label/cuda-${{ matrix.cuda-version }}.0::libnpp nvidia::cuda-nvrtc=${{ matrix.cuda-version }} nvidia::cuda-toolkit=${{ matrix.cuda-version }} nvidia::cuda-cudart=${{ matrix.cuda-version }} nvidia::cuda-driver-dev=${{ matrix.cuda-version }} conda-forge::ffmpeg=${{ matrix.ffmpeg-version-for-tests }}"
-      - name: Check env
-        run: |
-          ${CONDA_RUN} env
-          ${CONDA_RUN} conda info
-          ${CONDA_RUN} nvidia-smi
-          ${CONDA_RUN} conda list
-      - name: Assert ffmpeg exists
-        run: |
-          ${CONDA_RUN} ffmpeg -buildconf
+          auto-update-conda: true
+          miniconda-version: "latest"
+          activate-environment: test
+          python-version: '3.9'
       - name: Update pip
-        run: ${CONDA_RUN} python -m pip install --upgrade pip
-      - name: Install PyTorch
-        run: |
-          ${CONDA_RUN} python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu${{ env.cuda_version_without_periods }}
-          ${CONDA_RUN} python -c 'import torch; print(f"{torch.__version__}"); print(f"{torch.__file__}"); print(f"{torch.cuda.is_available()=}")'
+        run: python -m pip install --upgrade pip
+      - name: Download wheel
+        uses: actions/download-artifact@v3
+        with:
+          name: pytorch_torchcodec__3.9_cu124_x86_64
+          path: pytorch/torchcodec/dist/
       - name: Install torchcodec from the wheel
         run: |
           wheel_path=`find pytorch/torchcodec/dist -type f -name "*.whl"`
           echo Installing $wheel_path
           ${CONDA_RUN} python -m pip install $wheel_path -vvv
-
-      - name: Check out repo
-        uses: actions/checkout@v3
-
+      - name: Install FFMPEG and other deps
+        run: |
+          conda install cuda-nvrtc=12.4 libnpp -c nvidia
+          conda install ffmpeg=7 -c conda-forge
+          ffmpeg -version
       - name: Install doc dependencies
         run: |
           cd docs
diff --git a/.github/workflows/linux_cuda_wheel.yaml b/.github/workflows/linux_cuda_wheel.yaml
index 915c5236..17272a24 100644
--- a/.github/workflows/linux_cuda_wheel.yaml
+++ b/.github/workflows/linux_cuda_wheel.yaml
@@ -1,4 +1,4 @@
-name: Build and test Linux CUDA wheels
+name: linux_cuda_wheels

 on:
   pull_request:
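The end state of the series: the docs workflow no longer builds the wheel itself. It is triggered by workflow_run, and GitHub matches that trigger against the name: of the producing workflow, which is why linux_cuda_wheel.yaml is renamed to linux_cuda_wheels in the same commit; the docs job then downloads the cu124 wheel artifact and installs its runtime dependencies from conda. A rough local reproduction, where the environment name, the wheel location and the docs/requirements.txt path are assumptions rather than something the patches specify, could be:

    # Runtime deps mirror the "Install FFMPEG and other deps" step above.
    conda create -y -n torchcodec-docs python=3.9
    conda install -y -n torchcodec-docs -c nvidia cuda-nvrtc=12.4 libnpp
    conda install -y -n torchcodec-docs -c conda-forge ffmpeg=7
    # Nightly CUDA 12.4 torch, matching the cu124 wheel the job downloads.
    conda run -n torchcodec-docs python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu124
    # Install a locally built or downloaded torchcodec wheel, then the doc tooling (assumed paths).
    conda run -n torchcodec-docs python -m pip install dist/torchcodec-*.whl
    conda run -n torchcodec-docs python -m pip install -r docs/requirements.txt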