diff --git a/src/torchcodec/decoders/_audio_decoder.py b/src/torchcodec/decoders/_audio_decoder.py
index 34292751..46cee62b 100644
--- a/src/torchcodec/decoders/_audio_decoder.py
+++ b/src/torchcodec/decoders/_audio_decoder.py
@@ -25,10 +25,13 @@ def __init__(
         source: Union[str, Path, bytes, Tensor],
         *,
         stream_index: Optional[int] = None,
+        sample_rate: Optional[int] = None,
     ):
         self._decoder = create_decoder(source=source, seek_mode="approximate")
 
-        core.add_audio_stream(self._decoder, stream_index=stream_index)
+        core.add_audio_stream(
+            self._decoder, stream_index=stream_index, sample_rate=sample_rate
+        )
 
         (
             self.metadata,
@@ -39,6 +42,9 @@ def __init__(
             decoder=self._decoder, stream_index=stream_index, media_type="audio"
         )
         assert isinstance(self.metadata, core.AudioStreamMetadata)  # mypy
+        self._desired_sample_rate = (
+            sample_rate if sample_rate is not None else self.metadata.sample_rate
+        )
 
     def get_samples_played_in_range(
         self, start_seconds: float, stop_seconds: Optional[float] = None
@@ -75,11 +81,7 @@ def get_samples_played_in_range(
         # So we do some basic math to figure out the position of the view that
         # we'll return.
 
-        # TODO: sample_rate is either the original one from metadata, or the
-        # user-specified one (NIY)
-        assert isinstance(self.metadata, core.AudioStreamMetadata)  # mypy
-        sample_rate = self.metadata.sample_rate
-
+        sample_rate = self._desired_sample_rate
         # TODO: metadata's sample_rate should probably not be Optional
         assert sample_rate is not None  # mypy.
 
@@ -94,7 +96,7 @@ def get_samples_played_in_range(
             output_pts_seconds = first_pts
 
         num_samples = frames.shape[1]
-        last_pts = first_pts + num_samples / self.metadata.sample_rate
+        last_pts = first_pts + num_samples / sample_rate
         if stop_seconds is not None and stop_seconds < last_pts:
             offset_end = num_samples - round((last_pts - stop_seconds) * sample_rate)
         else:
diff --git a/src/torchcodec/decoders/_core/FFMPEGCommon.cpp b/src/torchcodec/decoders/_core/FFMPEGCommon.cpp
index eb82c5a2..1e3a1421 100644
--- a/src/torchcodec/decoders/_core/FFMPEGCommon.cpp
+++ b/src/torchcodec/decoders/_core/FFMPEGCommon.cpp
@@ -86,9 +86,10 @@ void setChannelLayout(
 
 SwrContext* allocateSwrContext(
     UniqueAVCodecContext& avCodecContext,
-    int sampleRate,
     AVSampleFormat sourceSampleFormat,
-    AVSampleFormat desiredSampleFormat) {
+    AVSampleFormat desiredSampleFormat,
+    int sourceSampleRate,
+    int desiredSampleRate) {
   SwrContext* swrContext = nullptr;
 #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
   AVChannelLayout layout = avCodecContext->ch_layout;
@@ -96,10 +97,10 @@ SwrContext* allocateSwrContext(
       &swrContext,
       &layout,
       desiredSampleFormat,
-      sampleRate,
+      desiredSampleRate,
       &layout,
       sourceSampleFormat,
-      sampleRate,
+      sourceSampleRate,
       0,
       nullptr);
 
@@ -113,10 +114,10 @@ SwrContext* allocateSwrContext(
       nullptr,
       layout,
       desiredSampleFormat,
-      sampleRate,
+      desiredSampleRate,
       layout,
       sourceSampleFormat,
-      sampleRate,
+      sourceSampleRate,
       0,
       nullptr);
 #endif
diff --git a/src/torchcodec/decoders/_core/FFMPEGCommon.h b/src/torchcodec/decoders/_core/FFMPEGCommon.h
index 382563aa..c61ae287 100644
--- a/src/torchcodec/decoders/_core/FFMPEGCommon.h
+++ b/src/torchcodec/decoders/_core/FFMPEGCommon.h
@@ -149,9 +149,10 @@ void setChannelLayout(
     const UniqueAVFrame& srcAVFrame);
 SwrContext* allocateSwrContext(
     UniqueAVCodecContext& avCodecContext,
-    int sampleRate,
     AVSampleFormat sourceSampleFormat,
-    AVSampleFormat desiredSampleFormat);
+    AVSampleFormat desiredSampleFormat,
+    int sourceSampleRate,
+    int desiredSampleRate);
 
 // Returns true if sws_scale can handle unaligned data.
 bool canSwsScaleHandleUnalignedData();
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
index d4d32058..f451ee58 100644
--- a/src/torchcodec/decoders/_core/VideoDecoder.cpp
+++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -580,7 +580,9 @@ void VideoDecoder::addVideoStream(
       videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary);
 }
 
-void VideoDecoder::addAudioStream(int streamIndex) {
+void VideoDecoder::addAudioStream(
+    int streamIndex,
+    const AudioStreamOptions& audioStreamOptions) {
   TORCH_CHECK(
       seekMode_ == SeekMode::approximate,
       "seek_mode must be 'approximate' for audio streams.");
@@ -588,6 +590,8 @@ void VideoDecoder::addAudioStream(int streamIndex) {
   addStream(streamIndex, AVMEDIA_TYPE_AUDIO);
 
   auto& streamInfo = streamInfos_[activeStreamIndex_];
+  streamInfo.audioStreamOptions = audioStreamOptions;
+
   auto& streamMetadata =
       containerMetadata_.allStreamMetadata[activeStreamIndex_];
   streamMetadata.sampleRate =
@@ -947,6 +951,11 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio(
         (stopPts <= lastDecodedAvFrameEnd);
   }
 
+  auto lastSamples = maybeFlushSwrBuffers();
+  if (lastSamples.has_value()) {
+    frames.push_back(*lastSamples);
+  }
+
   return AudioFramesOutput{torch::cat(frames, 1), firstFramePtsSeconds};
 }
 
@@ -1200,8 +1209,7 @@ VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput(
       getDuration(avFrame),
       formatContext_->streams[activeStreamIndex_]->time_base);
   if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
-    convertAudioAVFrameToFrameOutputOnCPU(
-        avFrame, frameOutput, preAllocatedOutputTensor);
+    convertAudioAVFrameToFrameOutputOnCPU(avFrame, frameOutput);
   } else if (streamInfo.videoStreamOptions.device.type() == torch::kCPU) {
     convertAVFrameToFrameOutputOnCPU(
         avFrame, frameOutput, preAllocatedOutputTensor);
@@ -1379,24 +1387,30 @@ torch::Tensor VideoDecoder::convertAVFrameToTensorUsingFilterGraph(
 
 void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
     UniqueAVFrame& srcAVFrame,
-    FrameOutput& frameOutput,
-    std::optional<torch::Tensor> preAllocatedOutputTensor) {
-  TORCH_CHECK(
-      !preAllocatedOutputTensor.has_value(),
-      "pre-allocated audio tensor not supported yet.");
-
+    FrameOutput& frameOutput) {
   AVSampleFormat sourceSampleFormat =
       static_cast<AVSampleFormat>(srcAVFrame->format);
   AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP;
 
+  int sourceSampleRate = srcAVFrame->sample_rate;
+  int desiredSampleRate =
+      streamInfos_[activeStreamIndex_].audioStreamOptions.sampleRate.value_or(
+          sourceSampleRate);
+
+  bool mustConvert =
+      (sourceSampleFormat != desiredSampleFormat ||
+       sourceSampleRate != desiredSampleRate);
+
   UniqueAVFrame convertedAVFrame;
-  if (sourceSampleFormat != desiredSampleFormat) {
-    convertedAVFrame = convertAudioAVFrameSampleFormat(
-        srcAVFrame, sourceSampleFormat, desiredSampleFormat);
+  if (mustConvert) {
+    convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate(
+        srcAVFrame,
+        sourceSampleFormat,
+        desiredSampleFormat,
+        sourceSampleRate,
+        desiredSampleRate);
   }
-  const UniqueAVFrame& avFrame = (sourceSampleFormat != desiredSampleFormat)
-      ? convertedAVFrame
-      : srcAVFrame;
+  const UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;
 
   AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
   TORCH_CHECK(
@@ -1419,23 +1433,25 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU(
     memcpy(
         outputChannelData, avFrame->extended_data[channel], numBytesPerChannel);
   }
+
   frameOutput.data = outputData;
 }
 
-UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormat(
-    const UniqueAVFrame& avFrame,
+UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormatAndSampleRate(
+    const UniqueAVFrame& srcAVFrame,
     AVSampleFormat sourceSampleFormat,
-    AVSampleFormat desiredSampleFormat
-
-) {
+    AVSampleFormat desiredSampleFormat,
+    int sourceSampleRate,
+    int desiredSampleRate) {
   auto& streamInfo = streamInfos_[activeStreamIndex_];
-  const auto& streamMetadata =
-      containerMetadata_.allStreamMetadata[activeStreamIndex_];
-  int sampleRate = static_cast<int>(streamMetadata.sampleRate.value());
 
   if (!streamInfo.swrContext) {
     createSwrContext(
-        streamInfo, sampleRate, sourceSampleFormat, desiredSampleFormat);
+        streamInfo,
+        sourceSampleFormat,
+        desiredSampleFormat,
+        sourceSampleRate,
+        desiredSampleRate);
   }
 
   UniqueAVFrame convertedAVFrame(av_frame_alloc());
@@ -1443,10 +1459,27 @@ UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormat(
       convertedAVFrame,
       "Could not allocate frame for sample format conversion.");
 
-  setChannelLayout(convertedAVFrame, avFrame);
+  setChannelLayout(convertedAVFrame, srcAVFrame);
   convertedAVFrame->format = static_cast<int>(desiredSampleFormat);
-  convertedAVFrame->sample_rate = avFrame->sample_rate;
-  convertedAVFrame->nb_samples = avFrame->nb_samples;
+  convertedAVFrame->sample_rate = desiredSampleRate;
+  if (sourceSampleRate != desiredSampleRate) {
+    // Note that this is an upper bound on the number of output samples.
+    // `swr_convert()` will likely not fill convertedAVFrame with that many
+    // samples if sample rate conversion is needed. It will buffer the last few
+    // ones because those require future samples. That's also why we reset
+    // nb_samples after the call to `swr_convert()`.
+    // We could also use `swr_get_out_samples()` to determine the number of
+    // output samples, but empirically `av_rescale_rnd()` seems to provide a
+    // tighter bound.
+    convertedAVFrame->nb_samples = av_rescale_rnd(
+        swr_get_delay(streamInfo.swrContext.get(), sourceSampleRate) +
+            srcAVFrame->nb_samples,
+        desiredSampleRate,
+        sourceSampleRate,
+        AV_ROUND_UP);
+  } else {
+    convertedAVFrame->nb_samples = srcAVFrame->nb_samples;
+  }
 
   auto status = av_frame_get_buffer(convertedAVFrame.get(), 0);
   TORCH_CHECK(
@@ -1454,20 +1487,56 @@ UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormat(
       "Could not allocate frame buffers for sample format conversion: ",
       getFFMPEGErrorStringFromErrorCode(status));
 
-  auto numSampleConverted = swr_convert(
+  auto numConvertedSamples = swr_convert(
       streamInfo.swrContext.get(),
       convertedAVFrame->data,
       convertedAVFrame->nb_samples,
-      static_cast<const uint8_t**>(const_cast<const uint8_t**>(avFrame->data)),
-      avFrame->nb_samples);
+      static_cast<const uint8_t**>(
+          const_cast<const uint8_t**>(srcAVFrame->data)),
+      srcAVFrame->nb_samples);
   TORCH_CHECK(
-      numSampleConverted > 0,
+      numConvertedSamples > 0,
       "Error in swr_convert: ",
-      getFFMPEGErrorStringFromErrorCode(numSampleConverted));
+      getFFMPEGErrorStringFromErrorCode(numConvertedSamples));
+
+  // See comment above about nb_samples
+  convertedAVFrame->nb_samples = numConvertedSamples;
 
   return convertedAVFrame;
 }
 
+std::optional<torch::Tensor> VideoDecoder::maybeFlushSwrBuffers() {
+  // When sample rate conversion is involved, swresample buffers some samples
+  // in-between calls to swr_convert(): the last few samples of a frame need
+  // samples from the next frame to be converted. This function drains those
+  // buffered samples. Returns nullopt when there is nothing to flush.
+  auto& streamInfo = streamInfos_[activeStreamIndex_];
+  if (!streamInfo.swrContext) {
+    return std::nullopt;
+  }
+  auto numRemainingSamples = // this is an upper bound
+      swr_get_out_samples(streamInfo.swrContext.get(), 0);
+  if (numRemainingSamples == 0) {
+    return std::nullopt;
+  }
+  auto numChannels = getNumChannels(streamInfo.codecContext);
+  torch::Tensor lastSamples =
+      torch::empty({numChannels, numRemainingSamples}, torch::kFloat32);
+  // Output is planar (FLTP): swr_convert() needs one buffer pointer per channel.
+  std::vector<uint8_t*> outputBuffers(numChannels);
+  for (auto i = 0; i < numChannels; i++) {
+    outputBuffers[i] = static_cast<uint8_t*>(lastSamples[i].data_ptr());
+  }
+  auto actualNumRemainingSamples = swr_convert(
+      streamInfo.swrContext.get(),
+      outputBuffers.data(), numRemainingSamples, nullptr, 0);
+  TORCH_CHECK(
+      actualNumRemainingSamples >= 0,
+      "Error in swr_convert: ",
+      getFFMPEGErrorStringFromErrorCode(actualNumRemainingSamples));
+  return lastSamples.narrow(/*dim=*/1, /*start=*/0, actualNumRemainingSamples);
+}
+
 // --------------------------------------------------------------------------
 // OUTPUT ALLOCATION AND SHAPE CONVERSION
 // --------------------------------------------------------------------------
@@ -1703,14 +1772,16 @@ void VideoDecoder::createSwsContext(
 
 void VideoDecoder::createSwrContext(
     StreamInfo& streamInfo,
-    int sampleRate,
     AVSampleFormat sourceSampleFormat,
-    AVSampleFormat desiredSampleFormat) {
+    AVSampleFormat desiredSampleFormat,
+    int sourceSampleRate,
+    int desiredSampleRate) {
   auto swrContext = allocateSwrContext(
       streamInfo.codecContext,
-      sampleRate,
       sourceSampleFormat,
-      desiredSampleFormat);
+      desiredSampleFormat,
+      sourceSampleRate,
+      desiredSampleRate);
 
   auto status = swr_init(swrContext);
   TORCH_CHECK(
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
index 8259a785..bc810952 100644
--- a/src/torchcodec/decoders/_core/VideoDecoder.h
+++ b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -139,10 +139,18 @@ class VideoDecoder {
     torch::Device device = torch::kCPU;
   };
 
+  struct AudioStreamOptions {
+    AudioStreamOptions() {}
+    // Desired output sample rate; when unset the source sample rate is kept.
+    std::optional<int> sampleRate;
+  };
+
   void addVideoStream(
       int streamIndex,
       const VideoStreamOptions& videoStreamOptions = VideoStreamOptions());
-  void addAudioStream(int streamIndex);
+  void addAudioStream(
+      int streamIndex,
+      const AudioStreamOptions& audioStreamOptions = AudioStreamOptions());
 
   // --------------------------------------------------------------------------
   // DECODING AND SEEKING APIs
@@ -336,6 +344,7 @@ class VideoDecoder {
     int64_t lastDecodedAvFramePts = 0;
     int64_t lastDecodedAvFrameDuration = 0;
     VideoStreamOptions videoStreamOptions;
+    AudioStreamOptions audioStreamOptions;
 
     // color-conversion fields. Only one of FilterGraphContext and
     // UniqueSwsContext should be non-null.
@@ -383,8 +392,7 @@ class VideoDecoder {
 
   void convertAudioAVFrameToFrameOutputOnCPU(
       UniqueAVFrame& srcAVFrame,
-      FrameOutput& frameOutput,
-      std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
+      FrameOutput& frameOutput);
 
   torch::Tensor convertAVFrameToTensorUsingFilterGraph(
       const UniqueAVFrame& avFrame);
@@ -393,10 +401,14 @@ class VideoDecoder {
       const UniqueAVFrame& avFrame,
       torch::Tensor& outputTensor);
 
-  UniqueAVFrame convertAudioAVFrameSampleFormat(
-      const UniqueAVFrame& avFrame,
+  UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate(
+      const UniqueAVFrame& srcAVFrame,
       AVSampleFormat sourceSampleFormat,
-      AVSampleFormat desiredSampleFormat);
+      AVSampleFormat desiredSampleFormat,
+      int sourceSampleRate,
+      int desiredSampleRate);
+
+  std::optional<torch::Tensor> maybeFlushSwrBuffers();
 
   // --------------------------------------------------------------------------
   // COLOR CONVERSION LIBRARIES HANDLERS CREATION
@@ -414,9 +426,10 @@ class VideoDecoder {
 
   void createSwrContext(
       StreamInfo& streamInfo,
-      int sampleRate,
       AVSampleFormat sourceSampleFormat,
-      AVSampleFormat desiredSampleFormat);
+      AVSampleFormat desiredSampleFormat,
+      int sourceSampleRate,
+      int desiredSampleRate);
 
   // --------------------------------------------------------------------------
   // PTS <-> INDEX CONVERSIONS
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
index fffb1118..adbed7ca 100644
--- a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
+++ b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp
@@ -34,7 +34,7 @@ TORCH_LIBRARY(torchcodec_ns, m) {
   m.def(
       "add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None) -> ()");
   m.def(
-      "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None) -> ()");
+      "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None) -> ()");
   m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
   m.def("get_next_frame(Tensor(a!) decoder) -> (Tensor, Tensor, Tensor)");
   m.def(
@@ -220,9 +220,13 @@ void _add_video_stream(
 
 void add_audio_stream(
     at::Tensor& decoder,
-    std::optional<int64_t> stream_index) {
+    std::optional<int64_t> stream_index,
+    std::optional<int64_t> sample_rate) {
+  VideoDecoder::AudioStreamOptions audioStreamOptions;
+  audioStreamOptions.sampleRate = sample_rate;
+
   auto videoDecoder = unwrapTensorToGetDecoder(decoder);
-  videoDecoder->addAudioStream(stream_index.value_or(-1));
+  videoDecoder->addAudioStream(stream_index.value_or(-1), audioStreamOptions);
 }
 
 void seek_to_pts(at::Tensor& decoder, double seconds) {
diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.h b/src/torchcodec/decoders/_core/VideoDecoderOps.h
index a77dec66..bc7f2036 100644
--- a/src/torchcodec/decoders/_core/VideoDecoderOps.h
+++ b/src/torchcodec/decoders/_core/VideoDecoderOps.h
@@ -50,7 +50,8 @@ void _add_video_stream(
 
 void add_audio_stream(
     at::Tensor& decoder,
-    std::optional<int64_t> stream_index = std::nullopt);
+    std::optional<int64_t> stream_index = std::nullopt,
+    std::optional<int64_t> sample_rate = std::nullopt);
 
 // Seek to a particular presentation timestamp in the video in seconds.
 void seek_to_pts(at::Tensor& decoder, double seconds);
diff --git a/test/decoders/test_decoders.py b/test/decoders/test_decoders.py
index a2d70f51..6885b51a 100644
--- a/test/decoders/test_decoders.py
+++ b/test/decoders/test_decoders.py
@@ -27,6 +27,8 @@
     NASA_VIDEO,
     SINE_MONO_S16,
     SINE_MONO_S32,
+    SINE_MONO_S32_44100,
+    SINE_MONO_S32_8000,
 )
 
 
@@ -1090,6 +1092,71 @@ def test_format_conversion(self):
         reference_frames = asset.get_frame_data_by_range(start=0, stop=asset.num_frames)
         torch.testing.assert_close(all_samples.data, reference_frames)
 
+    @pytest.mark.parametrize(
+        "start_seconds, stop_seconds",
+        (
+            (0, None),
+            (0, 4),
+            (0, 3),
+            (2, None),
+            (2, 3),
+        ),
+    )
+    def test_sample_rate_conversion(self, start_seconds, stop_seconds):
+        # When start_seconds is not exactly 0, we have to increase the tolerance
+        # a bit. This is because sample-rate conversion relies on a sliding
+        # window of samples: if we start decoding a stream in the middle, the
+        # first few samples we're decoding aren't able to take advantage of the
+        # preceding samples for sample-rate conversion. This leads to a
+        # slightly different sample-rate conversion than we would otherwise get,
+        # had we started the stream from the beginning.
+        atol = 1e-6 if start_seconds == 0 else 1e-2
+        rtol = 1e-6
+
+        # Upsample: 16 kHz asset resampled to 44.1 kHz vs. the native 44.1 kHz asset
+        decoder = AudioDecoder(SINE_MONO_S32_44100.path)
+        assert decoder.metadata.sample_rate == 44_100
+        frames_44100_native = decoder.get_samples_played_in_range(
+            start_seconds=start_seconds, stop_seconds=stop_seconds
+        )
+        assert frames_44100_native.sample_rate == 44_100
+
+        decoder = AudioDecoder(SINE_MONO_S32.path, sample_rate=44_100)
+        frames_upsampled_to_44100 = decoder.get_samples_played_in_range(
+            start_seconds=start_seconds, stop_seconds=stop_seconds
+        )
+        assert decoder.metadata.sample_rate == 16_000
+        assert frames_upsampled_to_44100.sample_rate == 44_100
+
+        torch.testing.assert_close(
+            frames_upsampled_to_44100.data,
+            frames_44100_native.data,
+            atol=atol,
+            rtol=rtol,
+        )
+
+        # Downsample: 16 kHz asset resampled to 8 kHz vs. the native 8 kHz asset
+        decoder = AudioDecoder(SINE_MONO_S32_8000.path)
+        assert decoder.metadata.sample_rate == 8000
+        frames_8000_native = decoder.get_samples_played_in_range(
+            start_seconds=start_seconds, stop_seconds=stop_seconds
+        )
+        assert frames_8000_native.sample_rate == 8000
+
+        decoder = AudioDecoder(SINE_MONO_S32.path, sample_rate=8000)
+        frames_downsampled_to_8000 = decoder.get_samples_played_in_range(
+            start_seconds=start_seconds, stop_seconds=stop_seconds
+        )
+        assert decoder.metadata.sample_rate == 16_000
+        assert frames_downsampled_to_8000.sample_rate == 8000
+
+        torch.testing.assert_close(
+            frames_downsampled_to_8000.data,
+            frames_8000_native.data,
+            atol=atol,
+            rtol=rtol,
+        )
+
     def test_s16_ffmpeg4_bug(self):
         # s16 fails on FFmpeg4 but can be decoded on other versions.
         # Debugging logs show that we're hitting:
diff --git a/test/decoders/test_ops.py b/test/decoders/test_ops.py
index 9088539f..264417c3 100644
--- a/test/decoders/test_ops.py
+++ b/test/decoders/test_ops.py
@@ -44,6 +44,9 @@
     NASA_AUDIO_MP3,
     NASA_VIDEO,
     needs_cuda,
+    SINE_MONO_S32,
+    SINE_MONO_S32_44100,
+    SINE_MONO_S32_8000,
 )
 
 torch._dynamo.config.capture_dynamic_output_shape_ops = True
@@ -880,6 +883,33 @@ def test_decode_before_frame_start(self):
         # TODO fix this. `frames` should be empty.
         torch.testing.assert_close(frames, all_frames)
 
+    def test_sample_rate_conversion(self):
+        def get_all_frames(asset, sample_rate=None, stop_seconds=None):
+            decoder = create_from_file(str(asset.path), seek_mode="approximate")
+            add_audio_stream(decoder, sample_rate=sample_rate)
+            frames, *_ = get_frames_by_pts_in_range_audio(
+                decoder, start_seconds=0, stop_seconds=stop_seconds
+            )
+            return frames
+
+        # Upsample: 16 kHz asset resampled to 44.1 kHz vs. the native 44.1 kHz asset
+        assert SINE_MONO_S32_44100.sample_rate == 44_100
+        frames_44100_native = get_all_frames(SINE_MONO_S32_44100)
+
+        assert SINE_MONO_S32.sample_rate == 16_000
+        frames_upsampled_to_44100 = get_all_frames(SINE_MONO_S32, sample_rate=44_100)
+
+        torch.testing.assert_close(frames_upsampled_to_44100, frames_44100_native)
+
+        # Downsample: 16 kHz asset resampled to 8 kHz vs. the native 8 kHz asset
+        assert SINE_MONO_S32_8000.sample_rate == 8000
+        frames_8000_native = get_all_frames(SINE_MONO_S32_8000)
+
+        assert SINE_MONO_S32.sample_rate == 16_000
+        frames_downsampled_to_8000 = get_all_frames(SINE_MONO_S32, sample_rate=8000)
+
+        torch.testing.assert_close(frames_downsampled_to_8000, frames_8000_native)
+
 
 if __name__ == "__main__":
     pytest.main()
diff --git a/test/resources/sine_mono_s32_44100.wav b/test/resources/sine_mono_s32_44100.wav
new file mode 100644
index 00000000..3d4162d2
Binary files /dev/null and b/test/resources/sine_mono_s32_44100.wav differ
diff --git a/test/resources/sine_mono_s32_44100.wav.stream0.all_frames_info.json b/test/resources/sine_mono_s32_44100.wav.stream0.all_frames_info.json
new file mode 100644
index 00000000..f70551f1
--- /dev/null
+++ b/test/resources/sine_mono_s32_44100.wav.stream0.all_frames_info.json
@@ -0,0 +1,694 @@
+[
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.000000"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.023220"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.046440"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.069660"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.092880"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.116100"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.139320"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.162540"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.185760"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.208980"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.232200"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.255420"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.278639"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.301859"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.325079"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.348299"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.371519"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.394739"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.417959"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.441179"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.464399"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.487619"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.510839"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.534059"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.557279"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.580499"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.603719"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.626939"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.650159"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.673379"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.696599"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.719819"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.743039"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.766259"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.789478"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.812698"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.835918"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.859138"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.882358"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.905578"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.928798"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.952018"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.975238"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "0.998458"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.021678"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.044898"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.068118"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.091338"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.114558"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.137778"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.160998"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.184218"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.207438"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.230658"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.253878"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.277098"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.300317"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.323537"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.346757"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.369977"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.393197"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.416417"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.439637"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.462857"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.486077"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.509297"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.532517"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.555737"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.578957"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.602177"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.625397"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.648617"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.671837"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.695057"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.718277"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.741497"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.764717"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.787937"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.811156"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.834376"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.857596"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.880816"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.904036"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.927256"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.950476"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.973696"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "1.996916"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.020136"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.043356"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.066576"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.089796"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.113016"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.136236"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.159456"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.182676"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.205896"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.229116"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.252336"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.275556"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.298776"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.321995"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.345215"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.368435"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.391655"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.414875"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.438095"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.461315"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.484535"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.507755"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.530975"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.554195"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.577415"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.600635"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.623855"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.647075"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.670295"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.693515"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.716735"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.739955"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.763175"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.786395"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.809615"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.832834"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.856054"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.879274"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.902494"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.925714"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.948934"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.972154"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "2.995374"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.018594"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.041814"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.065034"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.088254"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.111474"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.134694"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.157914"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.181134"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.204354"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.227574"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.250794"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.274014"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.297234"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.320454"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.343673"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.366893"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.390113"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.413333"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.436553"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.459773"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.482993"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.506213"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.529433"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.552653"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.575873"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.599093"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.622313"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.645533"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.668753"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.691973"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.715193"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.738413"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.761633"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.784853"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.808073"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.831293"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.854512"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.877732"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.900952"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.924172"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.947392"
+  },
+  {
+    "duration_time": "0.023220",
+    "pts_time": "3.970612"
+  },
+  {
+    "duration_time": "0.006168",
+    "pts_time": "3.993832"
+  }
+]
diff --git a/test/resources/sine_mono_s32_8000.wav b/test/resources/sine_mono_s32_8000.wav
new file mode 100644
index 00000000..ccc5e671
Binary files /dev/null and b/test/resources/sine_mono_s32_8000.wav differ
diff --git a/test/resources/sine_mono_s32_8000.wav.stream0.all_frames_info.json b/test/resources/sine_mono_s32_8000.wav.stream0.all_frames_info.json
new file mode 100644
index 00000000..689def1e
--- /dev/null
+++ b/test/resources/sine_mono_s32_8000.wav.stream0.all_frames_info.json
@@ -0,0 +1,130 @@
+[
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.000000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.128000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.256000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.384000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.512000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.640000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.768000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.896000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.024000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.152000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.280000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.408000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.536000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.664000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.792000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.920000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.048000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.176000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.304000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.432000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.560000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.688000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.816000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.944000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.072000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.200000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.328000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.456000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.584000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.712000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.840000"
+  },
+  {
+    "duration_time": "0.032000",
+    "pts_time": "3.968000"
+  }
+]
diff --git a/test/utils.py b/test/utils.py
index efee0deb..d1cdd47d 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -462,6 +462,39 @@ def sample_format(self) -> str:
     },
 )
 
+# This file is an upsampled (44.1 kHz) version of SINE_MONO_S32, generated with:
+# ffmpeg -i test/resources/sine_mono_s32.wav -ar 44100 -c:a pcm_s32le test/resources/sine_mono_s32_44100.wav
+SINE_MONO_S32_44100 = TestAudio(
+    filename="sine_mono_s32_44100.wav",
+    default_stream_index=0,
+    frames={},  # Left empty on purpose: automatically loaded from json file
+    stream_infos={
+        0: TestAudioStreamInfo(
+            sample_rate=44_100,  # Hz; matches `-ar 44100` in the command above
+            num_channels=1,
+            duration_seconds=4,
+            num_frames=173,  # presumably 172 full frames + 1 short final one; verify against the frames-info json
+            sample_format="s32",  # matches `-c:a pcm_s32le` in the command above
+        )
+    },
+)
+
+# This file is a downsampled (8 kHz) version of SINE_MONO_S32, generated with:
+# ffmpeg -i test/resources/sine_mono_s32.wav -ar 8000 -c:a pcm_s32le test/resources/sine_mono_s32_8000.wav
+SINE_MONO_S32_8000 = TestAudio(
+    filename="sine_mono_s32_8000.wav",
+    default_stream_index=0,
+    frames={},  # Left empty on purpose: automatically loaded from json file
+    stream_infos={
+        0: TestAudioStreamInfo(
+            sample_rate=8000,  # Hz; matches `-ar 8000` in the command above
+            num_channels=1,
+            duration_seconds=4,  # last frame in the json: pts 3.968 s + duration 0.032 s
+            num_frames=32,  # json lists 31 frames of 0.128 s + a final 0.032 s frame
+            sample_format="s32",  # matches `-c:a pcm_s32le` in the command above
+        )
+    },
+)
 
 # Same sample rate as SINE_MONO_S32, but encoded as s16 instead of s32. Generated with:
 # ffmpeg -i test/resources/sine_mono_s32.wav -ar 16000 -c:a pcm_s16le test/resources/sine_mono_s16.wav