diff --git a/src/torchcodec/decoders/_audio_decoder.py b/src/torchcodec/decoders/_audio_decoder.py index 34292751..46cee62b 100644 --- a/src/torchcodec/decoders/_audio_decoder.py +++ b/src/torchcodec/decoders/_audio_decoder.py @@ -25,10 +25,13 @@ def __init__( source: Union[str, Path, bytes, Tensor], *, stream_index: Optional[int] = None, + sample_rate: Optional[int] = None, ): self._decoder = create_decoder(source=source, seek_mode="approximate") - core.add_audio_stream(self._decoder, stream_index=stream_index) + core.add_audio_stream( + self._decoder, stream_index=stream_index, sample_rate=sample_rate + ) ( self.metadata, @@ -39,6 +42,9 @@ def __init__( decoder=self._decoder, stream_index=stream_index, media_type="audio" ) assert isinstance(self.metadata, core.AudioStreamMetadata) # mypy + self._desired_sample_rate = ( + sample_rate if sample_rate is not None else self.metadata.sample_rate + ) def get_samples_played_in_range( self, start_seconds: float, stop_seconds: Optional[float] = None @@ -75,11 +81,7 @@ def get_samples_played_in_range( # So we do some basic math to figure out the position of the view that # we'll return. - # TODO: sample_rate is either the original one from metadata, or the - # user-specified one (NIY) - assert isinstance(self.metadata, core.AudioStreamMetadata) # mypy - sample_rate = self.metadata.sample_rate - + sample_rate = self._desired_sample_rate # TODO: metadata's sample_rate should probably not be Optional assert sample_rate is not None # mypy. 
@@ -94,7 +96,7 @@ def get_samples_played_in_range( output_pts_seconds = first_pts num_samples = frames.shape[1] - last_pts = first_pts + num_samples / self.metadata.sample_rate + last_pts = first_pts + num_samples / sample_rate if stop_seconds is not None and stop_seconds < last_pts: offset_end = num_samples - round((last_pts - stop_seconds) * sample_rate) else: diff --git a/src/torchcodec/decoders/_core/FFMPEGCommon.cpp b/src/torchcodec/decoders/_core/FFMPEGCommon.cpp index eb82c5a2..1e3a1421 100644 --- a/src/torchcodec/decoders/_core/FFMPEGCommon.cpp +++ b/src/torchcodec/decoders/_core/FFMPEGCommon.cpp @@ -86,9 +86,10 @@ void setChannelLayout( SwrContext* allocateSwrContext( UniqueAVCodecContext& avCodecContext, - int sampleRate, AVSampleFormat sourceSampleFormat, - AVSampleFormat desiredSampleFormat) { + AVSampleFormat desiredSampleFormat, + int sourceSampleRate, + int desiredSampleRate) { SwrContext* swrContext = nullptr; #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4 AVChannelLayout layout = avCodecContext->ch_layout; @@ -96,10 +97,10 @@ SwrContext* allocateSwrContext( &swrContext, &layout, desiredSampleFormat, - sampleRate, + desiredSampleRate, &layout, sourceSampleFormat, - sampleRate, + sourceSampleRate, 0, nullptr); @@ -113,10 +114,10 @@ SwrContext* allocateSwrContext( nullptr, layout, desiredSampleFormat, - sampleRate, + desiredSampleRate, layout, sourceSampleFormat, - sampleRate, + sourceSampleRate, 0, nullptr); #endif diff --git a/src/torchcodec/decoders/_core/FFMPEGCommon.h b/src/torchcodec/decoders/_core/FFMPEGCommon.h index 382563aa..c61ae287 100644 --- a/src/torchcodec/decoders/_core/FFMPEGCommon.h +++ b/src/torchcodec/decoders/_core/FFMPEGCommon.h @@ -149,9 +149,10 @@ void setChannelLayout( const UniqueAVFrame& srcAVFrame); SwrContext* allocateSwrContext( UniqueAVCodecContext& avCodecContext, - int sampleRate, AVSampleFormat sourceSampleFormat, - AVSampleFormat desiredSampleFormat); + AVSampleFormat desiredSampleFormat, + int sourceSampleRate, + 
int desiredSampleRate); // Returns true if sws_scale can handle unaligned data. bool canSwsScaleHandleUnalignedData(); diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp index d4d32058..f451ee58 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoder.cpp @@ -580,7 +580,9 @@ void VideoDecoder::addVideoStream( videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary); } -void VideoDecoder::addAudioStream(int streamIndex) { +void VideoDecoder::addAudioStream( + int streamIndex, + const AudioStreamOptions& audioStreamOptions) { TORCH_CHECK( seekMode_ == SeekMode::approximate, "seek_mode must be 'approximate' for audio streams."); @@ -588,6 +590,8 @@ void VideoDecoder::addAudioStream(int streamIndex) { addStream(streamIndex, AVMEDIA_TYPE_AUDIO); auto& streamInfo = streamInfos_[activeStreamIndex_]; + streamInfo.audioStreamOptions = audioStreamOptions; + auto& streamMetadata = containerMetadata_.allStreamMetadata[activeStreamIndex_]; streamMetadata.sampleRate = @@ -947,6 +951,11 @@ VideoDecoder::AudioFramesOutput VideoDecoder::getFramesPlayedInRangeAudio( (stopPts <= lastDecodedAvFrameEnd); } + auto lastSamples = maybeFlushSwrBuffers(); + if (lastSamples.has_value()) { + frames.push_back(*lastSamples); + } + return AudioFramesOutput{torch::cat(frames, 1), firstFramePtsSeconds}; } @@ -1200,8 +1209,7 @@ VideoDecoder::FrameOutput VideoDecoder::convertAVFrameToFrameOutput( getDuration(avFrame), formatContext_->streams[activeStreamIndex_]->time_base); if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) { - convertAudioAVFrameToFrameOutputOnCPU( - avFrame, frameOutput, preAllocatedOutputTensor); + convertAudioAVFrameToFrameOutputOnCPU(avFrame, frameOutput); } else if (streamInfo.videoStreamOptions.device.type() == torch::kCPU) { convertAVFrameToFrameOutputOnCPU( avFrame, frameOutput, preAllocatedOutputTensor); @@ -1379,24 +1387,30 @@ torch::Tensor 
VideoDecoder::convertAVFrameToTensorUsingFilterGraph( void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU( UniqueAVFrame& srcAVFrame, - FrameOutput& frameOutput, - std::optional preAllocatedOutputTensor) { - TORCH_CHECK( - !preAllocatedOutputTensor.has_value(), - "pre-allocated audio tensor not supported yet."); - + FrameOutput& frameOutput) { AVSampleFormat sourceSampleFormat = static_cast(srcAVFrame->format); AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP; + int sourceSampleRate = srcAVFrame->sample_rate; + int desiredSampleRate = + streamInfos_[activeStreamIndex_].audioStreamOptions.sampleRate.value_or( + sourceSampleRate); + + bool mustConvert = + (sourceSampleFormat != desiredSampleFormat || + sourceSampleRate != desiredSampleRate); + UniqueAVFrame convertedAVFrame; - if (sourceSampleFormat != desiredSampleFormat) { - convertedAVFrame = convertAudioAVFrameSampleFormat( - srcAVFrame, sourceSampleFormat, desiredSampleFormat); + if (mustConvert) { + convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate( + srcAVFrame, + sourceSampleFormat, + desiredSampleFormat, + sourceSampleRate, + desiredSampleRate); } - const UniqueAVFrame& avFrame = (sourceSampleFormat != desiredSampleFormat) - ? convertedAVFrame - : srcAVFrame; + const UniqueAVFrame& avFrame = mustConvert ? 
convertedAVFrame : srcAVFrame; AVSampleFormat format = static_cast(avFrame->format); TORCH_CHECK( @@ -1419,23 +1433,25 @@ void VideoDecoder::convertAudioAVFrameToFrameOutputOnCPU( memcpy( outputChannelData, avFrame->extended_data[channel], numBytesPerChannel); } + frameOutput.data = outputData; } -UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormat( - const UniqueAVFrame& avFrame, +UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormatAndSampleRate( + const UniqueAVFrame& srcAVFrame, AVSampleFormat sourceSampleFormat, - AVSampleFormat desiredSampleFormat - -) { + AVSampleFormat desiredSampleFormat, + int sourceSampleRate, + int desiredSampleRate) { auto& streamInfo = streamInfos_[activeStreamIndex_]; - const auto& streamMetadata = - containerMetadata_.allStreamMetadata[activeStreamIndex_]; - int sampleRate = static_cast(streamMetadata.sampleRate.value()); if (!streamInfo.swrContext) { createSwrContext( - streamInfo, sampleRate, sourceSampleFormat, desiredSampleFormat); + streamInfo, + sourceSampleFormat, + desiredSampleFormat, + sourceSampleRate, + desiredSampleRate); } UniqueAVFrame convertedAVFrame(av_frame_alloc()); @@ -1443,10 +1459,27 @@ UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormat( convertedAVFrame, "Could not allocate frame for sample format conversion."); - setChannelLayout(convertedAVFrame, avFrame); + setChannelLayout(convertedAVFrame, srcAVFrame); convertedAVFrame->format = static_cast(desiredSampleFormat); - convertedAVFrame->sample_rate = avFrame->sample_rate; - convertedAVFrame->nb_samples = avFrame->nb_samples; + convertedAVFrame->sample_rate = desiredSampleRate; + if (sourceSampleRate != desiredSampleRate) { + // Note that this is an upper bound on the number of output samples. + // `swr_convert()` will likely not fill convertedAVFrame with that many + // samples if sample rate conversion is needed. It will buffer the last few + // ones because those require future samples. 
That's also why we reset + // nb_samples after the call to `swr_convert()`. + // We could also use `swr_get_out_samples()` to determine the number of + // output samples, but empirically `av_rescale_rnd()` seems to provide a + // tighter bound. + convertedAVFrame->nb_samples = av_rescale_rnd( + swr_get_delay(streamInfo.swrContext.get(), sourceSampleRate) + + srcAVFrame->nb_samples, + desiredSampleRate, + sourceSampleRate, + AV_ROUND_UP); + } else { + convertedAVFrame->nb_samples = srcAVFrame->nb_samples; + } auto status = av_frame_get_buffer(convertedAVFrame.get(), 0); TORCH_CHECK( @@ -1454,20 +1487,56 @@ UniqueAVFrame VideoDecoder::convertAudioAVFrameSampleFormat( "Could not allocate frame buffers for sample format conversion: ", getFFMPEGErrorStringFromErrorCode(status)); - auto numSampleConverted = swr_convert( + auto numConvertedSamples = swr_convert( streamInfo.swrContext.get(), convertedAVFrame->data, convertedAVFrame->nb_samples, - static_cast(const_cast(avFrame->data)), - avFrame->nb_samples); + static_cast( + const_cast(srcAVFrame->data)), + srcAVFrame->nb_samples); TORCH_CHECK( - numSampleConverted > 0, + numConvertedSamples > 0, "Error in swr_convert: ", - getFFMPEGErrorStringFromErrorCode(numSampleConverted)); + getFFMPEGErrorStringFromErrorCode(numConvertedSamples)); + + // See comment above about nb_samples + convertedAVFrame->nb_samples = numConvertedSamples; return convertedAVFrame; } +std::optional VideoDecoder::maybeFlushSwrBuffers() { + // When sample rate conversion is involved, swresample buffers some of the + // samples in-between calls to swr_convert (see the libswresample docs). + // That's because the last few samples in a given frame require future samples + // from the next frame to be properly converted. This function flushes out the + // samples that are stored in swresample's buffers. 
+ auto& streamInfo = streamInfos_[activeStreamIndex_]; + if (!streamInfo.swrContext) { + return std::nullopt; + } + auto numRemainingSamples = // this is an upper bound + swr_get_out_samples(streamInfo.swrContext.get(), 0); + + if (numRemainingSamples == 0) { + return std::nullopt; + } + + torch::Tensor lastSamples = torch::empty( + {getNumChannels(streamInfo.codecContext), numRemainingSamples}, + torch::kFloat32); + uint8_t* lastSamplesData = static_cast(lastSamples.data_ptr()); + + auto actualNumRemainingSamples = swr_convert( + streamInfo.swrContext.get(), + &lastSamplesData, + numRemainingSamples, + nullptr, + 0); + return lastSamples.narrow( + /*dim=*/1, /*start=*/0, /*length=*/actualNumRemainingSamples); +} + // -------------------------------------------------------------------------- // OUTPUT ALLOCATION AND SHAPE CONVERSION // -------------------------------------------------------------------------- @@ -1703,14 +1772,16 @@ void VideoDecoder::createSwsContext( void VideoDecoder::createSwrContext( StreamInfo& streamInfo, - int sampleRate, AVSampleFormat sourceSampleFormat, - AVSampleFormat desiredSampleFormat) { + AVSampleFormat desiredSampleFormat, + int sourceSampleRate, + int desiredSampleRate) { auto swrContext = allocateSwrContext( streamInfo.codecContext, - sampleRate, sourceSampleFormat, - desiredSampleFormat); + desiredSampleFormat, + sourceSampleRate, + desiredSampleRate); auto status = swr_init(swrContext); TORCH_CHECK( diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h index 8259a785..bc810952 100644 --- a/src/torchcodec/decoders/_core/VideoDecoder.h +++ b/src/torchcodec/decoders/_core/VideoDecoder.h @@ -139,10 +139,18 @@ class VideoDecoder { torch::Device device = torch::kCPU; }; + struct AudioStreamOptions { + AudioStreamOptions() {} + + std::optional sampleRate; + }; + void addVideoStream( int streamIndex, const VideoStreamOptions& videoStreamOptions = VideoStreamOptions()); - void 
addAudioStream(int streamIndex); + void addAudioStream( + int streamIndex, + const AudioStreamOptions& audioStreamOptions = AudioStreamOptions()); // -------------------------------------------------------------------------- // DECODING AND SEEKING APIs @@ -336,6 +344,7 @@ class VideoDecoder { int64_t lastDecodedAvFramePts = 0; int64_t lastDecodedAvFrameDuration = 0; VideoStreamOptions videoStreamOptions; + AudioStreamOptions audioStreamOptions; // color-conversion fields. Only one of FilterGraphContext and // UniqueSwsContext should be non-null. @@ -383,8 +392,7 @@ class VideoDecoder { void convertAudioAVFrameToFrameOutputOnCPU( UniqueAVFrame& srcAVFrame, - FrameOutput& frameOutput, - std::optional preAllocatedOutputTensor = std::nullopt); + FrameOutput& frameOutput); torch::Tensor convertAVFrameToTensorUsingFilterGraph( const UniqueAVFrame& avFrame); @@ -393,10 +401,14 @@ class VideoDecoder { const UniqueAVFrame& avFrame, torch::Tensor& outputTensor); - UniqueAVFrame convertAudioAVFrameSampleFormat( - const UniqueAVFrame& avFrame, + UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate( + const UniqueAVFrame& srcAVFrame, AVSampleFormat sourceSampleFormat, - AVSampleFormat desiredSampleFormat); + AVSampleFormat desiredSampleFormat, + int sourceSampleRate, + int desiredSampleRate); + + std::optional maybeFlushSwrBuffers(); // -------------------------------------------------------------------------- // COLOR CONVERSION LIBRARIES HANDLERS CREATION @@ -414,9 +426,10 @@ class VideoDecoder { void createSwrContext( StreamInfo& streamInfo, - int sampleRate, AVSampleFormat sourceSampleFormat, - AVSampleFormat desiredSampleFormat); + AVSampleFormat desiredSampleFormat, + int sourceSampleRate, + int desiredSampleRate); // -------------------------------------------------------------------------- // PTS <-> INDEX CONVERSIONS diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp index fffb1118..adbed7ca 100644 
--- a/src/torchcodec/decoders/_core/VideoDecoderOps.cpp +++ b/src/torchcodec/decoders/_core/VideoDecoderOps.cpp @@ -34,7 +34,7 @@ TORCH_LIBRARY(torchcodec_ns, m) { m.def( "add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None) -> ()"); m.def( - "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None) -> ()"); + "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None) -> ()"); m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()"); m.def("get_next_frame(Tensor(a!) decoder) -> (Tensor, Tensor, Tensor)"); m.def( @@ -220,9 +220,13 @@ void _add_video_stream( void add_audio_stream( at::Tensor& decoder, - std::optional stream_index) { + std::optional stream_index, + std::optional sample_rate) { + VideoDecoder::AudioStreamOptions audioStreamOptions; + audioStreamOptions.sampleRate = sample_rate; + auto videoDecoder = unwrapTensorToGetDecoder(decoder); - videoDecoder->addAudioStream(stream_index.value_or(-1)); + videoDecoder->addAudioStream(stream_index.value_or(-1), audioStreamOptions); } void seek_to_pts(at::Tensor& decoder, double seconds) { diff --git a/src/torchcodec/decoders/_core/VideoDecoderOps.h b/src/torchcodec/decoders/_core/VideoDecoderOps.h index a77dec66..bc7f2036 100644 --- a/src/torchcodec/decoders/_core/VideoDecoderOps.h +++ b/src/torchcodec/decoders/_core/VideoDecoderOps.h @@ -50,7 +50,8 @@ void _add_video_stream( void add_audio_stream( at::Tensor& decoder, - std::optional stream_index = std::nullopt); + std::optional stream_index = std::nullopt, + std::optional sample_rate = std::nullopt); // Seek to a particular presentation timestamp in the video in seconds. 
void seek_to_pts(at::Tensor& decoder, double seconds); diff --git a/test/decoders/test_decoders.py b/test/decoders/test_decoders.py index a2d70f51..6885b51a 100644 --- a/test/decoders/test_decoders.py +++ b/test/decoders/test_decoders.py @@ -27,6 +27,8 @@ NASA_VIDEO, SINE_MONO_S16, SINE_MONO_S32, + SINE_MONO_S32_44100, + SINE_MONO_S32_8000, ) @@ -1090,6 +1092,71 @@ def test_format_conversion(self): reference_frames = asset.get_frame_data_by_range(start=0, stop=asset.num_frames) torch.testing.assert_close(all_samples.data, reference_frames) + @pytest.mark.parametrize( + "start_seconds, stop_seconds", + ( + (0, None), + (0, 4), + (0, 3), + (2, None), + (2, 3), + ), + ) + def test_sample_rate_conversion(self, start_seconds, stop_seconds): + # When start_seconds is not exactly 0, we have to increase the tolerance + # a bit. This is because sample_rate conversion relies on a sliding + # window of samples: if we start decoding a stream in the middle, the + # first few samples we're decoding aren't able to take advantage of the + # preceding samples for sample-rate conversion. This leads to a + # slightly different sample-rate conversion than we would otherwise get, + # had we started the stream from the beginning. 
+ atol = 1e-6 if start_seconds == 0 else 1e-2 + rtol = 1e-6 + + # Upsample + decoder = AudioDecoder(SINE_MONO_S32_44100.path) + assert decoder.metadata.sample_rate == 44_100 + frames_44100_native = decoder.get_samples_played_in_range( + start_seconds=start_seconds, stop_seconds=stop_seconds + ) + assert frames_44100_native.sample_rate == 44_100 + + decoder = AudioDecoder(SINE_MONO_S32.path, sample_rate=44_100) + frames_upsampled_to_44100 = decoder.get_samples_played_in_range( + start_seconds=start_seconds, stop_seconds=stop_seconds + ) + assert decoder.metadata.sample_rate == 16_000 + assert frames_upsampled_to_44100.sample_rate == 44_100 + + torch.testing.assert_close( + frames_upsampled_to_44100.data, + frames_44100_native.data, + atol=atol, + rtol=rtol, + ) + + # Downsample + decoder = AudioDecoder(SINE_MONO_S32_8000.path) + assert decoder.metadata.sample_rate == 8000 + frames_8000_native = decoder.get_samples_played_in_range( + start_seconds=start_seconds, stop_seconds=stop_seconds + ) + assert frames_8000_native.sample_rate == 8000 + + decoder = AudioDecoder(SINE_MONO_S32.path, sample_rate=8000) + frames_downsampled_to_8000 = decoder.get_samples_played_in_range( + start_seconds=start_seconds, stop_seconds=stop_seconds + ) + assert decoder.metadata.sample_rate == 16_000 + assert frames_downsampled_to_8000.sample_rate == 8000 + + torch.testing.assert_close( + frames_downsampled_to_8000.data, + frames_8000_native.data, + atol=atol, + rtol=rtol, + ) + def test_s16_ffmpeg4_bug(self): # s16 fails on FFmpeg4 but can be decoded on other versions. 
# Debugging logs show that we're hitting: diff --git a/test/decoders/test_ops.py b/test/decoders/test_ops.py index 9088539f..264417c3 100644 --- a/test/decoders/test_ops.py +++ b/test/decoders/test_ops.py @@ -44,6 +44,9 @@ NASA_AUDIO_MP3, NASA_VIDEO, needs_cuda, + SINE_MONO_S32, + SINE_MONO_S32_44100, + SINE_MONO_S32_8000, ) torch._dynamo.config.capture_dynamic_output_shape_ops = True @@ -880,6 +883,33 @@ def test_decode_before_frame_start(self): # TODO fix this. `frames` should be empty. torch.testing.assert_close(frames, all_frames) + def test_sample_rate_conversion(self): + def get_all_frames(asset, sample_rate=None, stop_seconds=None): + decoder = create_from_file(str(asset.path), seek_mode="approximate") + add_audio_stream(decoder, sample_rate=sample_rate) + frames, *_ = get_frames_by_pts_in_range_audio( + decoder, start_seconds=0, stop_seconds=stop_seconds + ) + return frames + + # Upsample + assert SINE_MONO_S32_44100.sample_rate == 44_100 + frames_44100_native = get_all_frames(SINE_MONO_S32_44100) + + assert SINE_MONO_S32.sample_rate == 16_000 + frames_upsampled_to_44100 = get_all_frames(SINE_MONO_S32, sample_rate=44_100) + + torch.testing.assert_close(frames_upsampled_to_44100, frames_44100_native) + + # Downsample + assert SINE_MONO_S32_8000.sample_rate == 8000 + frames_8000_native = get_all_frames(SINE_MONO_S32_8000) + + assert SINE_MONO_S32.sample_rate == 16_000 + frames_downsampled_to_8000 = get_all_frames(SINE_MONO_S32, sample_rate=8000) + + torch.testing.assert_close(frames_downsampled_to_8000, frames_8000_native) + if __name__ == "__main__": pytest.main() diff --git a/test/resources/sine_mono_s32_44100.wav b/test/resources/sine_mono_s32_44100.wav new file mode 100644 index 00000000..3d4162d2 Binary files /dev/null and b/test/resources/sine_mono_s32_44100.wav differ diff --git a/test/resources/sine_mono_s32_44100.wav.stream0.all_frames_info.json b/test/resources/sine_mono_s32_44100.wav.stream0.all_frames_info.json new file mode 100644 index 
00000000..f70551f1 --- /dev/null +++ b/test/resources/sine_mono_s32_44100.wav.stream0.all_frames_info.json @@ -0,0 +1,694 @@ +[ + { + "duration_time": "0.023220", + "pts_time": "0.000000" + }, + { + "duration_time": "0.023220", + "pts_time": "0.023220" + }, + { + "duration_time": "0.023220", + "pts_time": "0.046440" + }, + { + "duration_time": "0.023220", + "pts_time": "0.069660" + }, + { + "duration_time": "0.023220", + "pts_time": "0.092880" + }, + { + "duration_time": "0.023220", + "pts_time": "0.116100" + }, + { + "duration_time": "0.023220", + "pts_time": "0.139320" + }, + { + "duration_time": "0.023220", + "pts_time": "0.162540" + }, + { + "duration_time": "0.023220", + "pts_time": "0.185760" + }, + { + "duration_time": "0.023220", + "pts_time": "0.208980" + }, + { + "duration_time": "0.023220", + "pts_time": "0.232200" + }, + { + "duration_time": "0.023220", + "pts_time": "0.255420" + }, + { + "duration_time": "0.023220", + "pts_time": "0.278639" + }, + { + "duration_time": "0.023220", + "pts_time": "0.301859" + }, + { + "duration_time": "0.023220", + "pts_time": "0.325079" + }, + { + "duration_time": "0.023220", + "pts_time": "0.348299" + }, + { + "duration_time": "0.023220", + "pts_time": "0.371519" + }, + { + "duration_time": "0.023220", + "pts_time": "0.394739" + }, + { + "duration_time": "0.023220", + "pts_time": "0.417959" + }, + { + "duration_time": "0.023220", + "pts_time": "0.441179" + }, + { + "duration_time": "0.023220", + "pts_time": "0.464399" + }, + { + "duration_time": "0.023220", + "pts_time": "0.487619" + }, + { + "duration_time": "0.023220", + "pts_time": "0.510839" + }, + { + "duration_time": "0.023220", + "pts_time": "0.534059" + }, + { + "duration_time": "0.023220", + "pts_time": "0.557279" + }, + { + "duration_time": "0.023220", + "pts_time": "0.580499" + }, + { + "duration_time": "0.023220", + "pts_time": "0.603719" + }, + { + "duration_time": "0.023220", + "pts_time": "0.626939" + }, + { + "duration_time": "0.023220", + "pts_time": 
"0.650159" + }, + { + "duration_time": "0.023220", + "pts_time": "0.673379" + }, + { + "duration_time": "0.023220", + "pts_time": "0.696599" + }, + { + "duration_time": "0.023220", + "pts_time": "0.719819" + }, + { + "duration_time": "0.023220", + "pts_time": "0.743039" + }, + { + "duration_time": "0.023220", + "pts_time": "0.766259" + }, + { + "duration_time": "0.023220", + "pts_time": "0.789478" + }, + { + "duration_time": "0.023220", + "pts_time": "0.812698" + }, + { + "duration_time": "0.023220", + "pts_time": "0.835918" + }, + { + "duration_time": "0.023220", + "pts_time": "0.859138" + }, + { + "duration_time": "0.023220", + "pts_time": "0.882358" + }, + { + "duration_time": "0.023220", + "pts_time": "0.905578" + }, + { + "duration_time": "0.023220", + "pts_time": "0.928798" + }, + { + "duration_time": "0.023220", + "pts_time": "0.952018" + }, + { + "duration_time": "0.023220", + "pts_time": "0.975238" + }, + { + "duration_time": "0.023220", + "pts_time": "0.998458" + }, + { + "duration_time": "0.023220", + "pts_time": "1.021678" + }, + { + "duration_time": "0.023220", + "pts_time": "1.044898" + }, + { + "duration_time": "0.023220", + "pts_time": "1.068118" + }, + { + "duration_time": "0.023220", + "pts_time": "1.091338" + }, + { + "duration_time": "0.023220", + "pts_time": "1.114558" + }, + { + "duration_time": "0.023220", + "pts_time": "1.137778" + }, + { + "duration_time": "0.023220", + "pts_time": "1.160998" + }, + { + "duration_time": "0.023220", + "pts_time": "1.184218" + }, + { + "duration_time": "0.023220", + "pts_time": "1.207438" + }, + { + "duration_time": "0.023220", + "pts_time": "1.230658" + }, + { + "duration_time": "0.023220", + "pts_time": "1.253878" + }, + { + "duration_time": "0.023220", + "pts_time": "1.277098" + }, + { + "duration_time": "0.023220", + "pts_time": "1.300317" + }, + { + "duration_time": "0.023220", + "pts_time": "1.323537" + }, + { + "duration_time": "0.023220", + "pts_time": "1.346757" + }, + { + "duration_time": 
"0.023220", + "pts_time": "1.369977" + }, + { + "duration_time": "0.023220", + "pts_time": "1.393197" + }, + { + "duration_time": "0.023220", + "pts_time": "1.416417" + }, + { + "duration_time": "0.023220", + "pts_time": "1.439637" + }, + { + "duration_time": "0.023220", + "pts_time": "1.462857" + }, + { + "duration_time": "0.023220", + "pts_time": "1.486077" + }, + { + "duration_time": "0.023220", + "pts_time": "1.509297" + }, + { + "duration_time": "0.023220", + "pts_time": "1.532517" + }, + { + "duration_time": "0.023220", + "pts_time": "1.555737" + }, + { + "duration_time": "0.023220", + "pts_time": "1.578957" + }, + { + "duration_time": "0.023220", + "pts_time": "1.602177" + }, + { + "duration_time": "0.023220", + "pts_time": "1.625397" + }, + { + "duration_time": "0.023220", + "pts_time": "1.648617" + }, + { + "duration_time": "0.023220", + "pts_time": "1.671837" + }, + { + "duration_time": "0.023220", + "pts_time": "1.695057" + }, + { + "duration_time": "0.023220", + "pts_time": "1.718277" + }, + { + "duration_time": "0.023220", + "pts_time": "1.741497" + }, + { + "duration_time": "0.023220", + "pts_time": "1.764717" + }, + { + "duration_time": "0.023220", + "pts_time": "1.787937" + }, + { + "duration_time": "0.023220", + "pts_time": "1.811156" + }, + { + "duration_time": "0.023220", + "pts_time": "1.834376" + }, + { + "duration_time": "0.023220", + "pts_time": "1.857596" + }, + { + "duration_time": "0.023220", + "pts_time": "1.880816" + }, + { + "duration_time": "0.023220", + "pts_time": "1.904036" + }, + { + "duration_time": "0.023220", + "pts_time": "1.927256" + }, + { + "duration_time": "0.023220", + "pts_time": "1.950476" + }, + { + "duration_time": "0.023220", + "pts_time": "1.973696" + }, + { + "duration_time": "0.023220", + "pts_time": "1.996916" + }, + { + "duration_time": "0.023220", + "pts_time": "2.020136" + }, + { + "duration_time": "0.023220", + "pts_time": "2.043356" + }, + { + "duration_time": "0.023220", + "pts_time": "2.066576" + }, + { + 
"duration_time": "0.023220", + "pts_time": "2.089796" + }, + { + "duration_time": "0.023220", + "pts_time": "2.113016" + }, + { + "duration_time": "0.023220", + "pts_time": "2.136236" + }, + { + "duration_time": "0.023220", + "pts_time": "2.159456" + }, + { + "duration_time": "0.023220", + "pts_time": "2.182676" + }, + { + "duration_time": "0.023220", + "pts_time": "2.205896" + }, + { + "duration_time": "0.023220", + "pts_time": "2.229116" + }, + { + "duration_time": "0.023220", + "pts_time": "2.252336" + }, + { + "duration_time": "0.023220", + "pts_time": "2.275556" + }, + { + "duration_time": "0.023220", + "pts_time": "2.298776" + }, + { + "duration_time": "0.023220", + "pts_time": "2.321995" + }, + { + "duration_time": "0.023220", + "pts_time": "2.345215" + }, + { + "duration_time": "0.023220", + "pts_time": "2.368435" + }, + { + "duration_time": "0.023220", + "pts_time": "2.391655" + }, + { + "duration_time": "0.023220", + "pts_time": "2.414875" + }, + { + "duration_time": "0.023220", + "pts_time": "2.438095" + }, + { + "duration_time": "0.023220", + "pts_time": "2.461315" + }, + { + "duration_time": "0.023220", + "pts_time": "2.484535" + }, + { + "duration_time": "0.023220", + "pts_time": "2.507755" + }, + { + "duration_time": "0.023220", + "pts_time": "2.530975" + }, + { + "duration_time": "0.023220", + "pts_time": "2.554195" + }, + { + "duration_time": "0.023220", + "pts_time": "2.577415" + }, + { + "duration_time": "0.023220", + "pts_time": "2.600635" + }, + { + "duration_time": "0.023220", + "pts_time": "2.623855" + }, + { + "duration_time": "0.023220", + "pts_time": "2.647075" + }, + { + "duration_time": "0.023220", + "pts_time": "2.670295" + }, + { + "duration_time": "0.023220", + "pts_time": "2.693515" + }, + { + "duration_time": "0.023220", + "pts_time": "2.716735" + }, + { + "duration_time": "0.023220", + "pts_time": "2.739955" + }, + { + "duration_time": "0.023220", + "pts_time": "2.763175" + }, + { + "duration_time": "0.023220", + "pts_time": 
"2.786395" + }, + { + "duration_time": "0.023220", + "pts_time": "2.809615" + }, + { + "duration_time": "0.023220", + "pts_time": "2.832834" + }, + { + "duration_time": "0.023220", + "pts_time": "2.856054" + }, + { + "duration_time": "0.023220", + "pts_time": "2.879274" + }, + { + "duration_time": "0.023220", + "pts_time": "2.902494" + }, + { + "duration_time": "0.023220", + "pts_time": "2.925714" + }, + { + "duration_time": "0.023220", + "pts_time": "2.948934" + }, + { + "duration_time": "0.023220", + "pts_time": "2.972154" + }, + { + "duration_time": "0.023220", + "pts_time": "2.995374" + }, + { + "duration_time": "0.023220", + "pts_time": "3.018594" + }, + { + "duration_time": "0.023220", + "pts_time": "3.041814" + }, + { + "duration_time": "0.023220", + "pts_time": "3.065034" + }, + { + "duration_time": "0.023220", + "pts_time": "3.088254" + }, + { + "duration_time": "0.023220", + "pts_time": "3.111474" + }, + { + "duration_time": "0.023220", + "pts_time": "3.134694" + }, + { + "duration_time": "0.023220", + "pts_time": "3.157914" + }, + { + "duration_time": "0.023220", + "pts_time": "3.181134" + }, + { + "duration_time": "0.023220", + "pts_time": "3.204354" + }, + { + "duration_time": "0.023220", + "pts_time": "3.227574" + }, + { + "duration_time": "0.023220", + "pts_time": "3.250794" + }, + { + "duration_time": "0.023220", + "pts_time": "3.274014" + }, + { + "duration_time": "0.023220", + "pts_time": "3.297234" + }, + { + "duration_time": "0.023220", + "pts_time": "3.320454" + }, + { + "duration_time": "0.023220", + "pts_time": "3.343673" + }, + { + "duration_time": "0.023220", + "pts_time": "3.366893" + }, + { + "duration_time": "0.023220", + "pts_time": "3.390113" + }, + { + "duration_time": "0.023220", + "pts_time": "3.413333" + }, + { + "duration_time": "0.023220", + "pts_time": "3.436553" + }, + { + "duration_time": "0.023220", + "pts_time": "3.459773" + }, + { + "duration_time": "0.023220", + "pts_time": "3.482993" + }, + { + "duration_time": 
"0.023220", + "pts_time": "3.506213" + }, + { + "duration_time": "0.023220", + "pts_time": "3.529433" + }, + { + "duration_time": "0.023220", + "pts_time": "3.552653" + }, + { + "duration_time": "0.023220", + "pts_time": "3.575873" + }, + { + "duration_time": "0.023220", + "pts_time": "3.599093" + }, + { + "duration_time": "0.023220", + "pts_time": "3.622313" + }, + { + "duration_time": "0.023220", + "pts_time": "3.645533" + }, + { + "duration_time": "0.023220", + "pts_time": "3.668753" + }, + { + "duration_time": "0.023220", + "pts_time": "3.691973" + }, + { + "duration_time": "0.023220", + "pts_time": "3.715193" + }, + { + "duration_time": "0.023220", + "pts_time": "3.738413" + }, + { + "duration_time": "0.023220", + "pts_time": "3.761633" + }, + { + "duration_time": "0.023220", + "pts_time": "3.784853" + }, + { + "duration_time": "0.023220", + "pts_time": "3.808073" + }, + { + "duration_time": "0.023220", + "pts_time": "3.831293" + }, + { + "duration_time": "0.023220", + "pts_time": "3.854512" + }, + { + "duration_time": "0.023220", + "pts_time": "3.877732" + }, + { + "duration_time": "0.023220", + "pts_time": "3.900952" + }, + { + "duration_time": "0.023220", + "pts_time": "3.924172" + }, + { + "duration_time": "0.023220", + "pts_time": "3.947392" + }, + { + "duration_time": "0.023220", + "pts_time": "3.970612" + }, + { + "duration_time": "0.006168", + "pts_time": "3.993832" + } +] diff --git a/test/resources/sine_mono_s32_8000.wav b/test/resources/sine_mono_s32_8000.wav new file mode 100644 index 00000000..ccc5e671 Binary files /dev/null and b/test/resources/sine_mono_s32_8000.wav differ diff --git a/test/resources/sine_mono_s32_8000.wav.stream0.all_frames_info.json b/test/resources/sine_mono_s32_8000.wav.stream0.all_frames_info.json new file mode 100644 index 00000000..689def1e --- /dev/null +++ b/test/resources/sine_mono_s32_8000.wav.stream0.all_frames_info.json @@ -0,0 +1,130 @@ +[ + { + "duration_time": "0.128000", + "pts_time": "0.000000" + }, + { + 
"duration_time": "0.128000", + "pts_time": "0.128000" + }, + { + "duration_time": "0.128000", + "pts_time": "0.256000" + }, + { + "duration_time": "0.128000", + "pts_time": "0.384000" + }, + { + "duration_time": "0.128000", + "pts_time": "0.512000" + }, + { + "duration_time": "0.128000", + "pts_time": "0.640000" + }, + { + "duration_time": "0.128000", + "pts_time": "0.768000" + }, + { + "duration_time": "0.128000", + "pts_time": "0.896000" + }, + { + "duration_time": "0.128000", + "pts_time": "1.024000" + }, + { + "duration_time": "0.128000", + "pts_time": "1.152000" + }, + { + "duration_time": "0.128000", + "pts_time": "1.280000" + }, + { + "duration_time": "0.128000", + "pts_time": "1.408000" + }, + { + "duration_time": "0.128000", + "pts_time": "1.536000" + }, + { + "duration_time": "0.128000", + "pts_time": "1.664000" + }, + { + "duration_time": "0.128000", + "pts_time": "1.792000" + }, + { + "duration_time": "0.128000", + "pts_time": "1.920000" + }, + { + "duration_time": "0.128000", + "pts_time": "2.048000" + }, + { + "duration_time": "0.128000", + "pts_time": "2.176000" + }, + { + "duration_time": "0.128000", + "pts_time": "2.304000" + }, + { + "duration_time": "0.128000", + "pts_time": "2.432000" + }, + { + "duration_time": "0.128000", + "pts_time": "2.560000" + }, + { + "duration_time": "0.128000", + "pts_time": "2.688000" + }, + { + "duration_time": "0.128000", + "pts_time": "2.816000" + }, + { + "duration_time": "0.128000", + "pts_time": "2.944000" + }, + { + "duration_time": "0.128000", + "pts_time": "3.072000" + }, + { + "duration_time": "0.128000", + "pts_time": "3.200000" + }, + { + "duration_time": "0.128000", + "pts_time": "3.328000" + }, + { + "duration_time": "0.128000", + "pts_time": "3.456000" + }, + { + "duration_time": "0.128000", + "pts_time": "3.584000" + }, + { + "duration_time": "0.128000", + "pts_time": "3.712000" + }, + { + "duration_time": "0.128000", + "pts_time": "3.840000" + }, + { + "duration_time": "0.032000", + "pts_time": 
"3.968000" + } +] diff --git a/test/utils.py b/test/utils.py index efee0deb..d1cdd47d 100644 --- a/test/utils.py +++ b/test/utils.py @@ -462,6 +462,39 @@ def sample_format(self) -> str: }, ) +# This file is an upsampled version of SINE_MONO_S32, generated with: +# ffmpeg -i test/resources/sine_mono_s32.wav -ar 44100 -c:a pcm_s32le test/resources/sine_mono_s32_44100.wav +SINE_MONO_S32_44100 = TestAudio( + filename="sine_mono_s32_44100.wav", + default_stream_index=0, + frames={}, # Automatically loaded from json file + stream_infos={ + 0: TestAudioStreamInfo( + sample_rate=44_100, + num_channels=1, + duration_seconds=4, + num_frames=173, + sample_format="s32", + ) + }, +) + +# This file is a downsampled version of SINE_MONO_S32, generated with: +# ffmpeg -i test/resources/sine_mono_s32.wav -ar 8000 -c:a pcm_s32le test/resources/sine_mono_s32_8000.wav +SINE_MONO_S32_8000 = TestAudio( + filename="sine_mono_s32_8000.wav", + default_stream_index=0, + frames={}, # Automatically loaded from json file + stream_infos={ + 0: TestAudioStreamInfo( + sample_rate=8000, + num_channels=1, + duration_seconds=4, + num_frames=32, + sample_format="s32", + ) + }, +) # Same sample rate as SINE_MONO_S32, but encoded as s16 instead of s32. Generated with: # ffmpeg -i test/resources/sine_mono_s32.wav -ar 16000 -c:a pcm_s16le test/resources/sine_mono_s16.wav