diff --git a/Cargo.toml b/Cargo.toml index 7de4ccce74..cddc892753 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,6 @@ binaries = [ "fern", "console", "av-metrics", - "nom", ] default = ["binaries", "asm", "threading", "signal_support"] asm = ["nasm-rs", "cc", "regex"] @@ -100,11 +99,18 @@ wasm-bindgen = { version = "0.2.63", optional = true } rust_hawktracer = "0.7.0" arrayref = "0.3.6" const_fn_assert = "0.1.2" -nom = { version = "7.0.0", optional = true } +# `unreachable!` macro which panics in debug mode +# and optimizes away in release mode new_debug_unreachable = "1.0.4" once_cell = "1.13.0" av1-grain = { version = "0.2.0", features = ["serialize"] } serde-big-array = { version = "0.4.1", optional = true } +# Used for parsing film grain table files +nom = "7.0.0" +# Used as a data holder during denoising +ndarray = "0.15.4" +# Used for running FFTs during denoising +ndrustfft = "0.3.0" [dependencies.image] version = "0.24.3" diff --git a/clippy.toml b/clippy.toml index f26dd286a7..2b357613e4 100644 --- a/clippy.toml +++ b/clippy.toml @@ -1,4 +1,5 @@ too-many-arguments-threshold = 16 cognitive-complexity-threshold = 40 trivial-copy-size-limit = 16 # 128-bits = 2 64-bit registers +doc-valid-idents = ["DFTTest"] msrv = "1.59" diff --git a/src/api/config/encoder.rs b/src/api/config/encoder.rs index 7f84d5a081..c36bbda0f7 100644 --- a/src/api/config/encoder.rs +++ b/src/api/config/encoder.rs @@ -85,6 +85,8 @@ pub struct EncoderConfig { pub tune: Tune, /// Parameters for grain synthesis. pub film_grain_params: Option>, + /// Strength of denoising, 0 = disabled + pub denoise_strength: u8, /// Number of tiles horizontally. Must be a power of two. /// /// Overridden by [`tiles`], if present. @@ -159,6 +161,7 @@ impl EncoderConfig { bitrate: 0, tune: Tune::default(), film_grain_params: None, + denoise_strength: 0, tile_cols: 0, tile_rows: 0, tiles: 0, diff --git a/src/api/internal.rs b/src/api/internal.rs index 169a9c96ed..e9d0743c5b 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -15,6 +15,7 @@ use crate::api::{ }; use crate::color::ChromaSampling::Cs400; use crate::cpu_features::CpuFeatureLevel; +use crate::denoise::{DftDenoiser, TB_MIDPOINT}; use crate::dist::get_satd; use crate::encoder::*; use crate::frame::*; @@ -220,7 +221,7 @@ impl FrameData { } } -type FrameQueue = BTreeMap>>>; +pub(crate) type FrameQueue = BTreeMap>>>; type FrameDataQueue = BTreeMap>>; // the fields pub(super) are accessed only by the tests @@ -248,6 +249,7 @@ pub(crate) struct ContextInner { /// Maps `output_frameno` to `gop_input_frameno_start`. pub(crate) gop_input_frameno_start: BTreeMap, keyframe_detector: SceneChangeDetector, + denoiser: Option>, pub(crate) config: Arc, seq: Arc, pub(crate) rc_state: RCState, @@ -295,6 +297,17 @@ impl ContextInner { lookahead_distance, seq.clone(), ), + denoiser: if enc.denoise_strength > 0 { + Some(DftDenoiser::new( + enc.denoise_strength as f32 / 10.0, + enc.width, + enc.height, + enc.bit_depth as u8, + enc.chroma_sampling, + )) + } else { + None + }, config: Arc::new(enc.clone()), seq, rc_state: RCState::new( @@ -359,6 +372,25 @@ impl ContextInner { self.t35_q.insert(input_frameno, params.t35_metadata); } + // If denoising is enabled, run it now because we want the entire + // encoding process, including lookahead, to see the denoised frame. + if let Some(ref mut denoiser) = self.denoiser { + loop { + let denoiser_frame = denoiser.cur_frameno; + if (!is_flushing + && input_frameno >= denoiser_frame + TB_MIDPOINT as u64) + || (is_flushing && Some(denoiser_frame) < self.limit) + { + self.frame_q.insert( + denoiser_frame, + Some(Arc::new(denoiser.filter_frame(&self.frame_q).unwrap())), + ); + } else { + break; + } + } + } + if !self.needs_more_frame_q_lookahead(self.next_lookahead_frame) { let lookahead_frames = self .frame_q diff --git a/src/api/test.rs b/src/api/test.rs index 072562631f..3d167fdfc5 100644 --- a/src/api/test.rs +++ b/src/api/test.rs @@ -2131,6 +2131,7 @@ fn log_q_exp_overflow() { tile_cols: 0, tile_rows: 0, tiles: 0, + denoise_strength: 0, speed_settings: SpeedSettings { multiref: false, fast_deblock: true, @@ -2207,6 +2208,7 @@ fn guess_frame_subtypes_assert() { tile_cols: 0, tile_rows: 0, tiles: 0, + denoise_strength: 0, speed_settings: SpeedSettings { multiref: false, fast_deblock: true, diff --git a/src/bin/common.rs b/src/bin/common.rs index 18cb05d388..7b21a2ef6b 100644 --- a/src/bin/common.rs +++ b/src/bin/common.rs @@ -176,7 +176,6 @@ pub struct CliOptions { pub still_picture: bool, /// Uses grain synthesis to add photon noise to the resulting encode. /// Takes a strength value 0-64. - #[cfg(feature = "unstable")] #[clap( long, conflicts_with = "film-grain-table", @@ -185,6 +184,17 @@ pub struct CliOptions { help_heading = "ENCODE SETTINGS" )] pub photon_noise: u8, + /// Enable spatio-temporal denoising, intended to be used with grain synthesis. + /// Takes a strength value 0-50. + /// + /// Default strength is 1/2 of photon noise strength, + /// or 4 if a photon noise table is specified. + #[clap( + long, + value_parser = clap::value_parser!(u8).range(0..=50), + help_heading = "ENCODE SETTINGS" + )] + pub denoise: Option, /// Uses a film grain table file to apply grain synthesis to the encode. /// Uses the same table file format as aomenc and svt-av1. #[clap( @@ -334,7 +344,6 @@ pub struct ParsedCliOptions { pub save_config: Option, #[cfg(feature = "unstable")] pub slots: usize, - #[cfg(feature = "unstable")] pub generate_grain_strength: u8, } @@ -482,7 +491,6 @@ pub fn parse_cli() -> Result { save_config: save_config_path, #[cfg(feature = "unstable")] slots, - #[cfg(feature = "unstable")] generate_grain_strength: matches.photon_noise, }) } @@ -676,7 +684,19 @@ fn parse_config(matches: &CliOptions) -> Result { .expect("Failed to parse film grain table"); if !table.is_empty() { cfg.film_grain_params = Some(table); + cfg.denoise_strength = 4; + } + } else if matches.photon_noise > 0 { + cfg.denoise_strength = matches.photon_noise / 2; + // We have to know the video resolution before we can generate a table, + // so we must handle that elsewhere. + } + // A user set denoise strength overrides the defaults above + if let Some(denoise_str) = matches.denoise { + if denoise_str > 50 { + panic!("Denoising strength must be between 0-50"); } + cfg.denoise_strength = denoise_str; } if let Some(frame_rate) = matches.frame_rate { diff --git a/src/bin/rav1e-ch.rs b/src/bin/rav1e-ch.rs index e0221fbdd9..71d559995b 100644 --- a/src/bin/rav1e-ch.rs +++ b/src/bin/rav1e-ch.rs @@ -473,7 +473,6 @@ fn run() -> Result<(), error::CliError> { cli.enc.time_base = video_info.time_base; } - #[cfg(feature = "unstable")] if cli.generate_grain_strength > 0 && cli.enc.film_grain_params.is_none() { cli.enc.film_grain_params = Some(vec![generate_photon_noise_params( 0, diff --git a/src/bin/rav1e.rs b/src/bin/rav1e.rs index 8d672736d8..2390ffb0bb 100644 --- a/src/bin/rav1e.rs +++ b/src/bin/rav1e.rs @@ -456,7 +456,6 @@ fn run() -> Result<(), error::CliError> { cli.enc.time_base = video_info.time_base; } - #[cfg(feature = "unstable")] if cli.generate_grain_strength > 0 && cli.enc.film_grain_params.is_none() { cli.enc.film_grain_params = Some(vec![generate_photon_noise_params( 0, diff --git a/src/denoise.rs b/src/denoise.rs new file mode 100644 index 0000000000..93cfb499ae --- /dev/null +++ b/src/denoise.rs @@ -0,0 +1,587 @@ +use crate::api::FrameQueue; +use crate::util::Aligned; +use crate::EncoderStatus; +use arrayvec::ArrayVec; +use ndarray::{Array3, ArrayView3, ArrayViewMut3}; +use ndrustfft::{ + ndfft, ndfft_r2c, ndifft, ndifft_r2c, Complex, FftHandler, R2cFftHandler, +}; +use std::collections::{BTreeMap, VecDeque}; +use std::f64::consts::PI; +use std::iter::once; +use std::mem::size_of; +use std::ptr::copy_nonoverlapping; +use std::sync::Arc; +use v_frame::frame::Frame; +use v_frame::math::clamp; +use v_frame::pixel::{CastFromPrimitive, ChromaSampling, Pixel}; +use v_frame::plane::Plane; + +const SB_SIZE: usize = 16; +const SO_SIZE: usize = 12; +const TB_SIZE: usize = 3; +pub(crate) const TB_MIDPOINT: usize = TB_SIZE / 2; +const BLOCK_AREA: usize = SB_SIZE * SB_SIZE; +const BLOCK_VOLUME: usize = BLOCK_AREA * TB_SIZE; +const COMPLEX_COUNT: usize = (SB_SIZE / 2 + 1) * SB_SIZE * TB_SIZE; +const CCNT2: usize = COMPLEX_COUNT * 2; +const INC: usize = SB_SIZE - SO_SIZE; + +/// This denoiser is based on the DFTTest plugin from Vapoursynth. +/// This type of denoising was chosen because it provides +/// high quality while not being too slow. +pub(crate) struct DftDenoiser +where + T: Pixel, +{ + chroma_sampling: ChromaSampling, + dest_scale: f32, + src_scale: f32, + peak: T, + + // These indices refer to planes of the input + pad_dimensions: ArrayVec<(usize, usize), 3>, + effective_heights: ArrayVec, + + hw: Aligned<[f32; BLOCK_VOLUME]>, + dftgc: Aligned<[Complex; COMPLEX_COUNT]>, + fft: (R2cFftHandler, FftHandler, FftHandler), + sigmas: Aligned<[f32; CCNT2]>, + + // This stores a copy of the unfiltered previous frame, + // since in `frame_q` it will be filtered already. + // We only have one frame, but it's left as a Vec so that + // TB_SIZE could potentially be tweaked without any + // code changes. + frame_buffer: VecDeque>>, + pub(crate) cur_frameno: u64, +} + +impl DftDenoiser +where + T: Pixel, +{ + // This should only need to run once per video. + pub fn new( + sigma: f32, width: usize, height: usize, bit_depth: u8, + chroma_sampling: ChromaSampling, + ) -> Self { + if size_of::() == 1 { + assert!(bit_depth <= 8); + } else { + assert!(bit_depth > 8); + } + + let dest_scale = (1 << (bit_depth - 8)) as f32; + let src_scale = 1.0 / dest_scale; + let peak = T::cast_from((1u16 << bit_depth) - 1); + + let mut pad_dimensions = ArrayVec::<_, 3>::new(); + let mut effective_heights = ArrayVec::<_, 3>::new(); + for plane in 0..3 { + let ae = (SB_SIZE - SO_SIZE).max(SO_SIZE) * 2; + let (width, height) = if plane == 0 { + (width, height) + } else { + chroma_sampling.get_chroma_dimensions(width, height) + }; + let pad_w = width + extra(width, SB_SIZE) + ae; + let pad_h = height + extra(height, SB_SIZE) + ae; + let e_h = + ((pad_h - SO_SIZE) / (SB_SIZE - SO_SIZE)) * (SB_SIZE - SO_SIZE); + pad_dimensions.push((pad_w, pad_h)); + effective_heights.push(e_h); + } + + let hw = Aligned::new(Self::create_window()); + let mut dftgr = Aligned::new([0f32; BLOCK_VOLUME]); + + let fft = ( + R2cFftHandler::new(SB_SIZE), + FftHandler::new(SB_SIZE), + FftHandler::new(TB_SIZE), + ); + + let mut wscale = 0.0f32; + for k in 0..BLOCK_VOLUME { + dftgr[k] = 255.0 * hw[k]; + wscale += hw[k].powi(2); + } + let wscale = 1.0 / wscale; + + let mut sigmas = Aligned::new([0f32; CCNT2]); + sigmas.fill(sigma / wscale); + + let mut denoiser = DftDenoiser { + chroma_sampling, + dest_scale, + src_scale, + peak, + pad_dimensions, + effective_heights, + hw, + fft, + sigmas, + dftgc: Aligned::new([Complex::default(); COMPLEX_COUNT]), + frame_buffer: VecDeque::with_capacity(TB_MIDPOINT), + cur_frameno: 0, + }; + + let mut dftgc = Aligned::new([Complex::default(); COMPLEX_COUNT]); + denoiser.real_to_complex_3d(&dftgr, &mut dftgc); + denoiser.dftgc = dftgc; + + denoiser + } + + pub fn filter_frame( + &mut self, frame_q: &FrameQueue, + ) -> Result, EncoderStatus> { + if self.frame_buffer.len() < TB_MIDPOINT.min(self.cur_frameno as usize) { + // We need to have the previous unfiltered frame + // in the buffer for temporal filtering. + return Err(EncoderStatus::NeedMoreData); + } + let future_frames = frame_q + .range((self.cur_frameno + 1)..) + .take(TB_MIDPOINT) + .map(|(_, f)| f) + .collect::>(); + if future_frames.len() != TB_MIDPOINT + && !future_frames.iter().any(|f| f.is_none()) + { + // We also need to have the next unfiltered frame, + // unless we are at the end of the video. + return Err(EncoderStatus::NeedMoreData); + } + + let orig_frame = frame_q.get(&self.cur_frameno).unwrap().as_ref().unwrap(); + let frames = self + .frame_buffer + .iter() + .cloned() + .enumerate() + .chain(once(((TB_MIDPOINT), Arc::clone(orig_frame)))) + .chain( + future_frames + .into_iter() + .flatten() + .cloned() + .enumerate() + .map(|(i, f)| (i + 1 + TB_MIDPOINT, f)), + ) + .collect::>(); + + let mut dest = (**orig_frame).clone(); + let mut pad = ArrayVec::<_, TB_SIZE>::new(); + for i in 0..TB_SIZE { + let dec = self.chroma_sampling.get_decimation().unwrap_or((0, 0)); + let mut pad_frame = [ + Plane::new( + self.pad_dimensions[0].0, + self.pad_dimensions[0].1, + 0, + 0, + 0, + 0, + ), + Plane::new( + self.pad_dimensions[1].0, + self.pad_dimensions[1].1, + dec.0, + dec.1, + 0, + 0, + ), + Plane::new( + self.pad_dimensions[2].0, + self.pad_dimensions[2].1, + dec.0, + dec.1, + 0, + 0, + ), + ]; + + let frame = frames.get(&i).unwrap_or(&frames[&TB_MIDPOINT]); + self.copy_pad(frame, &mut pad_frame); + pad.push(pad_frame); + } + self.do_filtering(&pad, &mut dest); + + if self.frame_buffer.len() == TB_MIDPOINT { + self.frame_buffer.pop_front(); + } + self.frame_buffer.push_back(Arc::clone(orig_frame)); + self.cur_frameno += 1; + + Ok(dest) + } + + fn do_filtering(&mut self, src: &[[Plane; 3]], dest: &mut Frame) { + let mut dftr = [0f32; BLOCK_VOLUME]; + let mut dftc = [Complex::::default(); COMPLEX_COUNT]; + let mut means = [Complex::::default(); COMPLEX_COUNT]; + + for p in 0..3 { + let (pad_width, pad_height) = self.pad_dimensions[p]; + let mut ebuff = vec![0f32; pad_width * pad_height]; + let effective_height = self.effective_heights[p]; + let src_stride = src[0][p].cfg.stride; + let ebuff_stride = pad_width; + + let mut src_planes = src + .iter() + .map(|f| f[p].data_origin()) + .collect::>(); + + // SAFETY: We know the size of the planes we're working on, + // so we can safely ensure we are not out of bounds. + // There are a fair number of unsafe function calls here + // which are unsafe for optimization purposes. + // All are safe as long as we do not pass out-of-bounds parameters. + unsafe { + for y in (0..effective_height).step_by(INC) { + for x in (0..=(pad_width - SB_SIZE)).step_by(INC) { + for z in 0..TB_SIZE { + self.proc0( + &src_planes[z][x..], + &self.hw[(BLOCK_AREA * z)..], + &mut dftr[(BLOCK_AREA * z)..], + src_stride, + SB_SIZE, + self.src_scale, + ); + } + + self.real_to_complex_3d(&dftr, &mut dftc); + self.remove_mean(&mut dftc, &self.dftgc, &mut means); + + self.filter_coeffs(&mut dftc); + + self.add_mean(&mut dftc, &means); + self.complex_to_real_3d(&dftc, &mut dftr); + + self.proc1( + &dftr[(TB_MIDPOINT * BLOCK_AREA)..], + &self.hw[(TB_MIDPOINT * BLOCK_AREA)..], + &mut ebuff[(y * ebuff_stride + x)..], + SB_SIZE, + ebuff_stride, + ); + } + + for q in 0..TB_SIZE { + src_planes[q] = &src_planes[q][(INC * src_stride)..]; + } + } + } + + let dest_width = dest.planes[p].cfg.width; + let dest_height = dest.planes[p].cfg.height; + let dest_stride = dest.planes[p].cfg.stride; + let dest_plane = dest.planes[p].data_origin_mut(); + let ebp_offset = ebuff_stride * ((pad_height - dest_height) / 2) + + (pad_width - dest_width) / 2; + let ebp = &ebuff[ebp_offset..]; + + self.cast( + ebp, + dest_plane, + dest_width, + dest_height, + dest_stride, + ebuff_stride, + ); + } + } + + fn create_window() -> [f32; BLOCK_VOLUME] { + let mut hw = [0f32; BLOCK_VOLUME]; + let mut tw = [0f64; TB_SIZE]; + let mut sw = [0f64; SB_SIZE]; + + tw.fill_with(Self::temporal_window); + sw.iter_mut().enumerate().for_each(|(j, sw)| { + *sw = Self::spatial_window(j as f64 + 0.5); + }); + Self::normalize_for_overlap_add(&mut sw); + + let nscale = 1.0 / (BLOCK_VOLUME as f64).sqrt(); + for j in 0..TB_SIZE { + for k in 0..SB_SIZE { + for q in 0..SB_SIZE { + hw[(j * SB_SIZE + k) * SB_SIZE + q] = + (tw[j] * sw[k] * sw[q] * nscale) as f32; + } + } + } + + hw + } + + #[inline(always)] + // Hanning windowing + fn spatial_window(n: f64) -> f64 { + 0.5 - 0.5 * (2.0 * PI * n / SB_SIZE as f64).cos() + } + + #[inline(always)] + // Simple rectangular windowing + fn temporal_window() -> f64 { + 1.0 + } + + // Accounts for spatial block overlap + fn normalize_for_overlap_add(hw: &mut [f64]) { + let inc = SB_SIZE - SO_SIZE; + + let mut nw = [0f64; SB_SIZE]; + let hw = &mut hw[..SB_SIZE]; + + for q in 0..SB_SIZE { + for h in (0..=q).rev().step_by(inc) { + nw[q] += hw[h].powi(2); + } + for h in ((q + inc)..SB_SIZE).step_by(inc) { + nw[q] += hw[h].powi(2); + } + } + + for q in 0..SB_SIZE { + hw[q] /= nw[q].sqrt(); + } + } + + #[inline] + unsafe fn proc0( + &self, s0: &[T], s1: &[f32], dest: &mut [f32], p0: usize, p1: usize, + src_scale: f32, + ) { + let s0 = s0.as_ptr(); + let s1 = s1.as_ptr(); + let dest = dest.as_mut_ptr(); + + for u in 0..p1 { + for v in 0..p1 { + let s0 = s0.add(u * p0 + v); + let s1 = s1.add(u * p1 + v); + let dest = dest.add(u * p1 + v); + dest.write(u16::cast_from(s0.read()) as f32 * src_scale * s1.read()) + } + } + } + + #[inline] + unsafe fn proc1( + &self, s0: &[f32], s1: &[f32], dest: &mut [f32], p0: usize, p1: usize, + ) { + let s0 = s0.as_ptr(); + let s1 = s1.as_ptr(); + let dest = dest.as_mut_ptr(); + + for u in 0..p0 { + for v in 0..p0 { + let s0 = s0.add(u * p0 + v); + let s1 = s1.add(u * p0 + v); + let dest = dest.add(u * p1 + v); + dest.write(s0.read().mul_add(s1.read(), dest.read())); + } + } + } + + #[inline] + fn remove_mean( + &self, dftc: &mut [Complex; COMPLEX_COUNT], + dftgc: &[Complex; COMPLEX_COUNT], + means: &mut [Complex; COMPLEX_COUNT], + ) { + let gf = dftc[0].re / dftgc[0].re; + + for h in 0..COMPLEX_COUNT { + means[h].re = gf * dftgc[h].re; + means[h].im = gf * dftgc[h].im; + dftc[h].re -= means[h].re; + dftc[h].im -= means[h].im; + } + } + + #[inline] + fn add_mean( + &self, dftc: &mut [Complex; COMPLEX_COUNT], + means: &[Complex; COMPLEX_COUNT], + ) { + for h in 0..COMPLEX_COUNT { + dftc[h].re += means[h].re; + dftc[h].im += means[h].im; + } + } + + #[inline] + // Applies a generalized wiener filter + fn filter_coeffs(&self, dftc: &mut [Complex; COMPLEX_COUNT]) { + for h in 0..COMPLEX_COUNT { + let psd = dftc[h].re.mul_add(dftc[h].re, dftc[h].im.powi(2)); + let mult = ((psd - self.sigmas[h]) / (psd + 1e-15)).max(0.0); + dftc[h].re *= mult; + dftc[h].im *= mult; + } + } + + fn copy_pad(&self, src: &Frame, dest: &mut [Plane; 3]) { + for p in 0..src.planes.len() { + let src_width = src.planes[p].cfg.width; + let dest_width = dest[p].cfg.width; + let src_height = src.planes[p].cfg.height; + let dest_height = dest[p].cfg.height; + let src_stride = src.planes[p].cfg.stride; + let dest_stride = dest[p].cfg.stride; + + let offy = (dest_height - src_height) / 2; + let offx = (dest_width - src_width) / 2; + + bitblt( + &mut dest[p].data_origin_mut()[(dest_stride * offy + offx)..], + dest_stride, + src.planes[p].data_origin(), + src_stride, + src_width, + src_height, + ); + + let mut dest_ptr = + &mut dest[p].data_origin_mut()[(dest_stride * offy)..]; + for _ in offy..(src_height + offy) { + let dest_slice = &mut dest_ptr[..dest_width]; + + let mut w = offx * 2; + for x in 0..offx { + dest_slice[x] = dest_slice[w]; + w -= 1; + } + + w = offx + src_width - 2; + for x in (offx + src_width)..dest_width { + dest_slice[x] = dest_slice[w]; + w -= 1; + } + + dest_ptr = &mut dest_ptr[dest_stride..]; + } + + let dest_origin = dest[p].data_origin_mut(); + let mut w = offy * 2; + for y in 0..offy { + // SAFETY: `copy_from_slice` has borrow checker issues here + // because we are copying from `dest` to `dest`, but we manually + // know that the two slices will not overlap. We still slice + // the start and end as a safety check. + unsafe { + copy_nonoverlapping( + dest_origin[(dest_stride * w)..][..dest_width].as_ptr(), + dest_origin[(dest_stride * y)..][..dest_width].as_mut_ptr(), + dest_width, + ); + } + w -= 1; + } + + w = offy + src_height - 2; + for y in (offy + src_height)..dest_height { + // SAFETY: `copy_from_slice` has borrow checker issues here + // because we are copying from `dest` to `dest`, but we manually + // know that the two slices will not overlap. We still slice + // the start and end as a safety check. + unsafe { + copy_nonoverlapping( + dest_origin[(dest_stride * w)..][..dest_width].as_ptr(), + dest_origin[(dest_stride * y)..][..dest_width].as_mut_ptr(), + dest_width, + ); + } + w -= 1; + } + } + } + + fn cast( + &self, ebuff: &[f32], dest: &mut [T], dest_width: usize, + dest_height: usize, dest_stride: usize, ebp_stride: usize, + ) { + let ebuff = ebuff.chunks(ebp_stride); + let dest = dest.chunks_mut(dest_stride); + + for (ebuff, dest) in ebuff.zip(dest).take(dest_height) { + for x in 0..dest_width { + let fval = ebuff[x].mul_add(self.dest_scale, 0.5); + dest[x] = + clamp(T::cast_from(fval as u16), T::cast_from(0u16), self.peak); + } + } + } + + // Applies a real-to-complex 3-dimensional FFT to `real` + fn real_to_complex_3d( + &mut self, real: &[f32; BLOCK_VOLUME], + output: &mut [Complex; COMPLEX_COUNT], + ) { + let input = + ArrayView3::from_shape((TB_SIZE, SB_SIZE, SB_SIZE), real).unwrap(); + let mut temp1 = Array3::zeros((TB_SIZE, SB_SIZE, SB_SIZE / 2 + 1)); + let mut temp2 = Array3::zeros((TB_SIZE, SB_SIZE, SB_SIZE / 2 + 1)); + let mut output = + ArrayViewMut3::from_shape((TB_SIZE, SB_SIZE, SB_SIZE / 2 + 1), output) + .unwrap(); + + ndfft_r2c(&input, &mut temp1, &mut self.fft.0, 2); + ndfft(&temp1, &mut temp2, &mut self.fft.1, 1); + ndfft(&temp2, &mut output, &mut self.fft.2, 0); + } + + // Applies a complex-to-real 3-dimensional FFT to `complex` + fn complex_to_real_3d( + &mut self, complex: &[Complex; COMPLEX_COUNT], + output: &mut [f32; BLOCK_VOLUME], + ) { + let input = + ArrayView3::from_shape((TB_SIZE, SB_SIZE, SB_SIZE / 2 + 1), complex) + .unwrap(); + let mut temp0 = Array3::zeros((TB_SIZE, SB_SIZE, SB_SIZE / 2 + 1)); + let mut temp1 = Array3::zeros((TB_SIZE, SB_SIZE, SB_SIZE / 2 + 1)); + let mut output = + ArrayViewMut3::from_shape((TB_SIZE, SB_SIZE, SB_SIZE), output).unwrap(); + + ndifft(&input, &mut temp0, &mut self.fft.2, 0); + ndifft(&temp0, &mut temp1, &mut self.fft.1, 1); + ndifft_r2c(&temp1, &mut output, &mut self.fft.0, 2); + output.iter_mut().for_each(|d| { + *d *= BLOCK_VOLUME as f32; + }); + } +} + +#[inline(always)] +fn extra(a: usize, b: usize) -> usize { + if a % b > 0 { + b - (a % b) + } else { + 0 + } +} + +// Identical to Vapoursynth's implementation `vs_bitblt` +// which basically copies the pixels in a plane. +fn bitblt( + mut dest: &mut [T], dest_stride: usize, mut src: &[T], src_stride: usize, + width: usize, height: usize, +) { + if src_stride == dest_stride && src_stride == width { + dest[..(width * height)].copy_from_slice(&src[..(width * height)]); + } else { + for _ in 0..height { + dest[..width].copy_from_slice(&src[..width]); + src = &src[src_stride..]; + dest = &mut dest[dest_stride..]; + } + } +} diff --git a/src/fuzzing.rs b/src/fuzzing.rs index aab9abe059..d440f3f88f 100644 --- a/src/fuzzing.rs +++ b/src/fuzzing.rs @@ -257,6 +257,7 @@ impl Arbitrary for ArbitraryEncoder { switch_frame_interval: u.int_in_range(0..=3)?, tune: *u.choose(&[Tune::Psnr, Tune::Psychovisual])?, film_grain_params: None, + denoise_strength: u.int_in_range(0..=50)?, }; let frame_count = diff --git a/src/lib.rs b/src/lib.rs index 3425588db4..05e2270115 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -257,6 +257,7 @@ mod cdef; #[doc(hidden)] pub mod context; mod deblock; +mod denoise; mod encoder; mod entropymode; mod lrf; diff --git a/src/test_encode_decode/mod.rs b/src/test_encode_decode/mod.rs index 9e6082ed55..8bdecca838 100644 --- a/src/test_encode_decode/mod.rs +++ b/src/test_encode_decode/mod.rs @@ -10,9 +10,8 @@ // Fuzzing only uses a subset of these. #![cfg_attr(fuzzing, allow(unused))] -use crate::color::ChromaSampling; - use crate::api::config::GrainTableSegment; +use crate::color::ChromaSampling; use crate::util::Pixel; use crate::*; diff --git a/src/util/align.rs b/src/util/align.rs index c86424e8b2..02928698cd 100644 --- a/src/util/align.rs +++ b/src/util/align.rs @@ -42,6 +42,20 @@ impl Aligned { } } +impl std::ops::Deref for Aligned { + type Target = T; + + fn deref(&self) -> &T { + &self.data + } +} + +impl std::ops::DerefMut for Aligned { + fn deref_mut(&mut self) -> &mut T { + &mut self.data + } +} + /// An analog to a Box<[T]> where the underlying slice is aligned. /// Alignment is according to the architecture-specific SIMD constraints. pub struct AlignedBoxedSlice {