diff --git a/Cargo.toml b/Cargo.toml
index 7de4ccce74..cddc892753 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,7 +36,6 @@ binaries = [
     "fern",
     "console",
     "av-metrics",
-    "nom",
 ]
 default = ["binaries", "asm", "threading", "signal_support"]
 asm = ["nasm-rs", "cc", "regex"]
@@ -100,11 +99,18 @@ wasm-bindgen = { version = "0.2.63", optional = true }
 rust_hawktracer = "0.7.0"
 arrayref = "0.3.6"
 const_fn_assert = "0.1.2"
-nom = { version = "7.0.0", optional = true }
+# `unreachable!` macro which panics in debug mode
+# and optimizes away in release mode
 new_debug_unreachable = "1.0.4"
 once_cell = "1.13.0"
 av1-grain = { version = "0.2.0", features = ["serialize"] }
 serde-big-array = { version = "0.4.1", optional = true }
+# Used for parsing film grain table files
+nom = "7.0.0"
+# Used as a data holder during denoising
+ndarray = "0.15.4"
+# Used for running FFTs during denoising
+ndrustfft = "0.3.0"
 
 [dependencies.image]
 version = "0.24.3"
diff --git a/clippy.toml b/clippy.toml
index f26dd286a7..2b357613e4 100644
--- a/clippy.toml
+++ b/clippy.toml
@@ -1,4 +1,5 @@
 too-many-arguments-threshold = 16
 cognitive-complexity-threshold = 40
 trivial-copy-size-limit = 16 # 128-bits = 2 64-bit registers
+doc-valid-idents = ["DFTTest"]
 msrv = "1.59"
diff --git a/src/api/config/encoder.rs b/src/api/config/encoder.rs
index 7f84d5a081..c36bbda0f7 100644
--- a/src/api/config/encoder.rs
+++ b/src/api/config/encoder.rs
@@ -85,6 +85,8 @@ pub struct EncoderConfig {
   pub tune: Tune,
   /// Parameters for grain synthesis.
   pub film_grain_params: Option<Vec<GrainTableSegment>>,
+  /// Strength of denoising, 0 = disabled
+  pub denoise_strength: u8,
   /// Number of tiles horizontally. Must be a power of two.
   ///
   /// Overridden by [`tiles`], if present.
@@ -159,6 +161,7 @@ impl EncoderConfig {
       bitrate: 0,
       tune: Tune::default(),
       film_grain_params: None,
+      denoise_strength: 0,
       tile_cols: 0,
       tile_rows: 0,
       tiles: 0,
diff --git a/src/api/internal.rs b/src/api/internal.rs
index 169a9c96ed..e9d0743c5b 100644
--- a/src/api/internal.rs
+++ b/src/api/internal.rs
@@ -15,6 +15,7 @@ use crate::api::{
 };
 use crate::color::ChromaSampling::Cs400;
 use crate::cpu_features::CpuFeatureLevel;
+use crate::denoise::{DftDenoiser, TB_MIDPOINT};
 use crate::dist::get_satd;
 use crate::encoder::*;
 use crate::frame::*;
@@ -220,7 +221,7 @@ impl<T: Pixel> FrameData<T> {
   }
 }
 
-type FrameQueue<T> = BTreeMap<u64, Option<Arc<Frame<T>>>>;
+pub(crate) type FrameQueue<T> = BTreeMap<u64, Option<Arc<Frame<T>>>>;
 type FrameDataQueue<T> = BTreeMap<u64, Option<FrameData<T>>>;
 
 // the fields pub(super) are accessed only by the tests
@@ -248,6 +249,7 @@ pub(crate) struct ContextInner<T: Pixel> {
   /// Maps `output_frameno` to `gop_input_frameno_start`.
   pub(crate) gop_input_frameno_start: BTreeMap<u64, u64>,
   keyframe_detector: SceneChangeDetector<T>,
+  denoiser: Option<DftDenoiser<T>>,
   pub(crate) config: Arc<EncoderConfig>,
   seq: Arc<Sequence>,
   pub(crate) rc_state: RCState,
@@ -295,6 +297,17 @@ impl<T: Pixel> ContextInner<T> {
         lookahead_distance,
         seq.clone(),
       ),
+      denoiser: if enc.denoise_strength > 0 {
+        Some(DftDenoiser::new(
+          enc.denoise_strength as f32 / 10.0,
+          enc.width,
+          enc.height,
+          enc.bit_depth as u8,
+          enc.chroma_sampling,
+        ))
+      } else {
+        None
+      },
       config: Arc::new(enc.clone()),
       seq,
       rc_state: RCState::new(
@@ -359,6 +372,25 @@ impl<T: Pixel> ContextInner<T> {
       self.t35_q.insert(input_frameno, params.t35_metadata);
     }
 
+    // If denoising is enabled, run it now because we want the entire
+    // encoding process, including lookahead, to see the denoised frame.
+    if let Some(ref mut denoiser) = self.denoiser {
+      loop {
+        let denoiser_frame = denoiser.cur_frameno;
+        if (!is_flushing
+          && input_frameno >= denoiser_frame + TB_MIDPOINT as u64)
+          || (is_flushing && Some(denoiser_frame) < self.limit)
+        {
+          self.frame_q.insert(
+            denoiser_frame,
+            Some(Arc::new(denoiser.filter_frame(&self.frame_q).unwrap())),
+          );
+        } else {
+          break;
+        }
+      }
+    }
+
     if !self.needs_more_frame_q_lookahead(self.next_lookahead_frame) {
       let lookahead_frames = self
         .frame_q
diff --git a/src/api/test.rs b/src/api/test.rs
index 072562631f..3d167fdfc5 100644
--- a/src/api/test.rs
+++ b/src/api/test.rs
@@ -2131,6 +2131,7 @@ fn log_q_exp_overflow() {
     tile_cols: 0,
     tile_rows: 0,
     tiles: 0,
+    denoise_strength: 0,
     speed_settings: SpeedSettings {
       multiref: false,
       fast_deblock: true,
@@ -2207,6 +2208,7 @@ fn guess_frame_subtypes_assert() {
     tile_cols: 0,
     tile_rows: 0,
     tiles: 0,
+    denoise_strength: 0,
     speed_settings: SpeedSettings {
       multiref: false,
       fast_deblock: true,
diff --git a/src/bin/common.rs b/src/bin/common.rs
index 18cb05d388..7b21a2ef6b 100644
--- a/src/bin/common.rs
+++ b/src/bin/common.rs
@@ -176,7 +176,6 @@ pub struct CliOptions {
   pub still_picture: bool,
   /// Uses grain synthesis to add photon noise to the resulting encode.
   /// Takes a strength value 0-64.
-  #[cfg(feature = "unstable")]
   #[clap(
     long,
     conflicts_with = "film-grain-table",
@@ -185,6 +184,17 @@ pub struct CliOptions {
     help_heading = "ENCODE SETTINGS"
   )]
   pub photon_noise: u8,
+  /// Enable spatio-temporal denoising, intended to be used with grain synthesis.
+  /// Takes a strength value 0-50.
+  ///
+  /// Default strength is 1/2 of photon noise strength,
+  /// or 4 if a film grain table is specified.
+  #[clap(
+    long,
+    value_parser = clap::value_parser!(u8).range(0..=50),
+    help_heading = "ENCODE SETTINGS"
+  )]
+  pub denoise: Option<u8>,
   /// Uses a film grain table file to apply grain synthesis to the encode.
   /// Uses the same table file format as aomenc and svt-av1.
   #[clap(
@@ -334,7 +344,6 @@ pub struct ParsedCliOptions {
   pub save_config: Option<PathBuf>,
   #[cfg(feature = "unstable")]
   pub slots: usize,
-  #[cfg(feature = "unstable")]
   pub generate_grain_strength: u8,
 }
 
@@ -482,7 +491,6 @@ pub fn parse_cli() -> Result<ParsedCliOptions, CliError> {
     save_config: save_config_path,
     #[cfg(feature = "unstable")]
     slots,
-    #[cfg(feature = "unstable")]
     generate_grain_strength: matches.photon_noise,
   })
 }
@@ -676,7 +684,19 @@ fn parse_config(matches: &CliOptions) -> Result<EncoderConfig, CliError> {
       .expect("Failed to parse film grain table");
     if !table.is_empty() {
       cfg.film_grain_params = Some(table);
+      cfg.denoise_strength = 4;
+    }
+  } else if matches.photon_noise > 0 {
+    cfg.denoise_strength = matches.photon_noise / 2;
+    // We have to know the video resolution before we can generate a table,
+    // so we must handle that elsewhere.
+  }
+  // A user set denoise strength overrides the defaults above
+  if let Some(denoise_str) = matches.denoise {
+    if denoise_str > 50 {
+      panic!("Denoising strength must be between 0-50");
     }
+    cfg.denoise_strength = denoise_str;
   }
 
   if let Some(frame_rate) = matches.frame_rate {
diff --git a/src/bin/rav1e-ch.rs b/src/bin/rav1e-ch.rs
index e0221fbdd9..71d559995b 100644
--- a/src/bin/rav1e-ch.rs
+++ b/src/bin/rav1e-ch.rs
@@ -473,7 +473,6 @@ fn run() -> Result<(), error::CliError> {
     cli.enc.time_base = video_info.time_base;
   }
 
-  #[cfg(feature = "unstable")]
   if cli.generate_grain_strength > 0 && cli.enc.film_grain_params.is_none() {
     cli.enc.film_grain_params = Some(vec![generate_photon_noise_params(
       0,
diff --git a/src/bin/rav1e.rs b/src/bin/rav1e.rs
index 8d672736d8..2390ffb0bb 100644
--- a/src/bin/rav1e.rs
+++ b/src/bin/rav1e.rs
@@ -456,7 +456,6 @@ fn run() -> Result<(), error::CliError> {
     cli.enc.time_base = video_info.time_base;
   }
 
-  #[cfg(feature = "unstable")]
   if cli.generate_grain_strength > 0 && cli.enc.film_grain_params.is_none() {
     cli.enc.film_grain_params = Some(vec![generate_photon_noise_params(
       0,
diff --git a/src/denoise.rs b/src/denoise.rs
new file mode 100644
index 0000000000..93cfb499ae
--- /dev/null
+++ b/src/denoise.rs
@@ -0,0 +1,587 @@
+use crate::api::FrameQueue;
+use crate::util::Aligned;
+use crate::EncoderStatus;
+use arrayvec::ArrayVec;
+use ndarray::{Array3, ArrayView3, ArrayViewMut3};
+use ndrustfft::{
+  ndfft, ndfft_r2c, ndifft, ndifft_r2c, Complex, FftHandler, R2cFftHandler,
+};
+use std::collections::{BTreeMap, VecDeque};
+use std::f64::consts::PI;
+use std::iter::once;
+use std::mem::size_of;
+use std::ptr::copy_nonoverlapping;
+use std::sync::Arc;
+use v_frame::frame::Frame;
+use v_frame::math::clamp;
+use v_frame::pixel::{CastFromPrimitive, ChromaSampling, Pixel};
+use v_frame::plane::Plane;
+
+const SB_SIZE: usize = 16; // Edge length of a spatial block, in pixels
+const SO_SIZE: usize = 12; // Overlap between neighboring spatial blocks
+const TB_SIZE: usize = 3; // Number of frames in a temporal block
+pub(crate) const TB_MIDPOINT: usize = TB_SIZE / 2; // Slot of the current frame
+const BLOCK_AREA: usize = SB_SIZE * SB_SIZE; // Samples in one spatial block
+const BLOCK_VOLUME: usize = BLOCK_AREA * TB_SIZE; // Samples in one 3D block
+const COMPLEX_COUNT: usize = (SB_SIZE / 2 + 1) * SB_SIZE * TB_SIZE;
+const CCNT2: usize = COMPLEX_COUNT * 2; // Length of the `sigmas` table
+const INC: usize = SB_SIZE - SO_SIZE; // Step between spatial block origins
+
+/// This denoiser is based on the DFTTest plugin from Vapoursynth.
+/// This type of denoising was chosen because it provides
+/// high quality while not being too slow.
+pub(crate) struct DftDenoiser<T>
+where
+  T: Pixel,
+{
+  chroma_sampling: ChromaSampling,
+  dest_scale: f32,
+  src_scale: f32,
+  peak: T,
+
+  // Both are indexed by plane (0 = luma, 1 and 2 = chroma)
+  pad_dimensions: ArrayVec<(usize, usize), 3>,
+  effective_heights: ArrayVec<usize, 3>,
+  // Window, FFT of the window at level 255, per-axis FFTs, filter strengths
+  hw: Aligned<[f32; BLOCK_VOLUME]>,
+  dftgc: Aligned<[Complex<f32>; COMPLEX_COUNT]>,
+  fft: (R2cFftHandler<f32>, FftHandler<f32>, FftHandler<f32>),
+  sigmas: Aligned<[f32; CCNT2]>,
+
+  // This stores copies of the unfiltered previous frames,
+  // since in `frame_q` they will be filtered already.
+  // With the current TB_SIZE this holds at most one frame, but
+  // it's kept in a `VecDeque` so that TB_SIZE could potentially
+  // be tweaked without any code changes.
+  frame_buffer: VecDeque<Arc<Frame<T>>>,
+  pub(crate) cur_frameno: u64,
+}
+
+impl<T> DftDenoiser<T>
+where
+  T: Pixel,
+{
+  // Precomputes per-video state. Panics if `T` does not fit `bit_depth`.
+  pub fn new(
+    sigma: f32, width: usize, height: usize, bit_depth: u8,
+    chroma_sampling: ChromaSampling,
+  ) -> Self {
+    if size_of::<T>() == 1 {
+      assert!(bit_depth <= 8);
+    } else {
+      assert!(bit_depth > 8);
+    }
+    // Samples are scaled to 8-bit range for filtering and back afterwards.
+    let dest_scale = (1 << (bit_depth - 8)) as f32;
+    let src_scale = 1.0 / dest_scale;
+    let peak = T::cast_from((1u16 << bit_depth) - 1);
+    // Per-plane padded size and the height that the rows of blocks cover.
+    let mut pad_dimensions = ArrayVec::<_, 3>::new();
+    let mut effective_heights = ArrayVec::<_, 3>::new();
+    for plane in 0..3 {
+      let ae = (SB_SIZE - SO_SIZE).max(SO_SIZE) * 2;
+      let (width, height) = if plane == 0 {
+        (width, height)
+      } else {
+        chroma_sampling.get_chroma_dimensions(width, height)
+      };
+      let pad_w = width + extra(width, SB_SIZE) + ae;
+      let pad_h = height + extra(height, SB_SIZE) + ae;
+      let e_h =
+        ((pad_h - SO_SIZE) / (SB_SIZE - SO_SIZE)) * (SB_SIZE - SO_SIZE);
+      pad_dimensions.push((pad_w, pad_h));
+      effective_heights.push(e_h);
+    }
+
+    let hw = Aligned::new(Self::create_window());
+    let mut dftgr = Aligned::new([0f32; BLOCK_VOLUME]);
+    // One handler per axis: horizontal (real input), vertical and temporal.
+    let fft = (
+      R2cFftHandler::new(SB_SIZE),
+      FftHandler::new(SB_SIZE),
+      FftHandler::new(TB_SIZE),
+    );
+    // `dftgr` (window * 255) becomes `dftgc`, the reference for `remove_mean`.
+    let mut wscale = 0.0f32;
+    for k in 0..BLOCK_VOLUME {
+      dftgr[k] = 255.0 * hw[k];
+      wscale += hw[k].powi(2);
+    }
+    let wscale = 1.0 / wscale;
+
+    let mut sigmas = Aligned::new([0f32; CCNT2]);
+    sigmas.fill(sigma / wscale);
+
+    let mut denoiser = DftDenoiser {
+      chroma_sampling,
+      dest_scale,
+      src_scale,
+      peak,
+      pad_dimensions,
+      effective_heights,
+      hw,
+      fft,
+      sigmas,
+      dftgc: Aligned::new([Complex::default(); COMPLEX_COUNT]),
+      frame_buffer: VecDeque::with_capacity(TB_MIDPOINT),
+      cur_frameno: 0,
+    };
+    // The FFT needs `&mut self`, so `dftgc` can only be filled in afterwards.
+    let mut dftgc = Aligned::new([Complex::default(); COMPLEX_COUNT]);
+    denoiser.real_to_complex_3d(&dftgr, &mut dftgc);
+    denoiser.dftgc = dftgc;
+
+    denoiser
+  }
+
+  pub fn filter_frame(
+    &mut self, frame_q: &FrameQueue<T>,
+  ) -> Result<Frame<T>, EncoderStatus> {
+    if self.frame_buffer.len() < TB_MIDPOINT.min(self.cur_frameno as usize) {
+      // We need to have the previous unfiltered frame
+      // in the buffer for temporal filtering.
+      return Err(EncoderStatus::NeedMoreData);
+    }
+    let future_frames = frame_q
+      .range((self.cur_frameno + 1)..)
+      .take(TB_MIDPOINT)
+      .map(|(_, f)| f)
+      .collect::<ArrayVec<_, TB_MIDPOINT>>();
+    if future_frames.len() != TB_MIDPOINT
+      && !future_frames.iter().any(|f| f.is_none())
+    {
+      // We also need to have the next unfiltered frame, unless
+      // a `None` entry in the queue marks the end of the video.
+      return Err(EncoderStatus::NeedMoreData);
+    }
+    // Key the frames by slot: previous frames, current frame, future frames.
+    let orig_frame = frame_q.get(&self.cur_frameno).unwrap().as_ref().unwrap();
+    let frames = self
+      .frame_buffer
+      .iter()
+      .cloned()
+      .enumerate()
+      .chain(once(((TB_MIDPOINT), Arc::clone(orig_frame))))
+      .chain(
+        future_frames
+          .into_iter()
+          .flatten()
+          .cloned()
+          .enumerate()
+          .map(|(i, f)| (i + 1 + TB_MIDPOINT, f)),
+      )
+      .collect::<BTreeMap<_, _>>();
+    // Pad one frame per slot; a slot without a frame reuses the current one.
+    let mut dest = (**orig_frame).clone();
+    let mut pad = ArrayVec::<_, TB_SIZE>::new();
+    for i in 0..TB_SIZE {
+      let dec = self.chroma_sampling.get_decimation().unwrap_or((0, 0));
+      let mut pad_frame = [
+        Plane::new(
+          self.pad_dimensions[0].0,
+          self.pad_dimensions[0].1,
+          0,
+          0,
+          0,
+          0,
+        ),
+        Plane::new(
+          self.pad_dimensions[1].0,
+          self.pad_dimensions[1].1,
+          dec.0,
+          dec.1,
+          0,
+          0,
+        ),
+        Plane::new(
+          self.pad_dimensions[2].0,
+          self.pad_dimensions[2].1,
+          dec.0,
+          dec.1,
+          0,
+          0,
+        ),
+      ];
+
+      let frame = frames.get(&i).unwrap_or(&frames[&TB_MIDPOINT]);
+      self.copy_pad(frame, &mut pad_frame);
+      pad.push(pad_frame);
+    }
+    self.do_filtering(&pad, &mut dest);
+    // Keep the unfiltered frame as the previous frame for the next call.
+    if self.frame_buffer.len() == TB_MIDPOINT {
+      self.frame_buffer.pop_front();
+    }
+    self.frame_buffer.push_back(Arc::clone(orig_frame));
+    self.cur_frameno += 1;
+
+    Ok(dest)
+  }
+
+  fn do_filtering(&mut self, src: &[[Plane<T>; 3]], dest: &mut Frame<T>) {
+    let mut dftr = [0f32; BLOCK_VOLUME];
+    let mut dftc = [Complex::<f32>::default(); COMPLEX_COUNT];
+    let mut means = [Complex::<f32>::default(); COMPLEX_COUNT];
+    // Each plane is filtered on its own, one overlapping block at a time.
+    for p in 0..3 {
+      let (pad_width, pad_height) = self.pad_dimensions[p];
+      let mut ebuff = vec![0f32; pad_width * pad_height];
+      let effective_height = self.effective_heights[p];
+      let src_stride = src[0][p].cfg.stride;
+      let ebuff_stride = pad_width;
+      // One slice per frame of the temporal block, advanced row by row below.
+      let mut src_planes = src
+        .iter()
+        .map(|f| f[p].data_origin())
+        .collect::<ArrayVec<_, TB_SIZE>>();
+
+      // SAFETY: We know the size of the planes we're working on,
+      // so we can safely ensure we are not out of bounds.
+      // There are a fair number of unsafe function calls here
+      // which are unsafe for optimization purposes.
+      // All are safe as long as we do not pass out-of-bounds parameters.
+      unsafe {
+        for y in (0..effective_height).step_by(INC) {
+          for x in (0..=(pad_width - SB_SIZE)).step_by(INC) {
+            for z in 0..TB_SIZE {
+              self.proc0(
+                &src_planes[z][x..],
+                &self.hw[(BLOCK_AREA * z)..],
+                &mut dftr[(BLOCK_AREA * z)..],
+                src_stride,
+                SB_SIZE,
+                self.src_scale,
+              );
+            }
+            // Forward FFT, shrink the mean-removed coefficients, then invert.
+            self.real_to_complex_3d(&dftr, &mut dftc);
+            self.remove_mean(&mut dftc, &self.dftgc, &mut means);
+
+            self.filter_coeffs(&mut dftc);
+
+            self.add_mean(&mut dftc, &means);
+            self.complex_to_real_3d(&dftc, &mut dftr);
+            // Overlap-add the windowed center frame of the block into `ebuff`.
+            self.proc1(
+              &dftr[(TB_MIDPOINT * BLOCK_AREA)..],
+              &self.hw[(TB_MIDPOINT * BLOCK_AREA)..],
+              &mut ebuff[(y * ebuff_stride + x)..],
+              SB_SIZE,
+              ebuff_stride,
+            );
+          }
+          // Advance every frame's slice to the next row of blocks.
+          for q in 0..TB_SIZE {
+            src_planes[q] = &src_planes[q][(INC * src_stride)..];
+          }
+        }
+      }
+      // Store the part of `ebuff` that lies over the unpadded plane in `dest`.
+      let dest_width = dest.planes[p].cfg.width;
+      let dest_height = dest.planes[p].cfg.height;
+      let dest_stride = dest.planes[p].cfg.stride;
+      let dest_plane = dest.planes[p].data_origin_mut();
+      let ebp_offset = ebuff_stride * ((pad_height - dest_height) / 2)
+        + (pad_width - dest_width) / 2;
+      let ebp = &ebuff[ebp_offset..];
+
+      self.cast(
+        ebp,
+        dest_plane,
+        dest_width,
+        dest_height,
+        dest_stride,
+        ebuff_stride,
+      );
+    }
+  }
+
+  // Builds the 3D window applied to each block: the product of the temporal
+  // and the (overlap-normalized) spatial weights, scaled by `nscale`.
+  fn create_window() -> [f32; BLOCK_VOLUME] {
+    let mut temporal = [0f64; TB_SIZE];
+    temporal.fill_with(Self::temporal_window);
+    let mut spatial = [0f64; SB_SIZE];
+    for (idx, weight) in spatial.iter_mut().enumerate() {
+      *weight = Self::spatial_window(idx as f64 + 0.5);
+    }
+    Self::normalize_for_overlap_add(&mut spatial);
+
+    let nscale = 1.0 / (BLOCK_VOLUME as f64).sqrt();
+    let mut window = [0f32; BLOCK_VOLUME];
+    for (frame, &t) in window.chunks_exact_mut(BLOCK_AREA).zip(&temporal) {
+      for (row, &v) in frame.chunks_exact_mut(SB_SIZE).zip(&spatial) {
+        for (out, &h) in row.iter_mut().zip(&spatial) {
+          *out = (t * v * h * nscale) as f32;
+        }
+      }
+    }
+
+    window
+  }
+
+  #[inline(always)]
+  // Hann ("Hanning") window over SB_SIZE samples, evaluated at position `n`
+  fn spatial_window(n: f64) -> f64 {
+    0.5 - 0.5 * (2.0 * PI * n / SB_SIZE as f64).cos()
+  }
+
+  #[inline(always)]
+  // Simple rectangular windowing: every frame gets the same weight
+  fn temporal_window() -> f64 {
+    1.0
+  }
+
+  // Accounts for spatial block overlap: every weight is divided by the
+  // root of the summed squares of all weights that are a multiple of the
+  // block step away from it, i.e. those that land on the same sample
+  // when the overlapping blocks are added together.
+  fn normalize_for_overlap_add(hw: &mut [f64]) {
+    let hw = &mut hw[..SB_SIZE];
+    let mut nw = [0f64; SB_SIZE];
+    for (q, norm) in nw.iter_mut().enumerate() {
+      // Keep the summation order: downwards from `q`, then upwards.
+      let below = (0..=q).rev().step_by(INC);
+      let above = ((q + INC)..SB_SIZE).step_by(INC);
+      for h in below.chain(above) {
+        *norm += hw[h].powi(2);
+      }
+    }
+
+    for (weight, norm) in hw.iter_mut().zip(&nw) {
+      *weight /= norm.sqrt();
+    }
+  }
+
+  #[inline]
+  // Multiplies a `p1`x`p1` block of pixels (row stride `p0`), scaled by
+  // `src_scale`, with the weights in `s1` and writes the products to `dest`.
+  // SAFETY: the caller must make sure that every offset accessed here
+  // lies inside `s0`, `s1` and `dest`, since no bounds checks are done.
+  unsafe fn proc0(
+    &self, s0: &[T], s1: &[f32], dest: &mut [f32], p0: usize, p1: usize,
+    src_scale: f32,
+  ) {
+    let (pixels, window, out) = (s0.as_ptr(), s1.as_ptr(), dest.as_mut_ptr());
+    for row in 0..p1 {
+      for col in 0..p1 {
+        let sample = u16::cast_from(pixels.add(row * p0 + col).read()) as f32;
+        let weight = window.add(row * p1 + col).read();
+        out.add(row * p1 + col).write(sample * src_scale * weight);
+      }
+    }
+  }
+
+  #[inline]
+  // Accumulates the product of the `p0`x`p0` blocks `s0` and `s1` into `dest`
+  // (row stride `p1`). SAFETY: the caller must make sure that every offset
+  // accessed here lies inside `s0`, `s1` and `dest`; nothing is checked.
+  unsafe fn proc1(
+    &self, s0: &[f32], s1: &[f32], dest: &mut [f32], p0: usize, p1: usize,
+  ) {
+    let (values, window, out) = (s0.as_ptr(), s1.as_ptr(), dest.as_mut_ptr());
+    for row in 0..p0 {
+      for col in 0..p0 {
+        let block_off = row * p0 + col;
+        let acc = out.add(row * p1 + col);
+        let (value, weight) = (values.add(block_off), window.add(block_off));
+        acc.write(value.read().mul_add(weight.read(), acc.read()));
+      }
+    }
+  }
+
+  #[inline]
+  // Scales `dftgc` so that its first real part matches that of `dftc`,
+  // stores the result in `means` and subtracts it from `dftc`.
+  fn remove_mean(
+    &self, dftc: &mut [Complex<f32>; COMPLEX_COUNT],
+    dftgc: &[Complex<f32>; COMPLEX_COUNT],
+    means: &mut [Complex<f32>; COMPLEX_COUNT],
+  ) {
+    let gain = dftc[0].re / dftgc[0].re;
+    for ((c, gc), mean) in dftc.iter_mut().zip(dftgc).zip(means.iter_mut()) {
+      *mean = Complex::new(gain * gc.re, gain * gc.im);
+      c.re -= mean.re;
+      c.im -= mean.im;
+    }
+  }
+
+  #[inline]
+  // Adds the offsets stored by `remove_mean` back onto the coefficients
+  fn add_mean(
+    &self, dftc: &mut [Complex<f32>; COMPLEX_COUNT],
+    means: &[Complex<f32>; COMPLEX_COUNT],
+  ) {
+    for (coeff, mean) in dftc.iter_mut().zip(means) {
+      *coeff += *mean;
+    }
+  }
+
+  #[inline]
+  // Applies a generalized Wiener filter: every coefficient is scaled by
+  // `(psd - sigma) / psd`, floored at 0, where `psd` is its squared magnitude
+  fn filter_coeffs(&self, dftc: &mut [Complex<f32>; COMPLEX_COUNT]) {
+    for (coeff, &sigma) in dftc.iter_mut().zip(self.sigmas.iter()) {
+      let psd = coeff.re.mul_add(coeff.re, coeff.im.powi(2));
+      let gain = ((psd - sigma) / (psd + 1e-15)).max(0.0);
+      *coeff *= gain;
+    }
+  }
+
+  // Copies every plane of `src` into the center of the matching padded
+  // plane in `dest` and fills the padding by mirroring around the edges.
+  fn copy_pad(&self, src: &Frame<T>, dest: &mut [Plane<T>; 3]) {
+    for p in 0..src.planes.len() {
+      let src_width = src.planes[p].cfg.width;
+      let dest_width = dest[p].cfg.width;
+      let src_height = src.planes[p].cfg.height;
+      let dest_height = dest[p].cfg.height;
+      let src_stride = src.planes[p].cfg.stride;
+      let dest_stride = dest[p].cfg.stride;
+      // An empty plane (e.g. unused chroma) has no samples to mirror from.
+      if src_width == 0 || src_height == 0 {
+        continue;
+      }
+
+      let offy = (dest_height - src_height) / 2;
+      let offx = (dest_width - src_width) / 2;
+
+      bitblt(
+        &mut dest[p].data_origin_mut()[(dest_stride * offy + offx)..],
+        dest_stride,
+        src.planes[p].data_origin(),
+        src_stride,
+        src_width,
+        src_height,
+      );
+      let mut dest_ptr =
+        &mut dest[p].data_origin_mut()[(dest_stride * offy)..];
+      for _ in offy..(src_height + offy) {
+        let dest_slice = &mut dest_ptr[..dest_width];
+        let mut w = offx * 2;
+        for x in 0..offx {
+          dest_slice[x] = dest_slice[w];
+          w -= 1;
+        }
+
+        w = offx + src_width - 2;
+        for x in (offx + src_width)..dest_width {
+          dest_slice[x] = dest_slice[w];
+          w -= 1;
+        }
+
+        dest_ptr = &mut dest_ptr[dest_stride..];
+      }
+      // Then mirror whole rows into the top and bottom padding.
+      let dest_origin = dest[p].data_origin_mut();
+      let mut w = offy * 2;
+      for y in 0..offy {
+        // SAFETY: rows `w` and `y` are never the same row, so the ranges
+        // cannot overlap; slicing both first keeps the copy in bounds.
+        unsafe {
+          copy_nonoverlapping(
+            dest_origin[(dest_stride * w)..][..dest_width].as_ptr(),
+            dest_origin[(dest_stride * y)..][..dest_width].as_mut_ptr(),
+            dest_width,
+          );
+        }
+        w -= 1;
+      }
+
+      w = offy + src_height - 2;
+      for y in (offy + src_height)..dest_height {
+        // SAFETY: rows `w` and `y` are never the same row, so the ranges
+        // cannot overlap; slicing both first keeps the copy in bounds.
+        unsafe {
+          copy_nonoverlapping(
+            dest_origin[(dest_stride * w)..][..dest_width].as_ptr(),
+            dest_origin[(dest_stride * y)..][..dest_width].as_mut_ptr(),
+            dest_width,
+          );
+        }
+        w -= 1;
+      }
+    }
+  }
+
+  // Rounds the filtered samples into `dest`, clamping to `peak` before
+  // narrowing to `T` so that out-of-range values cannot wrap around.
+  fn cast(
+    &self, ebuff: &[f32], dest: &mut [T], dest_width: usize,
+    dest_height: usize, dest_stride: usize, ebp_stride: usize,
+  ) {
+    let peak = u16::cast_from(self.peak);
+    let rows = ebuff.chunks(ebp_stride).zip(dest.chunks_mut(dest_stride));
+    for (src, dest) in rows.take(dest_height) {
+      for (out, val) in dest[..dest_width].iter_mut().zip(&src[..dest_width]) {
+        let rounded = val.mul_add(self.dest_scale, 0.5) as u16;
+        *out = T::cast_from(clamp(rounded, 0, peak));
+      }
+    }
+  }
+
+  // Applies a real-to-complex 3-dimensional FFT to `real`, writing to `output`
+  fn real_to_complex_3d(
+    &mut self, real: &[f32; BLOCK_VOLUME],
+    output: &mut [Complex<f32>; COMPLEX_COUNT],
+  ) {
+    let input =
+      ArrayView3::from_shape((TB_SIZE, SB_SIZE, SB_SIZE), real).unwrap();
+    let mut temp1 = Array3::zeros((TB_SIZE, SB_SIZE, SB_SIZE / 2 + 1));
+    let mut temp2 = Array3::zeros((TB_SIZE, SB_SIZE, SB_SIZE / 2 + 1));
+    let mut output =
+      ArrayViewMut3::from_shape((TB_SIZE, SB_SIZE, SB_SIZE / 2 + 1), output)
+        .unwrap();
+    // One axis at a time: horizontal (real-to-complex), vertical, temporal.
+    ndfft_r2c(&input, &mut temp1, &mut self.fft.0, 2);
+    ndfft(&temp1, &mut temp2, &mut self.fft.1, 1);
+    ndfft(&temp2, &mut output, &mut self.fft.2, 0);
+  }
+
+  // Applies the inverse (complex-to-real) 3-dimensional FFT to `complex`
+  fn complex_to_real_3d(
+    &mut self, complex: &[Complex<f32>; COMPLEX_COUNT],
+    output: &mut [f32; BLOCK_VOLUME],
+  ) {
+    let input =
+      ArrayView3::from_shape((TB_SIZE, SB_SIZE, SB_SIZE / 2 + 1), complex)
+        .unwrap();
+    let mut temp0 = Array3::zeros((TB_SIZE, SB_SIZE, SB_SIZE / 2 + 1));
+    let mut temp1 = Array3::zeros((TB_SIZE, SB_SIZE, SB_SIZE / 2 + 1));
+    let mut output =
+      ArrayViewMut3::from_shape((TB_SIZE, SB_SIZE, SB_SIZE), output).unwrap();
+    // The axes are undone in reverse order: temporal, vertical, horizontal.
+    ndifft(&input, &mut temp0, &mut self.fft.2, 0);
+    ndifft(&temp0, &mut temp1, &mut self.fft.1, 1);
+    ndifft_r2c(&temp1, &mut output, &mut self.fft.0, 2);
+    output.iter_mut().for_each(|d| {
+      *d *= BLOCK_VOLUME as f32; // NOTE(review): presumably undoes ifft scaling
+    });
+  }
+}
+
+#[inline(always)]
+// Distance from `a` up to the next multiple of `b` (0 if already aligned)
+fn extra(a: usize, b: usize) -> usize {
+  match a % b {
+    0 => 0,
+    rem => b - rem,
+  }
+}
+
+// Copies a `width`x`height` block of pixels from `src` to `dest`, like
+// Vapoursynth's `vs_bitblt`; the strides are the distances between rows.
+fn bitblt<T: Pixel>(
+  mut dest: &mut [T], dest_stride: usize, mut src: &[T], src_stride: usize,
+  width: usize, height: usize,
+) {
+  if src_stride == dest_stride && src_stride == width {
+    let len = width * height;
+    return dest[..len].copy_from_slice(&src[..len]);
+  }
+  for _ in 0..height {
+    dest[..width].copy_from_slice(&src[..width]);
+    src = &src[src_stride..];
+    dest = &mut dest[dest_stride..];
+  }
+}
diff --git a/src/fuzzing.rs b/src/fuzzing.rs
index aab9abe059..d440f3f88f 100644
--- a/src/fuzzing.rs
+++ b/src/fuzzing.rs
@@ -257,6 +257,7 @@ impl Arbitrary for ArbitraryEncoder {
       switch_frame_interval: u.int_in_range(0..=3)?,
       tune: *u.choose(&[Tune::Psnr, Tune::Psychovisual])?,
       film_grain_params: None,
+      denoise_strength: u.int_in_range(0..=50)?,
     };
 
     let frame_count =
diff --git a/src/lib.rs b/src/lib.rs
index 3425588db4..05e2270115 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -257,6 +257,7 @@ mod cdef;
 #[doc(hidden)]
 pub mod context;
 mod deblock;
+mod denoise;
 mod encoder;
 mod entropymode;
 mod lrf;
diff --git a/src/test_encode_decode/mod.rs b/src/test_encode_decode/mod.rs
index 9e6082ed55..8bdecca838 100644
--- a/src/test_encode_decode/mod.rs
+++ b/src/test_encode_decode/mod.rs
@@ -10,9 +10,8 @@
 // Fuzzing only uses a subset of these.
 #![cfg_attr(fuzzing, allow(unused))]
 
-use crate::color::ChromaSampling;
-
 use crate::api::config::GrainTableSegment;
+use crate::color::ChromaSampling;
 use crate::util::Pixel;
 use crate::*;
 
diff --git a/src/util/align.rs b/src/util/align.rs
index c86424e8b2..02928698cd 100644
--- a/src/util/align.rs
+++ b/src/util/align.rs
@@ -42,6 +42,20 @@ impl<T> Aligned<T> {
   }
 }
 
+impl<T> std::ops::Deref for Aligned<T> {
+  type Target = T;
+  // Lets an `Aligned<T>` be used wherever a `&T` is expected
+  fn deref(&self) -> &Self::Target {
+    &self.data
+  }
+}
+
+impl<T> std::ops::DerefMut for Aligned<T> {
+  fn deref_mut(&mut self) -> &mut Self::Target {
+    &mut self.data
+  }
+}
+
 /// An analog to a Box<[T]> where the underlying slice is aligned.
 /// Alignment is according to the architecture-specific SIMD constraints.
 pub struct AlignedBoxedSlice<T> {