From 0b0308b8c1b59ad98ef3a3845d317e4239725a16 Mon Sep 17 00:00:00 2001 From: Zen <46526140+master-of-zen@users.noreply.github.com> Date: Tue, 6 Apr 2021 22:30:31 +0300 Subject: [PATCH] Improve scene detection --- src/api/internal.rs | 5 +- src/scenechange/mod.rs | 398 ++++++++++++++++++++--------------------- 2 files changed, 199 insertions(+), 204 deletions(-) diff --git a/src/api/internal.rs b/src/api/internal.rs index bf3ddc2270..6e3eb4bc44 100644 --- a/src/api/internal.rs +++ b/src/api/internal.rs @@ -246,7 +246,7 @@ pub(crate) struct ContextInner { gop_output_frameno_start: BTreeMap, /// Maps `output_frameno` to `gop_input_frameno_start`. pub(crate) gop_input_frameno_start: BTreeMap, - keyframe_detector: SceneChangeDetector, + keyframe_detector: SceneChangeDetector, pub(crate) config: Arc, seq: Arc, pub(crate) rc_state: RCState, @@ -271,7 +271,6 @@ impl ContextInner { let seq = Arc::new(Sequence::new(enc)); let inter_cfg = InterConfig::new(enc); - let lookahead_distance = inter_cfg.keyframe_lookahead_distance() as usize; ContextInner { frame_count: 0, @@ -289,9 +288,7 @@ impl ContextInner { keyframe_detector: SceneChangeDetector::new( *enc, CpuFeatureLevel::default(), - lookahead_distance, seq.clone(), - true, ), config: Arc::new(*enc), seq, diff --git a/src/scenechange/mod.rs b/src/scenechange/mod.rs index 7ea001acc8..39ca9464f3 100644 --- a/src/scenechange/mod.rs +++ b/src/scenechange/mod.rs @@ -14,34 +14,35 @@ use crate::encoder::Sequence; use crate::frame::*; use crate::util::{CastFromPrimitive, Pixel}; use rust_hawktracer::*; -use std::collections::BTreeSet; +use std::cmp; use std::sync::Arc; /// Runs keyframe detection on frames from the lookahead queue. -pub struct SceneChangeDetector { +pub struct SceneChangeDetector { /// Minimum average difference between YUV deltas that will trigger a scene change. - threshold: u64, + threshold: usize, /// Fast scene cut detection mode, uses simple SAD instead of encoder cost estimates. 
fast_mode: bool, - /// Determine whether or not short scene flashes should be excluded - exclude_scene_flashes: bool, - /// Frames that cannot be marked as keyframes due to the algorithm excluding them. - /// Storing the frame numbers allows us to avoid looking back more than one frame. - excluded_frames: BTreeSet, + /// scaling factor for fast scene detection + scale_factor: usize, + // Frame buffer for scaled frames + frame_buffer: Vec>, + // Scenechange results for adaptive threshold + score_deque: Vec, + /// Number of pixels in scaled frame for fast mode + pixels: usize, /// The bit depth of the video. bit_depth: usize, /// The CPU feature level to be used. cpu_feature_level: CpuFeatureLevel, encoder_config: EncoderConfig, - lookahead_distance: usize, sequence: Arc, } -impl SceneChangeDetector { +impl SceneChangeDetector { pub fn new( encoder_config: EncoderConfig, cpu_feature_level: CpuFeatureLevel, - lookahead_distance: usize, sequence: Arc, - exclude_scene_flashes: bool, + sequence: Arc, ) -> Self { // This implementation is based on a Python implementation at // https://pyscenedetect.readthedocs.io/en/latest/reference/detection-methods/. @@ -54,20 +55,37 @@ impl SceneChangeDetector { // This may be adjusted later. // // This threshold is only used for the fast scenecut implementation. 
- const BASE_THRESHOLD: u64 = 12; + const BASE_THRESHOLD: usize = 25; let bit_depth = encoder_config.bit_depth; let fast_mode = encoder_config.speed_settings.fast_scene_detection || encoder_config.low_latency; + // Scale factor for fast scene detection + let scale_factor = + if fast_mode { detect_scale_factor(&sequence) } else { 1_usize }; + + let score_deque = Vec::with_capacity(5); + // Pixel count for fast scenedetect + + let pixels = if fast_mode { + (sequence.max_frame_height as usize / scale_factor) + * (sequence.max_frame_width as usize / scale_factor) + } else { + 1 + }; + + let frame_buffer = Vec::with_capacity(2); + Self { - threshold: BASE_THRESHOLD * bit_depth as u64 / 8, + threshold: BASE_THRESHOLD * bit_depth / 8, fast_mode, - exclude_scene_flashes, - excluded_frames: BTreeSet::new(), + scale_factor, + frame_buffer, + score_deque, + pixels, bit_depth, cpu_feature_level, encoder_config, - lookahead_distance, sequence, } } @@ -81,14 +99,14 @@ impl SceneChangeDetector { /// /// This will gracefully handle the first frame in the video as well. #[hawktracer(analyze_next_frame)] - pub fn analyze_next_frame( + pub fn analyze_next_frame( &mut self, frame_set: &[Arc>], input_frameno: u64, previous_keyframe: u64, ) -> bool { // Find the distance to the previous keyframe. let distance = input_frameno - previous_keyframe; - // Handle minimum and maximum key frame intervals. + // Handle minimum and maximum keyframe intervals. if distance < self.encoder_config.min_key_frame_interval { return false; } @@ -100,38 +118,21 @@ impl SceneChangeDetector { return false; } - if self.exclude_scene_flashes { - self.exclude_scene_flashes(frame_set, input_frameno, previous_keyframe); - } - - self.is_key_frame( - frame_set[0].clone(), - frame_set[1].clone(), - input_frameno, - previous_keyframe, - ) - } - - /// Determines if `current_frame` should be a keyframe. 
- fn is_key_frame( - &self, previous_frame: Arc>, current_frame: Arc>, - current_frameno: u64, previous_keyframe: u64, - ) -> bool { - if self.excluded_frames.contains(&current_frameno) { - return false; - } + // Set our scenecut method + let result = if self.fast_mode { + self.fast_scenecut(frame_set[0].clone(), frame_set[1].clone()) + } else { + self.cost_scenecut( + frame_set[0].clone(), + frame_set[1].clone(), + input_frameno, + previous_keyframe, + ) + }; - let result = self.has_scenecut( - previous_frame, - current_frame, - current_frameno, - previous_keyframe, - ); debug!( - "[SC-Detect] Frame {} to {}: I={:.3} T={:.3} P={:.3} {}", - current_frameno - 1, - current_frameno, - result.intra_cost, + "[SC-Detect] Frame {}: T={:.1} P={:.1} {}", + input_frameno, result.threshold, result.inter_cost, if result.has_scenecut { "Scenecut" } else { "No cut" } @@ -139,176 +140,149 @@ impl SceneChangeDetector { result.has_scenecut } - /// Uses lookahead to avoid coding short flashes as scenecuts. - /// Saves excluded frame numbers in `self.excluded_frames`. - fn exclude_scene_flashes( - &mut self, frame_subset: &[Arc>], frameno: u64, - previous_keyframe: u64, - ) { - let lookahead_distance = self.lookahead_distance; - - if frame_subset.len() - 1 < lookahead_distance { - // Don't add a keyframe in the last frame pyramid. - // It's effectively the same as a scene flash, - // and really wasteful for compression. - for frame in frameno..=(frameno + lookahead_distance as u64) { - self.excluded_frames.insert(frame); - } - return; - } + /// Compares current scene score to adapted threshold based on previous scores + /// Returns true if current scene score is higher than adapted threshold + fn adaptive_scenecut(&mut self, scene_score: f64) -> bool { + if self.score_deque.is_empty() { + true // we skip high delta on first frame comparison as it's probably inside flashing or high motion scene + } else { + let max_of_deque: f64 = self + .score_deque + .iter() + .cloned() + .fold(-1. / 0. 
/* -inf */, f64::max); // max of last n(5) frames - // Where A and B are scenes: AAAAAABBBAAAAAA - // If BBB is shorter than lookahead_distance, it is detected as a flash - // and not considered a scenecut. - // - // Search starting with the furthest frame, - // to enable early loop exit if we find a scene flash. - for j in (1..=lookahead_distance).rev() { - let result = self.has_scenecut( - frame_subset[0].clone(), - frame_subset[j].clone(), - frameno - 1 + j as u64, - previous_keyframe, - ); + // + let scenecut = scene_score > self.threshold as f64 + max_of_deque; debug!( - "[SF-Detect-1] Frame {} to {}: I={:.3} T={:.3} P={:.3} {}", - frameno - 1, - frameno - 1 + j as u64, - result.intra_cost, - result.threshold, - result.inter_cost, - if result.has_scenecut { "No flash" } else { "Scene flash" } + "[SC-Detect] P: {:.1} {:.1?} Cut: {}", + scene_score, self.score_deque, scenecut ); - if !result.has_scenecut { - // Any frame in between `0` and `j` cannot be a real scenecut. - for i in 0..=j { - let frameno = frameno + i as u64 - 1; - self.excluded_frames.insert(frameno); - } - // Because all frames in this gap are already excluded, - // exit the loop early as an optimization. - break; - } + scenecut } + } - // Where A-F are scenes: AAAAABBCCDDEEFFFFFF - // If each of BB ... EE are shorter than `lookahead_distance`, they are - // detected as flashes and not considered scenecuts. - // Instead, the first F frame becomes a scenecut. - // If the video ends before F, no frame becomes a scenecut. 
- for i in 1..lookahead_distance { - let result = self.has_scenecut( - frame_subset[i].clone(), - frame_subset[lookahead_distance].clone(), - frameno - 1 + lookahead_distance as u64, - previous_keyframe, - ); - debug!( - "[SF-Detect-2] Frame {} to {}: I={:.3} T={:.3} P={:.3} {}", - frameno - 1 + i as u64, - frameno - 1 + lookahead_distance as u64, - result.intra_cost, - result.threshold, - result.inter_cost, - if result.has_scenecut { "Scene flash" } else { "No flash" } - ); - if result.has_scenecut { - // If the current frame is the frame before a scenecut, it cannot also be the frame of a scenecut. - let frameno = frameno + i as u64 - 1; - self.excluded_frames.insert(frameno); + /// The fast algorithm detects fast cuts using a raw difference + /// in pixel values between the scaled frames. + #[hawktracer(fast_scenecut)] + fn fast_scenecut( + &mut self, frame1: Arc>, frame2: Arc>, + ) -> ScenecutResult { + // Downscaling both frames for comparison + // Moving scaled frames to buffer + if self.frame_buffer.is_empty() { + let frame1_scaled = frame1.planes[0].downscale(self.scale_factor); + self.frame_buffer.push(frame1_scaled); + + let frame2_scaled = frame2.planes[0].downscale(self.scale_factor); + self.frame_buffer.push(frame2_scaled); + } else { + self.frame_buffer.remove(0); + self.frame_buffer.push(frame2.planes[0].downscale(self.scale_factor)); + } + + let delta = + self.delta_in_planes(&self.frame_buffer[0], &self.frame_buffer[1]); + + // Adaptive scenecut check; + let scenecut = + delta >= self.threshold as f64 && self.adaptive_scenecut(delta); + + if scenecut { + // Clear buffers + self.frame_buffer.clear(); + self.score_deque.clear(); + } else { + // Keep score deque 5 frames + self.score_deque.push(delta as f64); + if self.score_deque.len() > 5 { + self.score_deque.remove(0); } } + + ScenecutResult { + intra_cost: self.threshold as f64, + threshold: self.threshold as f64, + inter_cost: delta as f64, + has_scenecut: scenecut, + } } /// Run a comparison 
between two frames to determine if they qualify for a scenecut. /// - /// The standard algorithm uses block intra and inter costs + /// Using block intra and inter costs /// to determine which method would be more efficient /// for coding this frame. - /// - /// The fast algorithm detects fast cuts using a raw difference - /// in pixel values between the frames. - /// It does not handle pans well, but the scene flash detection compensates for this - /// in many cases. - fn has_scenecut( + #[hawktracer(cost_scenecut)] + fn cost_scenecut( &self, frame1: Arc>, frame2: Arc>, frameno: u64, previous_keyframe: u64, ) -> ScenecutResult { - if self.fast_mode { - let len = frame2.planes[0].cfg.width * frame2.planes[0].cfg.height; - let delta = self.delta_in_planes(&frame1.planes[0], &frame2.planes[0]); - let threshold = self.threshold * len as u64; - ScenecutResult { - intra_cost: threshold as f64, - threshold: threshold as f64, - inter_cost: delta as f64, - has_scenecut: delta >= threshold, - } - } else { - let frame2_ref2 = Arc::clone(&frame2); - let (intra_cost, inter_cost) = crate::rayon::join( - move || { - let intra_costs = estimate_intra_costs( - &*frame2, - self.bit_depth, - self.cpu_feature_level, - ); - intra_costs.iter().map(|&cost| cost as u64).sum::() as f64 - / intra_costs.len() as f64 - }, - move || { - let inter_costs = estimate_inter_costs( - frame2_ref2, - frame1, - self.bit_depth, - self.encoder_config, - self.sequence.clone(), - ); - inter_costs.iter().map(|&cost| cost as u64).sum::() as f64 - / inter_costs.len() as f64 - }, - ); + let frame2_ref2 = Arc::clone(&frame2); + let (intra_cost, inter_cost) = crate::rayon::join( + move || { + let intra_costs = estimate_intra_costs( + &*frame2, + self.bit_depth, + self.cpu_feature_level, + ); + intra_costs.iter().map(|&cost| cost as u64).sum::() as f64 + / intra_costs.len() as f64 + }, + move || { + let inter_costs = estimate_inter_costs( + frame2_ref2, + frame1, + self.bit_depth, + self.encoder_config, + 
self.sequence.clone(), + ); + inter_costs.iter().map(|&cost| cost as u64).sum::() as f64 + / inter_costs.len() as f64 + }, + ); - // Sliding scale, more likely to choose a keyframe - // as we get farther from the last keyframe. - // Based on x264 scenecut code. - // - // `THRESH_MAX` determines how likely we are - // to choose a keyframe, between 0.0-1.0. - // Higher values mean we are more likely to choose a keyframe. - // `0.4` was chosen based on trials of the `scenecut-720p` set in AWCY, - // as it appeared to provide the best average compression. - // This also matches the default scenecut threshold in x264. - const THRESH_MAX: f64 = 0.4; - const THRESH_MIN: f64 = THRESH_MAX * 0.25; - let distance_from_keyframe = frameno - previous_keyframe; - let min_keyint = self.encoder_config.min_key_frame_interval; - let max_keyint = self.encoder_config.max_key_frame_interval; - let bias = if distance_from_keyframe <= min_keyint / 4 { - THRESH_MIN / 4.0 - } else if distance_from_keyframe <= min_keyint { - THRESH_MIN * distance_from_keyframe as f64 / min_keyint as f64 - } else { - THRESH_MIN - + (THRESH_MAX - THRESH_MIN) - * (distance_from_keyframe - min_keyint) as f64 - / (max_keyint - min_keyint) as f64 - }; - let threshold = intra_cost * (1.0 - bias); + // Sliding scale, more likely to choose a keyframe + // as we get farther from the last keyframe. + // Based on x264 scenecut code. + // + // `THRESH_MAX` determines how likely we are + // to choose a keyframe, between 0.0-1.0. + // Higher values mean we are more likely to choose a keyframe. + // `0.4` was chosen based on trials of the `scenecut-720p` set in AWCY, + // as it appeared to provide the best average compression. + // This also matches the default scenecut threshold in x264. 
+ const THRESH_MAX: f64 = 0.4; + const THRESH_MIN: f64 = THRESH_MAX * 0.25; + let distance_from_keyframe = frameno - previous_keyframe; + let min_keyint = self.encoder_config.min_key_frame_interval; + let max_keyint = self.encoder_config.max_key_frame_interval; + let bias = if distance_from_keyframe <= min_keyint / 4 { + THRESH_MIN / 4.0 + } else if distance_from_keyframe <= min_keyint { + THRESH_MIN * distance_from_keyframe as f64 / min_keyint as f64 + } else { + THRESH_MIN + + (THRESH_MAX - THRESH_MIN) + * (distance_from_keyframe - min_keyint) as f64 + / (max_keyint - min_keyint) as f64 + }; + let threshold = intra_cost * (1.0 - bias); - ScenecutResult { - intra_cost, - threshold, - inter_cost, - has_scenecut: inter_cost > threshold, - } + ScenecutResult { + intra_cost, + threshold, + inter_cost, + has_scenecut: inter_cost > threshold, } } - fn delta_in_planes( - &self, plane1: &Plane, plane2: &Plane, - ) -> u64 { + /// Calculates delta between 2 planes + /// returns average per pixel + #[hawktracer(delta_in_planes)] + fn delta_in_planes(&self, plane1: &Plane, plane2: &Plane) -> f64 { let mut delta = 0; + let lines = plane1.rows_iter().zip(plane2.rows_iter()); for (l1, l2) in lines { @@ -316,15 +290,39 @@ impl SceneChangeDetector { .iter() .zip(l2.iter()) .map(|(&p1, &p2)| { - (i16::cast_from(p1) - i16::cast_from(p2)).abs() as u64 + (i16::cast_from(p1) - i16::cast_from(p2)).abs() as usize }) - .sum::(); + .sum::(); delta += delta_line; } - delta + delta as f64 / self.pixels as f64 } } +/// Scaling factor for frame in scene detection +fn detect_scale_factor(sequence: &Arc) -> usize { + let small_edge = + cmp::min(sequence.max_frame_height, sequence.max_frame_width) as usize; + let scale_factor = match small_edge { + 0..=240 => 1, + 241..=480 => 2, + 481..=720 => 4, + 721..=1080 => 8, + 1081..=1600 => 16, + 1601..=std::usize::MAX => 32, + _ => 1, + } as usize; + debug!( + "Scene detection scale factor {}, [{},{}] -> [{},{}]", + scale_factor, 
sequence.max_frame_width, + sequence.max_frame_height, + sequence.max_frame_width as usize / scale_factor, + sequence.max_frame_height as usize / scale_factor + ); + scale_factor +} + /// This struct primarily exists for returning metrics to the caller /// for logging debug information. #[derive(Debug, Clone, Copy)]