| 1 | // Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved |
| 2 | // Copyright (c) 2017-2022, The rav1e contributors. All rights reserved |
| 3 | // |
| 4 | // This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | // was not distributed with this source code in the LICENSE file, you can |
| 7 | // obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | // Media Patent License 1.0 was not distributed with this source code in the |
| 9 | // PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| 10 | |
| 11 | #![allow (non_camel_case_types)] |
| 12 | |
| 13 | use crate::api::*; |
| 14 | use crate::cdef::*; |
| 15 | use crate::context::*; |
| 16 | use crate::cpu_features::CpuFeatureLevel; |
| 17 | use crate::deblock::*; |
| 18 | use crate::dist::*; |
| 19 | use crate::ec::{Writer, WriterCounter, OD_BITRES}; |
| 20 | use crate::encode_block_with_modes; |
| 21 | use crate::encoder::{FrameInvariants, IMPORTANCE_BLOCK_SIZE}; |
| 22 | use crate::frame::Frame; |
| 23 | use crate::frame::*; |
| 24 | use crate::header::ReferenceMode; |
| 25 | use crate::lrf::*; |
| 26 | use crate::mc::MotionVector; |
| 27 | use crate::me::estimate_motion; |
| 28 | use crate::me::MVSamplingMode; |
| 29 | use crate::me::MotionSearchResult; |
| 30 | use crate::motion_compensate; |
| 31 | use crate::partition::PartitionType::*; |
| 32 | use crate::partition::RefType::*; |
| 33 | use crate::partition::*; |
| 34 | use crate::predict::{ |
| 35 | luma_ac, AngleDelta, IntraEdgeFilterParameters, IntraParam, PredictionMode, |
| 36 | RAV1E_INTER_COMPOUND_MODES, RAV1E_INTER_MODES_MINIMAL, RAV1E_INTRA_MODES, |
| 37 | }; |
| 38 | use crate::rdo_tables::*; |
| 39 | use crate::tiling::*; |
| 40 | use crate::transform::{TxSet, TxSize, TxType, RAV1E_TX_TYPES}; |
| 41 | use crate::util::{init_slice_repeat_mut, Aligned, Pixel}; |
| 42 | use crate::write_tx_blocks; |
| 43 | use crate::write_tx_tree; |
| 44 | use crate::Tune; |
| 45 | use crate::{encode_block_post_cdef, encode_block_pre_cdef}; |
| 46 | |
| 47 | use arrayvec::*; |
| 48 | use itertools::izip; |
| 49 | use std::fmt; |
| 50 | use std::mem::MaybeUninit; |
| 51 | |
/// Selects which distortion domain and which rate measure RDO uses.
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum RDOType {
  /// Pixel-domain distortion and exact ec rate
  PixelDistRealRate,
  /// Tx-domain distortion and exact ec rate
  TxDistRealRate,
  /// Tx-domain distortion and txdist-based rate
  TxDistEstRate,
}

impl RDOType {
  /// Returns `true` for the variants that measure distortion in the
  /// transform domain.
  #[inline]
  pub const fn needs_tx_dist(self) -> bool {
    match self {
      Self::PixelDistRealRate => false,
      Self::TxDistRealRate | Self::TxDistEstRate => true,
    }
  }

  /// Returns `true` for the variants that use the exact coefficient rate
  /// rather than the txdist-based estimate.
  #[inline]
  pub const fn needs_coeff_rate(self) -> bool {
    match self {
      Self::PixelDistRealRate | Self::TxDistRealRate => true,
      Self::TxDistEstRate => false,
    }
  }
}
| 80 | |
/// Bundles a partition type with its RD cost and the parameters of the
/// blocks it is made of.
#[derive(Clone)]
pub struct PartitionGroupParameters {
  /// RD cost associated with this partition group.
  pub rd_cost: f64,
  /// Partition type the group describes.
  pub part_type: PartitionType,
  /// Per-block parameters; the `ArrayVec` capacity allows at most four.
  pub part_modes: ArrayVec<PartitionParameters, 4>,
}
| 87 | |
/// Coding decisions recorded for a single block during RDO.
///
/// `luma_chroma_mode_rdo` overwrites the mode-related fields whenever it
/// finds a candidate whose RD cost is lower than `rd_cost`.
#[derive(Clone, Debug)]
pub struct PartitionParameters {
  /// RD cost of the recorded decision (`f64::MAX` in the `Default` value).
  pub rd_cost: f64,
  /// Block offset of the block within its tile.
  pub bo: TileBlockOffset,
  /// Size of the block.
  pub bsize: BlockSize,
  /// Luma prediction mode.
  pub pred_mode_luma: PredictionMode,
  /// Chroma prediction mode.
  pub pred_mode_chroma: PredictionMode,
  /// CFL parameters used with the chroma prediction mode.
  pub pred_cfl_params: CFLParams,
  /// Angle delta passed to the block encoder together with the modes.
  pub angle_delta: AngleDelta,
  /// Reference frames; the `Default` value is `[INTRA_FRAME, NONE_FRAME]`.
  pub ref_frames: [RefType; 2],
  /// Motion vectors (two slots, like `ref_frames`).
  pub mvs: [MotionVector; 2],
  /// Skip flag the block was evaluated with.
  pub skip: bool,
  /// `has_coeff` flag as returned by `encode_block_post_cdef`.
  pub has_coeff: bool,
  /// Transform size.
  pub tx_size: TxSize,
  /// Transform type.
  pub tx_type: TxType,
  /// Segmentation index.
  pub sidx: u8,
}
| 105 | |
| 106 | impl Default for PartitionParameters { |
| 107 | fn default() -> Self { |
| 108 | PartitionParameters { |
| 109 | rd_cost: std::f64::MAX, |
| 110 | bo: TileBlockOffset::default(), |
| 111 | bsize: BlockSize::BLOCK_32X32, |
| 112 | pred_mode_luma: PredictionMode::default(), |
| 113 | pred_mode_chroma: PredictionMode::default(), |
| 114 | pred_cfl_params: CFLParams::default(), |
| 115 | angle_delta: AngleDelta::default(), |
| 116 | ref_frames: [RefType::INTRA_FRAME, RefType::NONE_FRAME], |
| 117 | mvs: [MotionVector::default(); 2], |
| 118 | skip: false, |
| 119 | has_coeff: true, |
| 120 | tx_size: TxSize::TX_4X4, |
| 121 | tx_type: TxType::DCT_DCT, |
| 122 | sidx: 0, |
| 123 | } |
| 124 | } |
| 125 | } |
| 126 | |
| 127 | pub fn estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64 { |
| 128 | let bs_index: usize = ts as usize; |
| 129 | let q_bin_idx: usize = (qindex as usize) / RDO_QUANT_DIV; |
| 130 | let bin_idx_down: u64 = |
| 131 | ((fast_distortion) / RATE_EST_BIN_SIZE).min((RDO_NUM_BINS - 2) as u64); |
| 132 | let bin_idx_up: u64 = (bin_idx_down + 1).min((RDO_NUM_BINS - 1) as u64); |
| 133 | let x0: i64 = (bin_idx_down * RATE_EST_BIN_SIZE) as i64; |
| 134 | let x1: i64 = (bin_idx_up * RATE_EST_BIN_SIZE) as i64; |
| 135 | let y0: i64 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_down as usize] as i64; |
| 136 | let y1: i64 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_up as usize] as i64; |
| 137 | let slope: i64 = ((y1 - y0) << 8) / (x1 - x0); |
| 138 | (y0 + (((fast_distortion as i64 - x0) * slope) >> 8)).max(0) as u64 |
| 139 | } |
| 140 | |
| 141 | #[allow (unused)] |
| 142 | pub fn cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>( |
| 143 | src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, |
| 144 | bit_depth: usize, compute_bias: F, cpu: CpuFeatureLevel, |
| 145 | ) -> Distortion { |
| 146 | debug_assert!(src1.plane_cfg.xdec == 0); |
| 147 | debug_assert!(src1.plane_cfg.ydec == 0); |
| 148 | debug_assert!(src2.plane_cfg.xdec == 0); |
| 149 | debug_assert!(src2.plane_cfg.ydec == 0); |
| 150 | |
| 151 | let mut sum = Distortion::zero(); |
| 152 | for y in (0..h).step_by(8) { |
| 153 | for x in (0..w).step_by(8) { |
| 154 | let kernel_h = (h - y).min(8); |
| 155 | let kernel_w = (w - x).min(8); |
| 156 | let area = Area::StartingAt { x: x as isize, y: y as isize }; |
| 157 | |
| 158 | let value = RawDistortion(cdef_dist_kernel( |
| 159 | &src1.subregion(area), |
| 160 | &src2.subregion(area), |
| 161 | kernel_w, |
| 162 | kernel_h, |
| 163 | bit_depth, |
| 164 | cpu, |
| 165 | ) as u64); |
| 166 | |
| 167 | // cdef is always called on non-subsampled planes, so BLOCK_8X8 is |
| 168 | // correct here. |
| 169 | sum += value * compute_bias(area, BlockSize::BLOCK_8X8); |
| 170 | } |
| 171 | } |
| 172 | sum |
| 173 | } |
| 174 | |
| 175 | /// Sum of Squared Error for a wxh block |
| 176 | /// Currently limited to w and h of valid blocks |
| 177 | pub fn sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>( |
| 178 | src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, |
| 179 | compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel, |
| 180 | ) -> Distortion { |
| 181 | // See get_weighted_sse in src/dist.rs. |
| 182 | // Provide a scale to get_weighted_sse for each square region of this size. |
| 183 | const CHUNK_SIZE: usize = IMPORTANCE_BLOCK_SIZE >> 1; |
| 184 | |
| 185 | // To bias the distortion correctly, compute it in blocks up to the size |
| 186 | // importance block size in a non-subsampled plane. |
| 187 | let imp_block_w = CHUNK_SIZE << src1.plane_cfg.xdec; |
| 188 | let imp_block_h = CHUNK_SIZE << src1.plane_cfg.ydec; |
| 189 | |
| 190 | let imp_bsize = BlockSize::from_width_and_height(imp_block_w, imp_block_h); |
| 191 | |
| 192 | let n_imp_blocks_w = (w + CHUNK_SIZE - 1) / CHUNK_SIZE; |
| 193 | let n_imp_blocks_h = (h + CHUNK_SIZE - 1) / CHUNK_SIZE; |
| 194 | |
| 195 | // TODO: Copying biases into a buffer is slow. It would be best if biases were |
| 196 | // passed directly. To do this, we would need different versions of the |
| 197 | // weighted sse function for decimated/subsampled data. Also requires |
| 198 | // eliminating use of unbiased sse. |
| 199 | // It should also be noted that the current copy code does not auto-vectorize. |
| 200 | |
| 201 | // Copy biases into a buffer. |
| 202 | let mut buf_storage = Aligned::new( |
| 203 | [MaybeUninit::<u32>::uninit(); 128 / CHUNK_SIZE * 128 / CHUNK_SIZE], |
| 204 | ); |
| 205 | let buf_stride = n_imp_blocks_w.next_power_of_two(); |
| 206 | let buf = init_slice_repeat_mut( |
| 207 | &mut buf_storage.data[..buf_stride * n_imp_blocks_h], |
| 208 | 0, |
| 209 | ); |
| 210 | |
| 211 | for block_y in 0..n_imp_blocks_h { |
| 212 | for block_x in 0..n_imp_blocks_w { |
| 213 | let block = Area::StartingAt { |
| 214 | x: (block_x * CHUNK_SIZE) as isize, |
| 215 | y: (block_y * CHUNK_SIZE) as isize, |
| 216 | }; |
| 217 | buf[block_y * buf_stride + block_x] = compute_bias(block, imp_bsize).0; |
| 218 | } |
| 219 | } |
| 220 | |
| 221 | Distortion(get_weighted_sse( |
| 222 | src1, src2, buf, buf_stride, w, h, bit_depth, cpu, |
| 223 | )) |
| 224 | } |
| 225 | |
| 226 | pub const fn clip_visible_bsize( |
| 227 | frame_w: usize, frame_h: usize, bsize: BlockSize, x: usize, y: usize, |
| 228 | ) -> (usize, usize) { |
| 229 | let blk_w: usize = bsize.width(); |
| 230 | let blk_h: usize = bsize.height(); |
| 231 | |
| 232 | let visible_w: usize = if x + blk_w <= frame_w { |
| 233 | blk_w |
| 234 | } else if x >= frame_w { |
| 235 | 0 |
| 236 | } else { |
| 237 | frame_w - x |
| 238 | }; |
| 239 | |
| 240 | let visible_h: usize = if y + blk_h <= frame_h { |
| 241 | blk_h |
| 242 | } else if y >= frame_h { |
| 243 | 0 |
| 244 | } else { |
| 245 | frame_h - y |
| 246 | }; |
| 247 | |
| 248 | (visible_w, visible_h) |
| 249 | } |
| 250 | |
| 251 | // Compute the pixel-domain distortion for an encode |
| 252 | fn compute_distortion<T: Pixel>( |
| 253 | fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize, |
| 254 | is_chroma_block: bool, tile_bo: TileBlockOffset, luma_only: bool, |
| 255 | ) -> ScaledDistortion { |
| 256 | let area = Area::BlockStartingAt { bo: tile_bo.0 }; |
| 257 | let input_region = ts.input_tile.planes[0].subregion(area); |
| 258 | let rec_region = ts.rec.planes[0].subregion(area); |
| 259 | |
| 260 | // clip a block to have visible pixles only |
| 261 | let frame_bo = ts.to_frame_block_offset(tile_bo); |
| 262 | let (visible_w, visible_h) = clip_visible_bsize( |
| 263 | fi.width, |
| 264 | fi.height, |
| 265 | bsize, |
| 266 | frame_bo.0.x << MI_SIZE_LOG2, |
| 267 | frame_bo.0.y << MI_SIZE_LOG2, |
| 268 | ); |
| 269 | |
| 270 | if visible_w == 0 || visible_h == 0 { |
| 271 | return ScaledDistortion::zero(); |
| 272 | } |
| 273 | |
| 274 | let mut distortion = match fi.config.tune { |
| 275 | Tune::Psychovisual => cdef_dist_wxh( |
| 276 | &input_region, |
| 277 | &rec_region, |
| 278 | visible_w, |
| 279 | visible_h, |
| 280 | fi.sequence.bit_depth, |
| 281 | |bias_area, bsize| { |
| 282 | distortion_scale( |
| 283 | fi, |
| 284 | input_region.subregion(bias_area).frame_block_offset(), |
| 285 | bsize, |
| 286 | ) |
| 287 | }, |
| 288 | fi.cpu_feature_level, |
| 289 | ), |
| 290 | Tune::Psnr => sse_wxh( |
| 291 | &input_region, |
| 292 | &rec_region, |
| 293 | visible_w, |
| 294 | visible_h, |
| 295 | |bias_area, bsize| { |
| 296 | distortion_scale( |
| 297 | fi, |
| 298 | input_region.subregion(bias_area).frame_block_offset(), |
| 299 | bsize, |
| 300 | ) |
| 301 | }, |
| 302 | fi.sequence.bit_depth, |
| 303 | fi.cpu_feature_level, |
| 304 | ), |
| 305 | } * fi.dist_scale[0]; |
| 306 | |
| 307 | if is_chroma_block |
| 308 | && !luma_only |
| 309 | && fi.sequence.chroma_sampling != ChromaSampling::Cs400 |
| 310 | { |
| 311 | let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; |
| 312 | let chroma_w = if bsize.width() >= 8 || xdec == 0 { |
| 313 | (visible_w + xdec) >> xdec |
| 314 | } else { |
| 315 | (4 + visible_w + xdec) >> xdec |
| 316 | }; |
| 317 | let chroma_h = if bsize.height() >= 8 || ydec == 0 { |
| 318 | (visible_h + ydec) >> ydec |
| 319 | } else { |
| 320 | (4 + visible_h + ydec) >> ydec |
| 321 | }; |
| 322 | |
| 323 | for p in 1..3 { |
| 324 | let input_region = ts.input_tile.planes[p].subregion(area); |
| 325 | let rec_region = ts.rec.planes[p].subregion(area); |
| 326 | distortion += sse_wxh( |
| 327 | &input_region, |
| 328 | &rec_region, |
| 329 | chroma_w, |
| 330 | chroma_h, |
| 331 | |bias_area, bsize| { |
| 332 | distortion_scale( |
| 333 | fi, |
| 334 | input_region.subregion(bias_area).frame_block_offset(), |
| 335 | bsize, |
| 336 | ) |
| 337 | }, |
| 338 | fi.sequence.bit_depth, |
| 339 | fi.cpu_feature_level, |
| 340 | ) * fi.dist_scale[p]; |
| 341 | } |
| 342 | } |
| 343 | distortion |
| 344 | } |
| 345 | |
| 346 | // Compute the transform-domain distortion for an encode |
| 347 | fn compute_tx_distortion<T: Pixel>( |
| 348 | fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize, |
| 349 | is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion, |
| 350 | skip: bool, luma_only: bool, |
| 351 | ) -> ScaledDistortion { |
| 352 | assert!(fi.config.tune == Tune::Psnr); |
| 353 | let area = Area::BlockStartingAt { bo: tile_bo.0 }; |
| 354 | let input_region = ts.input_tile.planes[0].subregion(area); |
| 355 | let rec_region = ts.rec.planes[0].subregion(area); |
| 356 | |
| 357 | let (visible_w, visible_h) = if !skip { |
| 358 | (bsize.width(), bsize.height()) |
| 359 | } else { |
| 360 | let frame_bo = ts.to_frame_block_offset(tile_bo); |
| 361 | clip_visible_bsize( |
| 362 | fi.width, |
| 363 | fi.height, |
| 364 | bsize, |
| 365 | frame_bo.0.x << MI_SIZE_LOG2, |
| 366 | frame_bo.0.y << MI_SIZE_LOG2, |
| 367 | ) |
| 368 | }; |
| 369 | |
| 370 | if visible_w == 0 || visible_h == 0 { |
| 371 | return ScaledDistortion::zero(); |
| 372 | } |
| 373 | |
| 374 | let mut distortion = if skip { |
| 375 | sse_wxh( |
| 376 | &input_region, |
| 377 | &rec_region, |
| 378 | visible_w, |
| 379 | visible_h, |
| 380 | |bias_area, bsize| { |
| 381 | distortion_scale( |
| 382 | fi, |
| 383 | input_region.subregion(bias_area).frame_block_offset(), |
| 384 | bsize, |
| 385 | ) |
| 386 | }, |
| 387 | fi.sequence.bit_depth, |
| 388 | fi.cpu_feature_level, |
| 389 | ) * fi.dist_scale[0] |
| 390 | } else { |
| 391 | tx_dist |
| 392 | }; |
| 393 | |
| 394 | if is_chroma_block |
| 395 | && !luma_only |
| 396 | && skip |
| 397 | && fi.sequence.chroma_sampling != ChromaSampling::Cs400 |
| 398 | { |
| 399 | let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; |
| 400 | let chroma_w = if bsize.width() >= 8 || xdec == 0 { |
| 401 | (visible_w + xdec) >> xdec |
| 402 | } else { |
| 403 | (4 + visible_w + xdec) >> xdec |
| 404 | }; |
| 405 | let chroma_h = if bsize.height() >= 8 || ydec == 0 { |
| 406 | (visible_h + ydec) >> ydec |
| 407 | } else { |
| 408 | (4 + visible_h + ydec) >> ydec |
| 409 | }; |
| 410 | |
| 411 | for p in 1..3 { |
| 412 | let input_region = ts.input_tile.planes[p].subregion(area); |
| 413 | let rec_region = ts.rec.planes[p].subregion(area); |
| 414 | distortion += sse_wxh( |
| 415 | &input_region, |
| 416 | &rec_region, |
| 417 | chroma_w, |
| 418 | chroma_h, |
| 419 | |bias_area, bsize| { |
| 420 | distortion_scale( |
| 421 | fi, |
| 422 | input_region.subregion(bias_area).frame_block_offset(), |
| 423 | bsize, |
| 424 | ) |
| 425 | }, |
| 426 | fi.sequence.bit_depth, |
| 427 | fi.cpu_feature_level, |
| 428 | ) * fi.dist_scale[p]; |
| 429 | } |
| 430 | } |
| 431 | distortion |
| 432 | } |
| 433 | |
| 434 | /// Compute a scaling factor to multiply the distortion of a block by, |
| 435 | /// this factor is determined using temporal RDO. |
| 436 | /// |
| 437 | /// # Panics |
| 438 | /// |
| 439 | /// - If called with `bsize` of 8x8 or smaller |
| 440 | /// - If the coded frame data doesn't exist on the `FrameInvariants` |
| 441 | pub fn distortion_scale<T: Pixel>( |
| 442 | fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize, |
| 443 | ) -> DistortionScale { |
| 444 | if !fi.config.temporal_rdo() { |
| 445 | return DistortionScale::default(); |
| 446 | } |
| 447 | // EncoderConfig::temporal_rdo() should always return false in situations |
| 448 | // where distortion is computed on > 8x8 blocks, so we should never hit this |
| 449 | // assert. |
| 450 | assert!(bsize <= BlockSize::BLOCK_8X8); |
| 451 | |
| 452 | let x: usize = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; |
| 453 | let y: usize = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; |
| 454 | |
| 455 | let coded_data: &CodedFrameData = fi.coded_frame_data.as_ref().unwrap(); |
| 456 | coded_data.distortion_scales[y * coded_data.w_in_imp_b + x] |
| 457 | } |
| 458 | |
| 459 | /// # Panics |
| 460 | /// |
| 461 | /// - If the coded frame data doesn't exist on the `FrameInvariants` |
| 462 | pub fn spatiotemporal_scale<T: Pixel>( |
| 463 | fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize, |
| 464 | ) -> DistortionScale { |
| 465 | if !fi.config.temporal_rdo() && fi.config.tune != Tune::Psychovisual { |
| 466 | return DistortionScale::default(); |
| 467 | } |
| 468 | |
| 469 | let coded_data = fi.coded_frame_data.as_ref().unwrap(); |
| 470 | |
| 471 | let x0 = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; |
| 472 | let y0 = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; |
| 473 | let x1 = (x0 + bsize.width_imp_b()).min(coded_data.w_in_imp_b); |
| 474 | let y1 = (y0 + bsize.height_imp_b()).min(coded_data.h_in_imp_b); |
| 475 | let den = (((x1 - x0) * (y1 - y0)) as u64) << DistortionScale::SHIFT; |
| 476 | |
| 477 | // calling this on each slice individually improves autovectorization |
| 478 | // compared to using `Iterator::take` |
| 479 | #[inline (always)] |
| 480 | fn take_slice<T>(slice: &[T], n: usize) -> &[T] { |
| 481 | slice.get(..n).unwrap_or(slice) |
| 482 | } |
| 483 | |
| 484 | let mut sum = 0; |
| 485 | for y in y0..y1 { |
| 486 | sum += take_slice( |
| 487 | &coded_data.distortion_scales[y * coded_data.w_in_imp_b..][x0..x1], |
| 488 | MAX_SB_IN_IMP_B, |
| 489 | ) |
| 490 | .iter() |
| 491 | .zip( |
| 492 | take_slice( |
| 493 | &coded_data.activity_scales[y * coded_data.w_in_imp_b..][x0..x1], |
| 494 | MAX_SB_IN_IMP_B, |
| 495 | ) |
| 496 | .iter(), |
| 497 | ) |
| 498 | .map(|(d, a)| d.0 as u64 * a.0 as u64) |
| 499 | .sum::<u64>(); |
| 500 | } |
| 501 | DistortionScale(((sum + (den >> 1)) / den) as u32) |
| 502 | } |
| 503 | |
| 504 | pub fn distortion_scale_for( |
| 505 | propagate_cost: f64, intra_cost: f64, |
| 506 | ) -> DistortionScale { |
| 507 | // The mbtree paper \cite{mbtree} uses the following formula: |
| 508 | // |
| 509 | // QP_delta = -strength * log2(1 + (propagate_cost / intra_cost)) |
| 510 | // |
| 511 | // Since this is H.264, this corresponds to the following quantizer: |
| 512 | // |
| 513 | // Q' = Q * 2^(QP_delta/6) |
| 514 | // |
| 515 | // Since lambda is proportial to Q^2, this means we want to minimize: |
| 516 | // |
| 517 | // D + lambda' * R |
| 518 | // = D + 2^(QP_delta / 3) * lambda * R |
| 519 | // |
| 520 | // If we want to keep lambda fixed, we can instead scale distortion and |
| 521 | // minimize: |
| 522 | // |
| 523 | // D * scale + lambda * R |
| 524 | // |
| 525 | // where: |
| 526 | // |
| 527 | // scale = 2^(QP_delta / -3) |
| 528 | // = (1 + (propagate_cost / intra_cost))^(strength / 3) |
| 529 | // |
| 530 | // The original paper empirically chooses strength = 2.0, but strength = 1.0 |
| 531 | // seems to work best in rav1e currently, this may have something to do with |
| 532 | // the fact that they use 16x16 blocks whereas our "importance blocks" are |
| 533 | // 8x8, but everything should be scale invariant here so that's weird. |
| 534 | // |
| 535 | // @article{mbtree, |
| 536 | // title={A novel macroblock-tree algorithm for high-performance |
| 537 | // optimization of dependent video coding in H.264/AVC}, |
| 538 | // author={Garrett-Glaser, Jason}, |
| 539 | // journal={Tech. Rep.}, |
| 540 | // year={2009}, |
| 541 | // url={https://pdfs.semanticscholar.org/032f/1ab7d9db385780a02eb2d579af8303b266d2.pdf} |
| 542 | // } |
| 543 | |
| 544 | if intra_cost == 0. { |
| 545 | return DistortionScale::default(); // no scaling |
| 546 | } |
| 547 | |
| 548 | let strength = 1.0; // empirical, see comment above |
| 549 | let frac = (intra_cost + propagate_cost) / intra_cost; |
| 550 | frac.powf(strength / 3.0).into() |
| 551 | } |
| 552 | |
/// Fixed point arithmetic version of distortion scale
///
/// The wrapped `u32` carries `DistortionScale::SHIFT` fractional bits, so
/// the `Default` value `1 << SHIFT` represents 1.0.
#[repr(transparent)]
#[derive(Copy, Clone)]
pub struct DistortionScale(pub u32);
| 557 | |
/// Distortion value to which no `DistortionScale` has been applied yet.
/// Multiplying it by a `DistortionScale` yields a `Distortion`.
#[repr(transparent)]
pub struct RawDistortion(u64);
| 560 | |
/// Distortion value as returned by `cdef_dist_wxh` and `sse_wxh`.
/// Multiplying it by a `DistortionScale` yields a `ScaledDistortion`.
#[repr(transparent)]
pub struct Distortion(pub u64);
| 563 | |
/// Result of multiplying a `Distortion` by a `DistortionScale`; this is the
/// distortion type accepted by `compute_rd_cost`.
#[repr(transparent)]
pub struct ScaledDistortion(u64);
| 566 | |
| 567 | impl DistortionScale { |
| 568 | /// Bits past the radix point |
| 569 | const SHIFT: u32 = 14; |
| 570 | /// Number of bits used. Determines the max value. |
| 571 | /// 28 bits is quite excessive. |
| 572 | const BITS: u32 = 28; |
| 573 | /// Maximum internal value |
| 574 | const MAX: u64 = (1 << Self::BITS) - 1; |
| 575 | |
| 576 | #[inline ] |
| 577 | pub const fn new(num: u64, den: u64) -> Self { |
| 578 | let raw = (num << Self::SHIFT).saturating_add(den / 2) / den; |
| 579 | let mask = (raw <= Self::MAX) as u64; |
| 580 | Self((mask * raw + (1 - mask) * Self::MAX) as u32) |
| 581 | } |
| 582 | |
| 583 | pub fn inv_mean(slice: &[Self]) -> Self { |
| 584 | use crate::util::{bexp64, blog32_q11}; |
| 585 | let sum = slice.iter().map(|&s| blog32_q11(s.0) as i64).sum::<i64>(); |
| 586 | let log_inv_mean_q11 = |
| 587 | (Self::SHIFT << 11) as i64 - sum / slice.len() as i64; |
| 588 | Self( |
| 589 | bexp64((log_inv_mean_q11 + (Self::SHIFT << 11) as i64) << (57 - 11)) |
| 590 | .clamp(1, (1 << Self::BITS) - 1) as u32, |
| 591 | ) |
| 592 | } |
| 593 | |
| 594 | /// Binary logarithm in Q11 |
| 595 | #[inline ] |
| 596 | pub const fn blog16(self) -> i16 { |
| 597 | use crate::util::blog32_q11; |
| 598 | (blog32_q11(self.0) - ((Self::SHIFT as i32) << 11)) as i16 |
| 599 | } |
| 600 | |
| 601 | /// Binary logarithm in Q57 |
| 602 | #[inline ] |
| 603 | pub const fn blog64(self) -> i64 { |
| 604 | use crate::util::{blog64, q57}; |
| 605 | blog64(self.0 as i64) - q57(Self::SHIFT as i32) |
| 606 | } |
| 607 | |
| 608 | /// Multiply, round and shift |
| 609 | /// Internal implementation, so don't use multiply trait. |
| 610 | #[inline ] |
| 611 | pub const fn mul_u64(self, dist: u64) -> u64 { |
| 612 | (self.0 as u64 * dist + (1 << Self::SHIFT >> 1)) >> Self::SHIFT |
| 613 | } |
| 614 | } |
| 615 | |
| 616 | impl std::ops::Mul for DistortionScale { |
| 617 | type Output = Self; |
| 618 | |
| 619 | /// Multiply, round and shift |
| 620 | #[inline ] |
| 621 | fn mul(self, rhs: Self) -> Self { |
| 622 | Self( |
| 623 | (((self.0 as u64 * rhs.0 as u64) + (1 << (Self::SHIFT - 1))) |
| 624 | >> Self::SHIFT) |
| 625 | .clamp(min:1, (1 << Self::BITS) - 1) as u32, |
| 626 | ) |
| 627 | } |
| 628 | } |
| 629 | |
| 630 | impl std::ops::MulAssign for DistortionScale { |
| 631 | fn mul_assign(&mut self, rhs: Self) { |
| 632 | *self = *self * rhs; |
| 633 | } |
| 634 | } |
| 635 | |
| 636 | // Default value for DistortionScale is a fixed point 1 |
| 637 | impl Default for DistortionScale { |
| 638 | #[inline ] |
| 639 | fn default() -> Self { |
| 640 | Self(1 << Self::SHIFT) |
| 641 | } |
| 642 | } |
| 643 | |
| 644 | impl fmt::Debug for DistortionScale { |
| 645 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 646 | write!(f, " {}" , f64::from(*self)) |
| 647 | } |
| 648 | } |
| 649 | |
| 650 | impl From<f64> for DistortionScale { |
| 651 | #[inline ] |
| 652 | fn from(scale: f64) -> Self { |
| 653 | let den: u64 = 1 << (Self::SHIFT + 1); |
| 654 | Self::new((scale * den as f64) as u64, den) |
| 655 | } |
| 656 | } |
| 657 | |
| 658 | impl From<DistortionScale> for f64 { |
| 659 | #[inline ] |
| 660 | fn from(scale: DistortionScale) -> Self { |
| 661 | scale.0 as f64 / (1 << DistortionScale::SHIFT) as f64 |
| 662 | } |
| 663 | } |
| 664 | |
| 665 | impl RawDistortion { |
| 666 | #[inline ] |
| 667 | pub const fn new(dist: u64) -> Self { |
| 668 | Self(dist) |
| 669 | } |
| 670 | } |
| 671 | |
| 672 | impl std::ops::Mul<DistortionScale> for RawDistortion { |
| 673 | type Output = Distortion; |
| 674 | #[inline ] |
| 675 | fn mul(self, rhs: DistortionScale) -> Distortion { |
| 676 | Distortion(rhs.mul_u64(self.0)) |
| 677 | } |
| 678 | } |
| 679 | |
| 680 | impl Distortion { |
| 681 | #[inline ] |
| 682 | pub const fn zero() -> Self { |
| 683 | Self(0) |
| 684 | } |
| 685 | } |
| 686 | |
| 687 | impl std::ops::Mul<DistortionScale> for Distortion { |
| 688 | type Output = ScaledDistortion; |
| 689 | #[inline ] |
| 690 | fn mul(self, rhs: DistortionScale) -> ScaledDistortion { |
| 691 | ScaledDistortion(rhs.mul_u64(self.0)) |
| 692 | } |
| 693 | } |
| 694 | |
| 695 | impl std::ops::AddAssign for Distortion { |
| 696 | #[inline ] |
| 697 | fn add_assign(&mut self, other: Self) { |
| 698 | self.0 += other.0; |
| 699 | } |
| 700 | } |
| 701 | |
| 702 | impl ScaledDistortion { |
| 703 | #[inline ] |
| 704 | pub const fn zero() -> Self { |
| 705 | Self(0) |
| 706 | } |
| 707 | } |
| 708 | |
| 709 | impl std::ops::AddAssign for ScaledDistortion { |
| 710 | #[inline ] |
| 711 | fn add_assign(&mut self, other: Self) { |
| 712 | self.0 += other.0; |
| 713 | } |
| 714 | } |
| 715 | |
| 716 | pub fn compute_rd_cost<T: Pixel>( |
| 717 | fi: &FrameInvariants<T>, rate: u32, distortion: ScaledDistortion, |
| 718 | ) -> f64 { |
| 719 | let rate_in_bits: f64 = (rate as f64) / ((1 << OD_BITRES) as f64); |
| 720 | fi.lambda.mul_add(a:rate_in_bits, b:distortion.0 as f64) |
| 721 | } |
| 722 | |
/// Chooses the luma transform size and transform type for a block.
///
/// The search starts from the `max_txsize_rect_lookup` entry for `bsize`
/// (one `sub_tx_size_map` level below it for non-skip inter blocks when
/// `fi.enable_inter_txfm_split` is set). When the speed settings enable it,
/// each candidate size is evaluated with `rdo_tx_type_decision` and the
/// `(size, type)` pair with the lowest RD cost is kept.
///
/// Both RDO switches require an intra luma mode, so inter blocks always take
/// the early return with the starting size and `TxType::DCT_DCT`.
///
/// Returns `(tx_size, tx_type)`.
pub fn rdo_tx_size_type<T: Pixel>(
  fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>,
  cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset,
  luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2],
  skip: bool,
) -> (TxSize, TxType) {
  let is_inter = !luma_mode.is_intra();
  let mut tx_size = max_txsize_rect_lookup[bsize as usize];

  if fi.enable_inter_txfm_split && is_inter && !skip {
    tx_size = sub_tx_size_map[tx_size as usize]; // Always choose one level split size
  }

  let mut best_tx_type = TxType::DCT_DCT;
  let mut best_tx_size = tx_size;
  let mut best_rd = std::f64::MAX;

  // Transform size RDO tries up to two further split levels below the
  // starting size (three candidates in total).
  let do_rdo_tx_size = fi.tx_mode_select
    && fi.config.speed_settings.transform.rdo_tx_decision
    && !is_inter;
  let rdo_tx_depth = if do_rdo_tx_size { 2 } else { 0 };
  // Shared across iterations; handed to `rdo_tx_type_decision` by `&mut`.
  let mut cw_checkpoint: Option<ContextWriterCheckpoint> = None;

  for _ in 0..=rdo_tx_depth {
    let tx_set = get_tx_set(tx_size, is_inter, fi.use_reduced_tx_set);

    let do_rdo_tx_type = tx_set > TxSet::TX_SET_DCTONLY
      && fi.config.speed_settings.transform.rdo_tx_decision
      && !is_inter
      && !skip;

    // Nothing to search: keep the starting size with DCT_DCT.
    if !do_rdo_tx_size && !do_rdo_tx_type {
      return (best_tx_size, best_tx_type);
    };

    // Without type RDO only DCT_DCT is evaluated for this size.
    let tx_types =
      if do_rdo_tx_type { RAV1E_TX_TYPES } else { &[TxType::DCT_DCT] };

    // Luma plane transform type decision
    let (tx_type, rd_cost) = rdo_tx_type_decision(
      fi,
      ts,
      cw,
      &mut cw_checkpoint,
      luma_mode,
      ref_frames,
      mvs,
      bsize,
      tile_bo,
      tx_size,
      tx_set,
      tx_types,
      best_rd,
    );

    if rd_cost < best_rd {
      best_tx_size = tx_size;
      best_tx_type = tx_type;
      best_rd = rd_cost;
    }

    debug_assert!(tx_size.width_log2() <= bsize.width_log2());
    debug_assert!(tx_size.height_log2() <= bsize.height_log2());
    debug_assert!(
      tx_size.sqr() <= TxSize::TX_32X32 || tx_type == TxType::DCT_DCT
    );

    let next_tx_size = sub_tx_size_map[tx_size as usize];

    // Stop once the size can no longer be split.
    if next_tx_size == tx_size {
      break;
    } else {
      tx_size = next_tx_size;
    };
  }

  (best_tx_size, best_tx_type)
}
| 801 | |
| 802 | #[inline ] |
| 803 | const fn dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool { |
| 804 | let diff_row: i32 = mv.row as i32 - ref_mv.row as i32; |
| 805 | let diff_col: i32 = mv.col as i32 - ref_mv.col as i32; |
| 806 | diff_row >= MV_LOW |
| 807 | && diff_row <= MV_UPP |
| 808 | && diff_col >= MV_LOW |
| 809 | && diff_col <= MV_UPP |
| 810 | } |
| 811 | |
/// Evaluates one luma prediction mode against every mode in
/// `mode_set_chroma` and records the cheapest combination in `best`.
///
/// For inter modes that code a new motion vector, the function returns
/// without touching `best` when the difference between that vector and its
/// reference (the first `mv_stack` entry, or zero vectors when the stack is
/// empty) fails `dmv_in_range`.
///
/// Inter modes are tried with `skip == true` first. The non-skip pass is
/// omitted when the last candidate that improved `best` during the skip pass
/// had zero distortion. Intra modes only run the non-skip pass.
///
/// `cw` is rolled back to `cw_checkpoint` after every candidate.
#[inline]
#[profiling::function]
fn luma_chroma_mode_rdo<T: Pixel>(
  luma_mode: PredictionMode, fi: &FrameInvariants<T>, bsize: BlockSize,
  tile_bo: TileBlockOffset, ts: &mut TileStateMut<'_, T>,
  cw: &mut ContextWriter, rdo_type: RDOType,
  cw_checkpoint: &ContextWriterCheckpoint, best: &mut PartitionParameters,
  mvs: [MotionVector; 2], ref_frames: [RefType; 2],
  mode_set_chroma: &[PredictionMode], luma_mode_is_intra: bool,
  mode_context: usize, mv_stack: &ArrayVec<CandidateMV, 9>,
  angle_delta: AngleDelta,
) {
  let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg;

  let is_chroma_block =
    has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling);

  if !luma_mode_is_intra {
    // Reference MVs come from the top of the candidate stack, if any.
    let ref_mvs = if mv_stack.is_empty() {
      [MotionVector::default(); 2]
    } else {
      [mv_stack[0].this_mv, mv_stack[0].comp_mv]
    };

    // Modes whose first MV is newly coded: reject out-of-range differences.
    if (luma_mode == PredictionMode::NEWMV
      || luma_mode == PredictionMode::NEW_NEWMV
      || luma_mode == PredictionMode::NEW_NEARESTMV)
      && !dmv_in_range(mvs[0], ref_mvs[0])
    {
      return;
    }

    // Same check for modes whose second MV is newly coded.
    if (luma_mode == PredictionMode::NEW_NEWMV
      || luma_mode == PredictionMode::NEAREST_NEWMV)
      && !dmv_in_range(mvs[1], ref_mvs[1])
    {
      return;
    }
  }

  // Find the best chroma prediction mode for the current luma prediction mode
  // The closure returns whether the last candidate that improved `best` in
  // this pass had zero distortion.
  let mut chroma_rdo = |skip: bool| -> bool {
    use crate::segmentation::select_segment;

    let mut zero_distortion = false;

    for sidx in select_segment(fi, ts, tile_bo, bsize, skip) {
      cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, sidx);

      let (tx_size, tx_type) = rdo_tx_size_type(
        fi, ts, cw, bsize, tile_bo, luma_mode, ref_frames, mvs, skip,
      );
      for &chroma_mode in mode_set_chroma.iter() {
        // Rate is measured as the growth of the counting writer's position.
        let wr = &mut WriterCounter::new();
        let tell = wr.tell_frac();

        if bsize >= BlockSize::BLOCK_8X8 && bsize.is_sqr() {
          cw.write_partition(
            wr,
            tile_bo,
            PartitionType::PARTITION_NONE,
            bsize,
          );
        }

        // TODO(yushin): luma and chroma would have different decision based on chroma format
        let need_recon_pixel =
          luma_mode_is_intra && tx_size.block_size() != bsize;

        encode_block_pre_cdef(&fi.sequence, ts, cw, wr, bsize, tile_bo, skip);
        let (has_coeff, tx_dist) = encode_block_post_cdef(
          fi,
          ts,
          cw,
          wr,
          luma_mode,
          chroma_mode,
          angle_delta,
          ref_frames,
          mvs,
          bsize,
          tile_bo,
          skip,
          CFLParams::default(),
          tx_size,
          tx_type,
          mode_context,
          mv_stack,
          rdo_type,
          need_recon_pixel,
          None,
        );

        let rate = wr.tell_frac() - tell;
        // Tx-domain distortion is only used when no reconstructed pixels
        // were required for this candidate.
        let distortion = if fi.use_tx_domain_distortion && !need_recon_pixel {
          compute_tx_distortion(
            fi,
            ts,
            bsize,
            is_chroma_block,
            tile_bo,
            tx_dist,
            skip,
            false,
          )
        } else {
          compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false)
        };
        let is_zero_dist = distortion.0 == 0;
        let rd = compute_rd_cost(fi, rate, distortion);
        if rd < best.rd_cost {
          //if rd < best.rd_cost || luma_mode == PredictionMode::NEW_NEWMV {
          best.rd_cost = rd;
          best.pred_mode_luma = luma_mode;
          best.pred_mode_chroma = chroma_mode;
          best.angle_delta = angle_delta;
          best.ref_frames = ref_frames;
          best.mvs = mvs;
          best.skip = skip;
          best.has_coeff = has_coeff;
          best.tx_size = tx_size;
          best.tx_type = tx_type;
          best.sidx = sidx;
          zero_distortion = is_zero_dist;
        }

        // Undo this candidate's writes before trying the next one.
        cw.rollback(cw_checkpoint);
      }
    }

    zero_distortion
  };

  // Don't skip when using intra modes
  let zero_distortion =
    if !luma_mode_is_intra { chroma_rdo(true) } else { false };
  // early skip
  if !zero_distortion {
    chroma_rdo(false);
  }
}
| 953 | |
| 954 | /// RDO-based mode decision |
| 955 | /// |
| 956 | /// # Panics |
| 957 | /// |
| 958 | /// - If the best RD found is negative. |
| 959 | /// This should never happen and indicates a development error. |
| 960 | #[profiling::function ] |
| 961 | pub fn rdo_mode_decision<T: Pixel>( |
| 962 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
| 963 | cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, |
| 964 | inter_cfg: &InterConfig, |
| 965 | ) -> PartitionParameters { |
| 966 | let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; |
| 967 | let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); |
| 968 | |
| 969 | let rdo_type = if fi.use_tx_domain_rate { |
| 970 | RDOType::TxDistEstRate |
| 971 | } else if fi.use_tx_domain_distortion { |
| 972 | RDOType::TxDistRealRate |
| 973 | } else { |
| 974 | RDOType::PixelDistRealRate |
| 975 | }; |
| 976 | |
| 977 | let mut best = if fi.frame_type.has_inter() { |
| 978 | assert!(fi.frame_type != FrameType::KEY); |
| 979 | |
| 980 | inter_frame_rdo_mode_decision( |
| 981 | fi, |
| 982 | ts, |
| 983 | cw, |
| 984 | bsize, |
| 985 | tile_bo, |
| 986 | inter_cfg, |
| 987 | &cw_checkpoint, |
| 988 | rdo_type, |
| 989 | ) |
| 990 | } else { |
| 991 | PartitionParameters::default() |
| 992 | }; |
| 993 | |
| 994 | let is_chroma_block = |
| 995 | has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling); |
| 996 | |
| 997 | if !best.skip { |
| 998 | best = intra_frame_rdo_mode_decision( |
| 999 | fi, |
| 1000 | ts, |
| 1001 | cw, |
| 1002 | bsize, |
| 1003 | tile_bo, |
| 1004 | &cw_checkpoint, |
| 1005 | rdo_type, |
| 1006 | best, |
| 1007 | is_chroma_block, |
| 1008 | ); |
| 1009 | } |
| 1010 | |
| 1011 | if best.pred_mode_luma.is_intra() && is_chroma_block && bsize.cfl_allowed() { |
| 1012 | cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, best.sidx); |
| 1013 | |
| 1014 | let chroma_mode = PredictionMode::UV_CFL_PRED; |
| 1015 | let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); |
| 1016 | let mut wr = WriterCounter::new(); |
| 1017 | let angle_delta = AngleDelta { y: best.angle_delta.y, uv: 0 }; |
| 1018 | |
| 1019 | write_tx_blocks( |
| 1020 | fi, |
| 1021 | ts, |
| 1022 | cw, |
| 1023 | &mut wr, |
| 1024 | best.pred_mode_luma, |
| 1025 | best.pred_mode_luma, |
| 1026 | angle_delta, |
| 1027 | tile_bo, |
| 1028 | bsize, |
| 1029 | best.tx_size, |
| 1030 | best.tx_type, |
| 1031 | false, |
| 1032 | CFLParams::default(), |
| 1033 | true, |
| 1034 | rdo_type, |
| 1035 | true, |
| 1036 | ); |
| 1037 | cw.rollback(&cw_checkpoint); |
| 1038 | if fi.sequence.chroma_sampling != ChromaSampling::Cs400 { |
| 1039 | if let Some(cfl) = rdo_cfl_alpha(ts, tile_bo, bsize, best.tx_size, fi) { |
| 1040 | let mut wr = WriterCounter::new(); |
| 1041 | let tell = wr.tell_frac(); |
| 1042 | |
| 1043 | encode_block_pre_cdef( |
| 1044 | &fi.sequence, |
| 1045 | ts, |
| 1046 | cw, |
| 1047 | &mut wr, |
| 1048 | bsize, |
| 1049 | tile_bo, |
| 1050 | best.skip, |
| 1051 | ); |
| 1052 | let (has_coeff, _) = encode_block_post_cdef( |
| 1053 | fi, |
| 1054 | ts, |
| 1055 | cw, |
| 1056 | &mut wr, |
| 1057 | best.pred_mode_luma, |
| 1058 | chroma_mode, |
| 1059 | angle_delta, |
| 1060 | best.ref_frames, |
| 1061 | best.mvs, |
| 1062 | bsize, |
| 1063 | tile_bo, |
| 1064 | best.skip, |
| 1065 | cfl, |
| 1066 | best.tx_size, |
| 1067 | best.tx_type, |
| 1068 | 0, |
| 1069 | &[], |
| 1070 | rdo_type, |
| 1071 | true, // For CFL, luma should be always reconstructed. |
| 1072 | None, |
| 1073 | ); |
| 1074 | |
| 1075 | let rate = wr.tell_frac() - tell; |
| 1076 | |
| 1077 | // For CFL, tx-domain distortion is not an option. |
| 1078 | let distortion = |
| 1079 | compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false); |
| 1080 | let rd = compute_rd_cost(fi, rate, distortion); |
| 1081 | if rd < best.rd_cost { |
| 1082 | best.rd_cost = rd; |
| 1083 | best.pred_mode_chroma = chroma_mode; |
| 1084 | best.angle_delta = angle_delta; |
| 1085 | best.has_coeff = has_coeff; |
| 1086 | best.pred_cfl_params = cfl; |
| 1087 | } |
| 1088 | |
| 1089 | cw.rollback(&cw_checkpoint); |
| 1090 | } |
| 1091 | } |
| 1092 | } |
| 1093 | |
| 1094 | cw.bc.blocks.set_mode(tile_bo, bsize, best.pred_mode_luma); |
| 1095 | cw.bc.blocks.set_ref_frames(tile_bo, bsize, best.ref_frames); |
| 1096 | cw.bc.blocks.set_motion_vectors(tile_bo, bsize, best.mvs); |
| 1097 | |
| 1098 | assert!(best.rd_cost >= 0_f64); |
| 1099 | |
| 1100 | PartitionParameters { |
| 1101 | bo: tile_bo, |
| 1102 | bsize, |
| 1103 | pred_mode_luma: best.pred_mode_luma, |
| 1104 | pred_mode_chroma: best.pred_mode_chroma, |
| 1105 | pred_cfl_params: best.pred_cfl_params, |
| 1106 | angle_delta: best.angle_delta, |
| 1107 | ref_frames: best.ref_frames, |
| 1108 | mvs: best.mvs, |
| 1109 | rd_cost: best.rd_cost, |
| 1110 | skip: best.skip, |
| 1111 | has_coeff: best.has_coeff, |
| 1112 | tx_size: best.tx_size, |
| 1113 | tx_type: best.tx_type, |
| 1114 | sidx: best.sidx, |
| 1115 | } |
| 1116 | } |
| 1117 | |
| 1118 | #[profiling::function ] |
| 1119 | fn inter_frame_rdo_mode_decision<T: Pixel>( |
| 1120 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
| 1121 | cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, |
| 1122 | inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint, |
| 1123 | rdo_type: RDOType, |
| 1124 | ) -> PartitionParameters { |
| 1125 | let mut best = PartitionParameters::default(); |
| 1126 | |
| 1127 | // we can never have more than 7 reference frame sets |
| 1128 | let mut ref_frames_set = ArrayVec::<_, 7>::new(); |
| 1129 | // again, max of 7 ref slots |
| 1130 | let mut ref_slot_set = ArrayVec::<_, 7>::new(); |
| 1131 | // our implementation never returns more than 3 at the moment |
| 1132 | let mut mvs_from_me = ArrayVec::<_, 3>::new(); |
| 1133 | let mut fwdref = None; |
| 1134 | let mut bwdref = None; |
| 1135 | |
| 1136 | for i in inter_cfg.allowed_ref_frames().iter().copied() { |
| 1137 | // Don't search LAST3 since it's used only for probs |
| 1138 | if i == LAST3_FRAME { |
| 1139 | continue; |
| 1140 | } |
| 1141 | |
| 1142 | if !ref_slot_set.contains(&fi.ref_frames[i.to_index()]) { |
| 1143 | if fwdref.is_none() && i.is_fwd_ref() { |
| 1144 | fwdref = Some(ref_frames_set.len()); |
| 1145 | } |
| 1146 | if bwdref.is_none() && i.is_bwd_ref() { |
| 1147 | bwdref = Some(ref_frames_set.len()); |
| 1148 | } |
| 1149 | ref_frames_set.push([i, NONE_FRAME]); |
| 1150 | let slot_idx = fi.ref_frames[i.to_index()]; |
| 1151 | ref_slot_set.push(slot_idx); |
| 1152 | } |
| 1153 | } |
| 1154 | assert!(!ref_frames_set.is_empty()); |
| 1155 | |
| 1156 | let mut inter_mode_set = ArrayVec::<(PredictionMode, usize), 20>::new(); |
| 1157 | let mut mvs_set = ArrayVec::<[MotionVector; 2], 20>::new(); |
| 1158 | let mut satds = ArrayVec::<u32, 20>::new(); |
| 1159 | let mut mv_stacks = ArrayVec::<_, 20>::new(); |
| 1160 | let mut mode_contexts = ArrayVec::<_, 7>::new(); |
| 1161 | |
| 1162 | for (i, &ref_frames) in ref_frames_set.iter().enumerate() { |
| 1163 | let mut mv_stack = ArrayVec::<CandidateMV, 9>::new(); |
| 1164 | mode_contexts.push(cw.find_mvrefs( |
| 1165 | tile_bo, |
| 1166 | ref_frames, |
| 1167 | &mut mv_stack, |
| 1168 | bsize, |
| 1169 | fi, |
| 1170 | false, |
| 1171 | )); |
| 1172 | |
| 1173 | let mut pmv = [MotionVector::default(); 2]; |
| 1174 | if !mv_stack.is_empty() { |
| 1175 | pmv[0] = mv_stack[0].this_mv; |
| 1176 | } |
| 1177 | if mv_stack.len() > 1 { |
| 1178 | pmv[1] = mv_stack[1].this_mv; |
| 1179 | } |
| 1180 | |
| 1181 | let res = estimate_motion( |
| 1182 | fi, |
| 1183 | ts, |
| 1184 | bsize.width(), |
| 1185 | bsize.height(), |
| 1186 | tile_bo, |
| 1187 | ref_frames[0], |
| 1188 | Some(pmv), |
| 1189 | MVSamplingMode::CORNER { right: true, bottom: true }, |
| 1190 | false, |
| 1191 | 0, |
| 1192 | None, |
| 1193 | ) |
| 1194 | .unwrap_or_else(MotionSearchResult::empty); |
| 1195 | let b_me = res.mv; |
| 1196 | |
| 1197 | mvs_from_me.push([b_me, MotionVector::default()]); |
| 1198 | |
| 1199 | for &x in RAV1E_INTER_MODES_MINIMAL { |
| 1200 | inter_mode_set.push((x, i)); |
| 1201 | } |
| 1202 | if !mv_stack.is_empty() { |
| 1203 | inter_mode_set.push((PredictionMode::NEAR0MV, i)); |
| 1204 | } |
| 1205 | if mv_stack.len() >= 2 { |
| 1206 | inter_mode_set.push((PredictionMode::GLOBALMV, i)); |
| 1207 | } |
| 1208 | let include_near_mvs = fi.config.speed_settings.motion.include_near_mvs; |
| 1209 | if include_near_mvs { |
| 1210 | if mv_stack.len() >= 3 { |
| 1211 | inter_mode_set.push((PredictionMode::NEAR1MV, i)); |
| 1212 | } |
| 1213 | if mv_stack.len() >= 4 { |
| 1214 | inter_mode_set.push((PredictionMode::NEAR2MV, i)); |
| 1215 | } |
| 1216 | } |
| 1217 | let same_row_col = |x: &CandidateMV| { |
| 1218 | x.this_mv.row == mvs_from_me[i][0].row |
| 1219 | && x.this_mv.col == mvs_from_me[i][0].col |
| 1220 | }; |
| 1221 | if !mv_stack |
| 1222 | .iter() |
| 1223 | .take(if include_near_mvs { 4 } else { 2 }) |
| 1224 | .any(same_row_col) |
| 1225 | && (mvs_from_me[i][0].row != 0 || mvs_from_me[i][0].col != 0) |
| 1226 | { |
| 1227 | inter_mode_set.push((PredictionMode::NEWMV, i)); |
| 1228 | } |
| 1229 | |
| 1230 | mv_stacks.push(mv_stack); |
| 1231 | } |
| 1232 | |
| 1233 | let sz = bsize.width_mi().min(bsize.height_mi()); |
| 1234 | |
| 1235 | // To use non single reference modes, block width and height must be greater than 4. |
| 1236 | if fi.reference_mode != ReferenceMode::SINGLE && sz >= 2 { |
| 1237 | // Adding compound candidate |
| 1238 | if let Some(r0) = fwdref { |
| 1239 | if let Some(r1) = bwdref { |
| 1240 | let ref_frames = [ref_frames_set[r0][0], ref_frames_set[r1][0]]; |
| 1241 | ref_frames_set.push(ref_frames); |
| 1242 | let mv0 = mvs_from_me[r0][0]; |
| 1243 | let mv1 = mvs_from_me[r1][0]; |
| 1244 | mvs_from_me.push([mv0, mv1]); |
| 1245 | let mut mv_stack = ArrayVec::<CandidateMV, 9>::new(); |
| 1246 | mode_contexts.push(cw.find_mvrefs( |
| 1247 | tile_bo, |
| 1248 | ref_frames, |
| 1249 | &mut mv_stack, |
| 1250 | bsize, |
| 1251 | fi, |
| 1252 | true, |
| 1253 | )); |
| 1254 | for &x in RAV1E_INTER_COMPOUND_MODES { |
| 1255 | // exclude any NEAR mode based on speed setting |
| 1256 | if fi.config.speed_settings.motion.include_near_mvs |
| 1257 | || !x.has_nearmv() |
| 1258 | { |
| 1259 | let mv_stack_idx = ref_frames_set.len() - 1; |
| 1260 | // exclude NEAR modes if the mv_stack is too short |
| 1261 | if !(x.has_nearmv() && x.ref_mv_idx() >= mv_stack.len()) { |
| 1262 | inter_mode_set.push((x, mv_stack_idx)); |
| 1263 | } |
| 1264 | } |
| 1265 | } |
| 1266 | mv_stacks.push(mv_stack); |
| 1267 | } |
| 1268 | } |
| 1269 | } |
| 1270 | |
| 1271 | let num_modes_rdo = if fi.config.speed_settings.prediction.prediction_modes |
| 1272 | >= PredictionModesSetting::ComplexAll |
| 1273 | { |
| 1274 | inter_mode_set.len() |
| 1275 | } else { |
| 1276 | 9 // This number is determined by AWCY test |
| 1277 | }; |
| 1278 | |
| 1279 | inter_mode_set.iter().for_each(|&(luma_mode, i)| { |
| 1280 | let mvs = match luma_mode { |
| 1281 | PredictionMode::NEWMV | PredictionMode::NEW_NEWMV => mvs_from_me[i], |
| 1282 | PredictionMode::NEARESTMV | PredictionMode::NEAREST_NEARESTMV => { |
| 1283 | if !mv_stacks[i].is_empty() { |
| 1284 | [mv_stacks[i][0].this_mv, mv_stacks[i][0].comp_mv] |
| 1285 | } else { |
| 1286 | [MotionVector::default(); 2] |
| 1287 | } |
| 1288 | } |
| 1289 | PredictionMode::NEAR0MV | PredictionMode::NEAR_NEAR0MV => { |
| 1290 | if mv_stacks[i].len() > 1 { |
| 1291 | [mv_stacks[i][1].this_mv, mv_stacks[i][1].comp_mv] |
| 1292 | } else { |
| 1293 | [MotionVector::default(); 2] |
| 1294 | } |
| 1295 | } |
| 1296 | PredictionMode::NEAR1MV |
| 1297 | | PredictionMode::NEAR2MV |
| 1298 | | PredictionMode::NEAR_NEAR1MV |
| 1299 | | PredictionMode::NEAR_NEAR2MV => [ |
| 1300 | mv_stacks[i][luma_mode.ref_mv_idx()].this_mv, |
| 1301 | mv_stacks[i][luma_mode.ref_mv_idx()].comp_mv, |
| 1302 | ], |
| 1303 | PredictionMode::NEAREST_NEWMV => { |
| 1304 | [mv_stacks[i][0].this_mv, mvs_from_me[i][1]] |
| 1305 | } |
| 1306 | PredictionMode::NEW_NEARESTMV => { |
| 1307 | [mvs_from_me[i][0], mv_stacks[i][0].comp_mv] |
| 1308 | } |
| 1309 | PredictionMode::GLOBALMV | PredictionMode::GLOBAL_GLOBALMV => { |
| 1310 | [MotionVector::default(); 2] |
| 1311 | } |
| 1312 | _ => { |
| 1313 | unimplemented!(); |
| 1314 | } |
| 1315 | }; |
| 1316 | mvs_set.push(mvs); |
| 1317 | |
| 1318 | // Calculate SATD for each mode |
| 1319 | if num_modes_rdo != inter_mode_set.len() { |
| 1320 | let tile_rect = ts.tile_rect(); |
| 1321 | let rec = &mut ts.rec.planes[0]; |
| 1322 | let po = tile_bo.plane_offset(rec.plane_cfg); |
| 1323 | let mut rec_region = |
| 1324 | rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 }); |
| 1325 | |
| 1326 | luma_mode.predict_inter( |
| 1327 | fi, |
| 1328 | tile_rect, |
| 1329 | 0, |
| 1330 | po, |
| 1331 | &mut rec_region, |
| 1332 | bsize.width(), |
| 1333 | bsize.height(), |
| 1334 | ref_frames_set[i], |
| 1335 | mvs, |
| 1336 | &mut ts.inter_compound_buffers, |
| 1337 | ); |
| 1338 | |
| 1339 | let plane_org = ts.input_tile.planes[0] |
| 1340 | .subregion(Area::BlockStartingAt { bo: tile_bo.0 }); |
| 1341 | let plane_ref = rec_region.as_const(); |
| 1342 | |
| 1343 | let satd = get_satd( |
| 1344 | &plane_org, |
| 1345 | &plane_ref, |
| 1346 | bsize.width(), |
| 1347 | bsize.height(), |
| 1348 | fi.sequence.bit_depth, |
| 1349 | fi.cpu_feature_level, |
| 1350 | ); |
| 1351 | satds.push(satd); |
| 1352 | } else { |
| 1353 | satds.push(0); |
| 1354 | } |
| 1355 | }); |
| 1356 | |
| 1357 | let mut sorted = |
| 1358 | izip!(inter_mode_set, mvs_set, satds).collect::<ArrayVec<_, 20>>(); |
| 1359 | if num_modes_rdo != sorted.len() { |
| 1360 | sorted.sort_by_key(|((_mode, _i), _mvs, satd)| *satd); |
| 1361 | } |
| 1362 | |
| 1363 | sorted.iter().take(num_modes_rdo).for_each( |
| 1364 | |&((luma_mode, i), mvs, _satd)| { |
| 1365 | let mode_set_chroma = ArrayVec::from([luma_mode]); |
| 1366 | |
| 1367 | luma_chroma_mode_rdo( |
| 1368 | luma_mode, |
| 1369 | fi, |
| 1370 | bsize, |
| 1371 | tile_bo, |
| 1372 | ts, |
| 1373 | cw, |
| 1374 | rdo_type, |
| 1375 | cw_checkpoint, |
| 1376 | &mut best, |
| 1377 | mvs, |
| 1378 | ref_frames_set[i], |
| 1379 | &mode_set_chroma, |
| 1380 | false, |
| 1381 | mode_contexts[i], |
| 1382 | &mv_stacks[i], |
| 1383 | AngleDelta::default(), |
| 1384 | ); |
| 1385 | }, |
| 1386 | ); |
| 1387 | |
| 1388 | best |
| 1389 | } |
| 1390 | |
| 1391 | #[profiling::function ] |
| 1392 | fn intra_frame_rdo_mode_decision<T: Pixel>( |
| 1393 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
| 1394 | cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, |
| 1395 | cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType, |
| 1396 | mut best: PartitionParameters, is_chroma_block: bool, |
| 1397 | ) -> PartitionParameters { |
| 1398 | let mut modes = ArrayVec::<_, INTRA_MODES>::new(); |
| 1399 | |
| 1400 | // Reduce number of prediction modes at higher speed levels |
| 1401 | let num_modes_rdo = if (fi.frame_type == FrameType::KEY |
| 1402 | && fi.config.speed_settings.prediction.prediction_modes |
| 1403 | >= PredictionModesSetting::ComplexKeyframes) |
| 1404 | || (fi.frame_type.has_inter() |
| 1405 | && fi.config.speed_settings.prediction.prediction_modes |
| 1406 | >= PredictionModesSetting::ComplexAll) |
| 1407 | { |
| 1408 | 7 |
| 1409 | } else { |
| 1410 | 3 |
| 1411 | }; |
| 1412 | |
| 1413 | let intra_mode_set = RAV1E_INTRA_MODES; |
| 1414 | |
| 1415 | // Find mode with lowest rate cost |
| 1416 | { |
| 1417 | use crate::ec::cdf_to_pdf; |
| 1418 | |
| 1419 | let probs_all = cdf_to_pdf(if fi.frame_type.has_inter() { |
| 1420 | cw.get_cdf_intra_mode(bsize) |
| 1421 | } else { |
| 1422 | cw.get_cdf_intra_mode_kf(tile_bo) |
| 1423 | }); |
| 1424 | |
| 1425 | modes.try_extend_from_slice(intra_mode_set).unwrap(); |
| 1426 | modes.sort_by_key(|&a| !probs_all[a as usize]); |
| 1427 | } |
| 1428 | |
| 1429 | // If tx partition (i.e. fi.tx_mode_select) is enabled, the below intra prediction screening |
| 1430 | // may be improved by emulating prediction for each tx block. |
| 1431 | { |
| 1432 | let satds = { |
| 1433 | // FIXME: If tx partition is used, this whole sads block should be fixed |
| 1434 | let tx_size = bsize.tx_size(); |
| 1435 | let mut edge_buf = Aligned::uninit_array(); |
| 1436 | let edge_buf = { |
| 1437 | let rec = &ts.rec.planes[0].as_const(); |
| 1438 | let po = tile_bo.plane_offset(rec.plane_cfg); |
| 1439 | // FIXME: If tx partition is used, get_intra_edges() should be called for each tx block |
| 1440 | get_intra_edges( |
| 1441 | &mut edge_buf, |
| 1442 | rec, |
| 1443 | tile_bo, |
| 1444 | 0, |
| 1445 | 0, |
| 1446 | bsize, |
| 1447 | po, |
| 1448 | tx_size, |
| 1449 | fi.sequence.bit_depth, |
| 1450 | None, |
| 1451 | fi.sequence.enable_intra_edge_filter, |
| 1452 | IntraParam::None, |
| 1453 | ) |
| 1454 | }; |
| 1455 | |
| 1456 | let ief_params = if fi.sequence.enable_intra_edge_filter { |
| 1457 | let above_block_info = ts.above_block_info(tile_bo, 0, 0); |
| 1458 | let left_block_info = ts.left_block_info(tile_bo, 0, 0); |
| 1459 | Some(IntraEdgeFilterParameters::new( |
| 1460 | 0, |
| 1461 | above_block_info, |
| 1462 | left_block_info, |
| 1463 | )) |
| 1464 | } else { |
| 1465 | None |
| 1466 | }; |
| 1467 | |
| 1468 | let mut satds_all = [0; INTRA_MODES]; |
| 1469 | for &luma_mode in modes.iter().skip(num_modes_rdo / 2) { |
| 1470 | let tile_rect = ts.tile_rect(); |
| 1471 | let rec = &mut ts.rec.planes[0]; |
| 1472 | let mut rec_region = |
| 1473 | rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 }); |
| 1474 | // FIXME: If tx partition is used, luma_mode.predict_intra() should be called for each tx block |
| 1475 | luma_mode.predict_intra( |
| 1476 | tile_rect, |
| 1477 | &mut rec_region, |
| 1478 | tx_size, |
| 1479 | fi.sequence.bit_depth, |
| 1480 | &[0i16; 2], |
| 1481 | IntraParam::None, |
| 1482 | if luma_mode.is_directional() { ief_params } else { None }, |
| 1483 | &edge_buf, |
| 1484 | fi.cpu_feature_level, |
| 1485 | ); |
| 1486 | |
| 1487 | let plane_org = ts.input_tile.planes[0] |
| 1488 | .subregion(Area::BlockStartingAt { bo: tile_bo.0 }); |
| 1489 | let plane_ref = rec_region.as_const(); |
| 1490 | |
| 1491 | satds_all[luma_mode as usize] = get_satd( |
| 1492 | &plane_org, |
| 1493 | &plane_ref, |
| 1494 | tx_size.width(), |
| 1495 | tx_size.height(), |
| 1496 | fi.sequence.bit_depth, |
| 1497 | fi.cpu_feature_level, |
| 1498 | ); |
| 1499 | } |
| 1500 | satds_all |
| 1501 | }; |
| 1502 | |
| 1503 | modes[num_modes_rdo / 2..].sort_by_key(|&a| satds[a as usize]); |
| 1504 | } |
| 1505 | |
| 1506 | debug_assert!(num_modes_rdo >= 1); |
| 1507 | |
| 1508 | modes.iter().take(num_modes_rdo).for_each(|&luma_mode| { |
| 1509 | let mvs = [MotionVector::default(); 2]; |
| 1510 | let ref_frames = [INTRA_FRAME, NONE_FRAME]; |
| 1511 | let mut mode_set_chroma = ArrayVec::<_, 2>::new(); |
| 1512 | mode_set_chroma.push(luma_mode); |
| 1513 | if is_chroma_block && luma_mode != PredictionMode::DC_PRED { |
| 1514 | mode_set_chroma.push(PredictionMode::DC_PRED); |
| 1515 | } |
| 1516 | luma_chroma_mode_rdo( |
| 1517 | luma_mode, |
| 1518 | fi, |
| 1519 | bsize, |
| 1520 | tile_bo, |
| 1521 | ts, |
| 1522 | cw, |
| 1523 | rdo_type, |
| 1524 | cw_checkpoint, |
| 1525 | &mut best, |
| 1526 | mvs, |
| 1527 | ref_frames, |
| 1528 | &mode_set_chroma, |
| 1529 | true, |
| 1530 | 0, |
| 1531 | &ArrayVec::<CandidateMV, 9>::new(), |
| 1532 | AngleDelta::default(), |
| 1533 | ); |
| 1534 | }); |
| 1535 | |
| 1536 | if fi.config.speed_settings.prediction.fine_directional_intra |
| 1537 | && bsize >= BlockSize::BLOCK_8X8 |
| 1538 | { |
| 1539 | // Find the best angle delta for the current best prediction mode |
| 1540 | let luma_deltas = best.pred_mode_luma.angle_delta_count(); |
| 1541 | let chroma_deltas = best.pred_mode_chroma.angle_delta_count(); |
| 1542 | |
| 1543 | let mvs = [MotionVector::default(); 2]; |
| 1544 | let ref_frames = [INTRA_FRAME, NONE_FRAME]; |
| 1545 | let mode_set_chroma = [best.pred_mode_chroma]; |
| 1546 | let mv_stack = ArrayVec::<_, 9>::new(); |
| 1547 | let mut best_angle_delta = best.angle_delta; |
| 1548 | let mut angle_delta_rdo = |y, uv| -> AngleDelta { |
| 1549 | if best.angle_delta.y != y || best.angle_delta.uv != uv { |
| 1550 | luma_chroma_mode_rdo( |
| 1551 | best.pred_mode_luma, |
| 1552 | fi, |
| 1553 | bsize, |
| 1554 | tile_bo, |
| 1555 | ts, |
| 1556 | cw, |
| 1557 | rdo_type, |
| 1558 | cw_checkpoint, |
| 1559 | &mut best, |
| 1560 | mvs, |
| 1561 | ref_frames, |
| 1562 | &mode_set_chroma, |
| 1563 | true, |
| 1564 | 0, |
| 1565 | &mv_stack, |
| 1566 | AngleDelta { y, uv }, |
| 1567 | ); |
| 1568 | } |
| 1569 | best.angle_delta |
| 1570 | }; |
| 1571 | |
| 1572 | for i in 0..luma_deltas { |
| 1573 | let angle_delta_y = |
| 1574 | if luma_deltas == 1 { 0 } else { i - MAX_ANGLE_DELTA as i8 }; |
| 1575 | best_angle_delta = angle_delta_rdo(angle_delta_y, best_angle_delta.uv); |
| 1576 | } |
| 1577 | for j in 0..chroma_deltas { |
| 1578 | let angle_delta_uv = |
| 1579 | if chroma_deltas == 1 { 0 } else { j - MAX_ANGLE_DELTA as i8 }; |
| 1580 | best_angle_delta = angle_delta_rdo(best_angle_delta.y, angle_delta_uv); |
| 1581 | } |
| 1582 | } |
| 1583 | |
| 1584 | best |
| 1585 | } |
| 1586 | |
| 1587 | /// # Panics |
| 1588 | /// |
| 1589 | /// - If the block size is invalid for subsampling. |
| 1590 | #[profiling::function ] |
| 1591 | pub fn rdo_cfl_alpha<T: Pixel>( |
| 1592 | ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize, |
| 1593 | luma_tx_size: TxSize, fi: &FrameInvariants<T>, |
| 1594 | ) -> Option<CFLParams> { |
| 1595 | let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; |
| 1596 | let uv_tx_size = bsize.largest_chroma_tx_size(xdec, ydec); |
| 1597 | debug_assert!( |
| 1598 | bsize.subsampled_size(xdec, ydec).unwrap() == uv_tx_size.block_size() |
| 1599 | ); |
| 1600 | |
| 1601 | let frame_bo = ts.to_frame_block_offset(tile_bo); |
| 1602 | let (visible_tx_w, visible_tx_h) = clip_visible_bsize( |
| 1603 | (fi.width + xdec) >> xdec, |
| 1604 | (fi.height + ydec) >> ydec, |
| 1605 | uv_tx_size.block_size(), |
| 1606 | (frame_bo.0.x << MI_SIZE_LOG2) >> xdec, |
| 1607 | (frame_bo.0.y << MI_SIZE_LOG2) >> ydec, |
| 1608 | ); |
| 1609 | |
| 1610 | if visible_tx_w == 0 || visible_tx_h == 0 { |
| 1611 | return None; |
| 1612 | }; |
| 1613 | let mut ac = Aligned::<[MaybeUninit<i16>; 32 * 32]>::uninit_array(); |
| 1614 | let ac = luma_ac(&mut ac.data, ts, tile_bo, bsize, luma_tx_size, fi); |
| 1615 | let best_alpha: ArrayVec<i16, 2> = (1..3) |
| 1616 | .map(|p| { |
| 1617 | let &PlaneConfig { xdec, ydec, .. } = ts.rec.planes[p].plane_cfg; |
| 1618 | let tile_rect = ts.tile_rect().decimated(xdec, ydec); |
| 1619 | let rec = &mut ts.rec.planes[p]; |
| 1620 | let input = &ts.input_tile.planes[p]; |
| 1621 | let po = tile_bo.plane_offset(rec.plane_cfg); |
| 1622 | let mut edge_buf = Aligned::uninit_array(); |
| 1623 | let edge_buf = get_intra_edges( |
| 1624 | &mut edge_buf, |
| 1625 | &rec.as_const(), |
| 1626 | tile_bo, |
| 1627 | 0, |
| 1628 | 0, |
| 1629 | bsize, |
| 1630 | po, |
| 1631 | uv_tx_size, |
| 1632 | fi.sequence.bit_depth, |
| 1633 | Some(PredictionMode::UV_CFL_PRED), |
| 1634 | fi.sequence.enable_intra_edge_filter, |
| 1635 | IntraParam::None, |
| 1636 | ); |
| 1637 | let mut alpha_cost = |alpha: i16| -> u64 { |
| 1638 | let mut rec_region = |
| 1639 | rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 }); |
| 1640 | PredictionMode::UV_CFL_PRED.predict_intra( |
| 1641 | tile_rect, |
| 1642 | &mut rec_region, |
| 1643 | uv_tx_size, |
| 1644 | fi.sequence.bit_depth, |
| 1645 | ac, |
| 1646 | IntraParam::Alpha(alpha), |
| 1647 | None, |
| 1648 | &edge_buf, |
| 1649 | fi.cpu_feature_level, |
| 1650 | ); |
| 1651 | sse_wxh( |
| 1652 | &input.subregion(Area::BlockStartingAt { bo: tile_bo.0 }), |
| 1653 | &rec_region.as_const(), |
| 1654 | visible_tx_w, |
| 1655 | visible_tx_h, |
| 1656 | |_, _| DistortionScale::default(), // We're not doing RDO here. |
| 1657 | fi.sequence.bit_depth, |
| 1658 | fi.cpu_feature_level, |
| 1659 | ) |
| 1660 | .0 |
| 1661 | }; |
| 1662 | let mut best = (alpha_cost(0), 0); |
| 1663 | let mut count = 2; |
| 1664 | for alpha in 1i16..=16i16 { |
| 1665 | let cost = (alpha_cost(alpha), alpha_cost(-alpha)); |
| 1666 | if cost.0 < best.0 { |
| 1667 | best = (cost.0, alpha); |
| 1668 | count += 2; |
| 1669 | } |
| 1670 | if cost.1 < best.0 { |
| 1671 | best = (cost.1, -alpha); |
| 1672 | count += 2; |
| 1673 | } |
| 1674 | if count < alpha { |
| 1675 | break; |
| 1676 | } |
| 1677 | } |
| 1678 | best.1 |
| 1679 | }) |
| 1680 | .collect(); |
| 1681 | |
| 1682 | if best_alpha[0] == 0 && best_alpha[1] == 0 { |
| 1683 | None |
| 1684 | } else { |
| 1685 | Some(CFLParams::from_alpha(best_alpha[0], best_alpha[1])) |
| 1686 | } |
| 1687 | } |
| 1688 | |
| 1689 | /// RDO-based transform type decision |
| 1690 | /// If `cw_checkpoint` is `None`, a checkpoint for cw's (`ContextWriter`) current |
| 1691 | /// state is created and stored for later use. |
| 1692 | /// |
| 1693 | /// # Panics |
| 1694 | /// |
| 1695 | /// - If a writer checkpoint is never created before or within the function. |
| 1696 | /// This should never happen and indicates a development error. |
| 1697 | /// - If the best RD found is negative. |
| 1698 | /// This should never happen and indicates a development error. |
| 1699 | pub fn rdo_tx_type_decision<T: Pixel>( |
| 1700 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
| 1701 | cw: &mut ContextWriter, cw_checkpoint: &mut Option<ContextWriterCheckpoint>, |
| 1702 | mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], |
| 1703 | bsize: BlockSize, tile_bo: TileBlockOffset, tx_size: TxSize, tx_set: TxSet, |
| 1704 | tx_types: &[TxType], cur_best_rd: f64, |
| 1705 | ) -> (TxType, f64) { |
| 1706 | let mut best_type = TxType::DCT_DCT; |
| 1707 | let mut best_rd = std::f64::MAX; |
| 1708 | |
| 1709 | let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; |
| 1710 | let is_chroma_block = |
| 1711 | has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling); |
| 1712 | |
| 1713 | let is_inter = !mode.is_intra(); |
| 1714 | |
| 1715 | if cw_checkpoint.is_none() { |
| 1716 | // Only run the first call |
| 1717 | // Prevents creating multiple checkpoints for own version of cw |
| 1718 | *cw_checkpoint = |
| 1719 | Some(cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling)); |
| 1720 | } |
| 1721 | |
| 1722 | let rdo_type = if fi.use_tx_domain_distortion { |
| 1723 | RDOType::TxDistRealRate |
| 1724 | } else { |
| 1725 | RDOType::PixelDistRealRate |
| 1726 | }; |
| 1727 | let need_recon_pixel = tx_size.block_size() != bsize && !is_inter; |
| 1728 | |
| 1729 | let mut first_iteration = true; |
| 1730 | for &tx_type in tx_types { |
| 1731 | // Skip unsupported transform types |
| 1732 | if av1_tx_used[tx_set as usize][tx_type as usize] == 0 { |
| 1733 | continue; |
| 1734 | } |
| 1735 | |
| 1736 | if is_inter { |
| 1737 | motion_compensate( |
| 1738 | fi, ts, cw, mode, ref_frames, mvs, bsize, tile_bo, true, |
| 1739 | ); |
| 1740 | } |
| 1741 | |
| 1742 | let mut wr = WriterCounter::new(); |
| 1743 | let tell = wr.tell_frac(); |
| 1744 | let (_, tx_dist) = if is_inter { |
| 1745 | write_tx_tree( |
| 1746 | fi, |
| 1747 | ts, |
| 1748 | cw, |
| 1749 | &mut wr, |
| 1750 | mode, |
| 1751 | 0, |
| 1752 | tile_bo, |
| 1753 | bsize, |
| 1754 | tx_size, |
| 1755 | tx_type, |
| 1756 | false, |
| 1757 | true, |
| 1758 | rdo_type, |
| 1759 | need_recon_pixel, |
| 1760 | ) |
| 1761 | } else { |
| 1762 | write_tx_blocks( |
| 1763 | fi, |
| 1764 | ts, |
| 1765 | cw, |
| 1766 | &mut wr, |
| 1767 | mode, |
| 1768 | mode, |
| 1769 | AngleDelta::default(), |
| 1770 | tile_bo, |
| 1771 | bsize, |
| 1772 | tx_size, |
| 1773 | tx_type, |
| 1774 | false, |
| 1775 | CFLParams::default(), // Unused. |
| 1776 | true, |
| 1777 | rdo_type, |
| 1778 | need_recon_pixel, |
| 1779 | ) |
| 1780 | }; |
| 1781 | |
| 1782 | let rate = wr.tell_frac() - tell; |
| 1783 | let distortion = if fi.use_tx_domain_distortion { |
| 1784 | compute_tx_distortion( |
| 1785 | fi, |
| 1786 | ts, |
| 1787 | bsize, |
| 1788 | is_chroma_block, |
| 1789 | tile_bo, |
| 1790 | tx_dist, |
| 1791 | false, |
| 1792 | true, |
| 1793 | ) |
| 1794 | } else { |
| 1795 | compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, true) |
| 1796 | }; |
| 1797 | cw.rollback(cw_checkpoint.as_ref().unwrap()); |
| 1798 | |
| 1799 | let rd = compute_rd_cost(fi, rate, distortion); |
| 1800 | |
| 1801 | if first_iteration { |
| 1802 | // We use an optimization to early exit after testing the first |
| 1803 | // transform type if the cost is higher than the existing best. |
| 1804 | // The idea is that if this transform size is not better than he |
| 1805 | // previous size, it is not worth testing remaining modes for this size. |
| 1806 | if rd > cur_best_rd { |
| 1807 | break; |
| 1808 | } |
| 1809 | first_iteration = false; |
| 1810 | } |
| 1811 | |
| 1812 | if rd < best_rd { |
| 1813 | best_rd = rd; |
| 1814 | best_type = tx_type; |
| 1815 | } |
| 1816 | } |
| 1817 | |
| 1818 | assert!(best_rd >= 0_f64); |
| 1819 | |
| 1820 | (best_type, best_rd) |
| 1821 | } |
| 1822 | |
| 1823 | pub fn get_sub_partitions( |
| 1824 | four_partitions: &[TileBlockOffset; 4], partition: PartitionType, |
| 1825 | ) -> ArrayVec<TileBlockOffset, 4> { |
| 1826 | let mut partition_offsets: ArrayVec = ArrayVec::<TileBlockOffset, 4>::new(); |
| 1827 | |
| 1828 | partition_offsets.push(element:four_partitions[0]); |
| 1829 | |
| 1830 | if partition == PARTITION_NONE { |
| 1831 | return partition_offsets; |
| 1832 | } |
| 1833 | if partition == PARTITION_VERT || partition == PARTITION_SPLIT { |
| 1834 | partition_offsets.push(element:four_partitions[1]); |
| 1835 | }; |
| 1836 | if partition == PARTITION_HORZ || partition == PARTITION_SPLIT { |
| 1837 | partition_offsets.push(element:four_partitions[2]); |
| 1838 | }; |
| 1839 | if partition == PARTITION_SPLIT { |
| 1840 | partition_offsets.push(element:four_partitions[3]); |
| 1841 | }; |
| 1842 | |
| 1843 | partition_offsets |
| 1844 | } |
| 1845 | |
| 1846 | #[inline (always)] |
| 1847 | fn rdo_partition_none<T: Pixel>( |
| 1848 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
| 1849 | cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, |
| 1850 | inter_cfg: &InterConfig, child_modes: &mut ArrayVec<PartitionParameters, 4>, |
| 1851 | ) -> f64 { |
| 1852 | debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height); |
| 1853 | |
| 1854 | let mode: PartitionParameters = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg); |
| 1855 | let cost: f64 = mode.rd_cost; |
| 1856 | |
| 1857 | child_modes.push(element:mode); |
| 1858 | |
| 1859 | cost |
| 1860 | } |
| 1861 | |
| 1862 | // VERTICAL, HORIZONTAL or simple SPLIT |
| 1863 | #[inline (always)] |
| 1864 | fn rdo_partition_simple<T: Pixel, W: Writer>( |
| 1865 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
| 1866 | cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, |
| 1867 | bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, |
| 1868 | partition: PartitionType, rdo_type: RDOType, best_rd: f64, |
| 1869 | child_modes: &mut ArrayVec<PartitionParameters, 4>, |
| 1870 | ) -> Option<f64> { |
| 1871 | debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height); |
| 1872 | let subsize = bsize.subsize(partition).unwrap(); |
| 1873 | |
| 1874 | let cost = if bsize >= BlockSize::BLOCK_8X8 { |
| 1875 | let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef }; |
| 1876 | let tell = w.tell_frac(); |
| 1877 | cw.write_partition(w, tile_bo, partition, bsize); |
| 1878 | compute_rd_cost(fi, w.tell_frac() - tell, ScaledDistortion::zero()) |
| 1879 | } else { |
| 1880 | 0.0 |
| 1881 | }; |
| 1882 | |
| 1883 | let hbsw = subsize.width_mi(); // Half the block size width in blocks |
| 1884 | let hbsh = subsize.height_mi(); // Half the block size height in blocks |
| 1885 | let four_partitions = [ |
| 1886 | tile_bo, |
| 1887 | TileBlockOffset(BlockOffset { x: tile_bo.0.x + hbsw, y: tile_bo.0.y }), |
| 1888 | TileBlockOffset(BlockOffset { x: tile_bo.0.x, y: tile_bo.0.y + hbsh }), |
| 1889 | TileBlockOffset(BlockOffset { |
| 1890 | x: tile_bo.0.x + hbsw, |
| 1891 | y: tile_bo.0.y + hbsh, |
| 1892 | }), |
| 1893 | ]; |
| 1894 | |
| 1895 | let partitions = get_sub_partitions(&four_partitions, partition); |
| 1896 | |
| 1897 | let mut rd_cost_sum = 0.0; |
| 1898 | |
| 1899 | for offset in partitions { |
| 1900 | let hbs = subsize.width_mi() >> 1; |
| 1901 | let has_cols = offset.0.x + hbs < ts.mi_width; |
| 1902 | let has_rows = offset.0.y + hbs < ts.mi_height; |
| 1903 | |
| 1904 | if has_cols && has_rows { |
| 1905 | let mode_decision = |
| 1906 | rdo_mode_decision(fi, ts, cw, subsize, offset, inter_cfg); |
| 1907 | |
| 1908 | rd_cost_sum += mode_decision.rd_cost; |
| 1909 | |
| 1910 | if fi.enable_early_exit && rd_cost_sum > best_rd { |
| 1911 | return None; |
| 1912 | } |
| 1913 | if subsize >= BlockSize::BLOCK_8X8 && subsize.is_sqr() { |
| 1914 | let w: &mut W = |
| 1915 | if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef }; |
| 1916 | cw.write_partition(w, offset, PartitionType::PARTITION_NONE, subsize); |
| 1917 | } |
| 1918 | encode_block_with_modes( |
| 1919 | fi, |
| 1920 | ts, |
| 1921 | cw, |
| 1922 | w_pre_cdef, |
| 1923 | w_post_cdef, |
| 1924 | subsize, |
| 1925 | offset, |
| 1926 | &mode_decision, |
| 1927 | rdo_type, |
| 1928 | None, |
| 1929 | ); |
| 1930 | child_modes.push(mode_decision); |
| 1931 | } else { |
| 1932 | //rd_cost_sum += std::f64::MAX; |
| 1933 | return None; |
| 1934 | } |
| 1935 | } |
| 1936 | |
| 1937 | Some(cost + rd_cost_sum) |
| 1938 | } |
| 1939 | |
| 1940 | /// RDO-based single level partitioning decision |
| 1941 | /// |
| 1942 | /// # Panics |
| 1943 | /// |
| 1944 | /// - If the best RD found is negative. |
| 1945 | /// This should never happen, and indicates a development error. |
| 1946 | #[profiling::function ] |
| 1947 | pub fn rdo_partition_decision<T: Pixel, W: Writer>( |
| 1948 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
| 1949 | cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, |
| 1950 | bsize: BlockSize, tile_bo: TileBlockOffset, |
| 1951 | cached_block: &PartitionGroupParameters, partition_types: &[PartitionType], |
| 1952 | rdo_type: RDOType, inter_cfg: &InterConfig, |
| 1953 | ) -> PartitionGroupParameters { |
| 1954 | let mut best_partition = cached_block.part_type; |
| 1955 | let mut best_rd = cached_block.rd_cost; |
| 1956 | let mut best_pred_modes = cached_block.part_modes.clone(); |
| 1957 | |
| 1958 | let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); |
| 1959 | let w_pre_checkpoint = w_pre_cdef.checkpoint(); |
| 1960 | let w_post_checkpoint = w_post_cdef.checkpoint(); |
| 1961 | |
| 1962 | for &partition in partition_types { |
| 1963 | // Do not re-encode results we already have |
| 1964 | if partition == cached_block.part_type { |
| 1965 | continue; |
| 1966 | } |
| 1967 | |
| 1968 | let mut child_modes = ArrayVec::<_, 4>::new(); |
| 1969 | |
| 1970 | let cost = match partition { |
| 1971 | PARTITION_NONE if bsize <= BlockSize::BLOCK_64X64 => { |
| 1972 | Some(rdo_partition_none( |
| 1973 | fi, |
| 1974 | ts, |
| 1975 | cw, |
| 1976 | bsize, |
| 1977 | tile_bo, |
| 1978 | inter_cfg, |
| 1979 | &mut child_modes, |
| 1980 | )) |
| 1981 | } |
| 1982 | PARTITION_SPLIT | PARTITION_HORZ | PARTITION_VERT => { |
| 1983 | rdo_partition_simple( |
| 1984 | fi, |
| 1985 | ts, |
| 1986 | cw, |
| 1987 | w_pre_cdef, |
| 1988 | w_post_cdef, |
| 1989 | bsize, |
| 1990 | tile_bo, |
| 1991 | inter_cfg, |
| 1992 | partition, |
| 1993 | rdo_type, |
| 1994 | best_rd, |
| 1995 | &mut child_modes, |
| 1996 | ) |
| 1997 | } |
| 1998 | _ => { |
| 1999 | unreachable!(); |
| 2000 | } |
| 2001 | }; |
| 2002 | |
| 2003 | if let Some(rd) = cost { |
| 2004 | if rd < best_rd { |
| 2005 | best_rd = rd; |
| 2006 | best_partition = partition; |
| 2007 | best_pred_modes = child_modes.clone(); |
| 2008 | } |
| 2009 | } |
| 2010 | cw.rollback(&cw_checkpoint); |
| 2011 | w_pre_cdef.rollback(&w_pre_checkpoint); |
| 2012 | w_post_cdef.rollback(&w_post_checkpoint); |
| 2013 | } |
| 2014 | |
| 2015 | assert!(best_rd >= 0_f64); |
| 2016 | |
| 2017 | PartitionGroupParameters { |
| 2018 | rd_cost: best_rd, |
| 2019 | part_type: best_partition, |
| 2020 | part_modes: best_pred_modes, |
| 2021 | } |
| 2022 | } |
| 2023 | |
| 2024 | #[profiling::function ] |
| 2025 | fn rdo_loop_plane_error<T: Pixel>( |
| 2026 | base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset, |
| 2027 | sb_w: usize, sb_h: usize, fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, |
| 2028 | blocks: &TileBlocks<'_>, test: &Frame<T>, src: &Tile<'_, T>, pli: usize, |
| 2029 | ) -> ScaledDistortion { |
| 2030 | let sb_w_blocks = |
| 2031 | if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_w; |
| 2032 | let sb_h_blocks = |
| 2033 | if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_h; |
| 2034 | // Each direction block is 8x8 in y, potentially smaller if subsampled in chroma |
| 2035 | // accumulating in-frame and unpadded |
| 2036 | let mut err = Distortion::zero(); |
| 2037 | for by in 0..sb_h_blocks { |
| 2038 | for bx in 0..sb_w_blocks { |
| 2039 | let loop_bo = offset_sbo.block_offset(bx << 1, by << 1); |
| 2040 | if loop_bo.0.x < blocks.cols() && loop_bo.0.y < blocks.rows() { |
| 2041 | let src_plane = &src.planes[pli]; |
| 2042 | let test_plane = &test.planes[pli]; |
| 2043 | let PlaneConfig { xdec, ydec, .. } = *src_plane.plane_cfg; |
| 2044 | debug_assert_eq!(xdec, test_plane.cfg.xdec); |
| 2045 | debug_assert_eq!(ydec, test_plane.cfg.ydec); |
| 2046 | |
| 2047 | // Unfortunately, our distortion biases are only available via |
| 2048 | // Frame-absolute addressing, so we need a block offset |
| 2049 | // relative to the full frame origin (not the tile or analysis |
| 2050 | // area) |
| 2051 | let frame_bo = (base_sbo + offset_sbo).block_offset(bx << 1, by << 1); |
| 2052 | let bias = distortion_scale( |
| 2053 | fi, |
| 2054 | ts.to_frame_block_offset(frame_bo), |
| 2055 | BlockSize::BLOCK_8X8, |
| 2056 | ); |
| 2057 | |
| 2058 | let src_region = |
| 2059 | src_plane.subregion(Area::BlockStartingAt { bo: loop_bo.0 }); |
| 2060 | let test_region = |
| 2061 | test_plane.region(Area::BlockStartingAt { bo: loop_bo.0 }); |
| 2062 | |
| 2063 | err += if pli == 0 { |
| 2064 | // For loop filters, We intentionally use cdef_dist even with |
| 2065 | // `--tune Psnr`. Using SSE instead gives no PSNR gain but has a |
| 2066 | // significant negative impact on other metrics and visual quality. |
| 2067 | RawDistortion(cdef_dist_kernel( |
| 2068 | &src_region, |
| 2069 | &test_region, |
| 2070 | 8, |
| 2071 | 8, |
| 2072 | fi.sequence.bit_depth, |
| 2073 | fi.cpu_feature_level, |
| 2074 | ) as u64) |
| 2075 | * bias |
| 2076 | } else { |
| 2077 | sse_wxh( |
| 2078 | &src_region, |
| 2079 | &test_region, |
| 2080 | 8 >> xdec, |
| 2081 | 8 >> ydec, |
| 2082 | |_, _| bias, |
| 2083 | fi.sequence.bit_depth, |
| 2084 | fi.cpu_feature_level, |
| 2085 | ) |
| 2086 | }; |
| 2087 | } |
| 2088 | } |
| 2089 | } |
| 2090 | err * fi.dist_scale[pli] |
| 2091 | } |
| 2092 | |
/// Passed in a superblock offset representing the upper left corner of
/// the LRU area we're optimizing.  This area covers the largest LRU in
/// any of the present planes, but may consist of a number of
/// superblocks and full, smaller LRUs in the other planes
///
/// The function alternates between a per-superblock CDEF index search and
/// a per-LRU restoration filter search on scratch copies of the
/// reconstruction until a CDEF pass changes nothing. The selected CDEF
/// indices are stored through `set_cdef` on a subregion of `cw.bc.blocks`,
/// and the selected restoration filters are stored in the restoration units
/// of `ts.restoration`.
///
/// - `base_sbo`: upper left superblock of the analysis area, relative to
///   the tile.
/// - `w`: writer handed to `count_lrf_switchable` to obtain the rate term
///   of each restoration filter candidate.
/// - `deblock_p`: when `true`, the scratch reconstruction is deblocked
///   (with levels from `deblock_filter_optimize`) before the search.
///
/// Returns early, without changing any decision, when every block in the
/// area is skipped for both CDEF and LRF.
///
/// # Panics
///
/// - If both CDEF and LRF are disabled.
#[profiling::function]
pub fn rdo_loop_decision<T: Pixel, W: Writer>(
  base_sbo: TileSuperBlockOffset, fi: &FrameInvariants<T>,
  ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W,
  deblock_p: bool,
) {
  // Monochrome (4:0:0) input only has the luma plane.
  let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 {
    1
  } else {
    MAX_PLANES
  };
  assert!(fi.sequence.enable_cdef || fi.sequence.enable_restoration);
  // Determine area of optimization: Which plane has the largest LRUs?
  // How many LRUs for each?
  let mut sb_w = 1; // how many superblocks wide the largest LRU
                    // is/how many SBs we're processing (same thing)
  let mut sb_h = 1; // how many superblocks tall the largest LRU
                    // is/how many SBs we're processing (same thing)
  let mut lru_w = [0; MAX_PLANES]; // how many LRUs we're processing
  let mut lru_h = [0; MAX_PLANES]; // how many LRUs we're processing
  // First pass: the analysis area is as large as the largest LRU of any plane.
  for pli in 0..planes {
    let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift;
    let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift;
    if sb_w < (1 << sb_h_shift) {
      sb_w = 1 << sb_h_shift;
    }
    if sb_h < (1 << sb_v_shift) {
      sb_h = 1 << sb_v_shift;
    }
  }
  // Second pass: how many LRUs of each plane fit into that area.
  for pli in 0..planes {
    let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift;
    let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift;
    lru_w[pli] = sb_w / (1 << sb_h_shift);
    lru_h[pli] = sb_h / (1 << sb_v_shift);
  }

  // The superblock width/height determinations may be calling for us
  // to compute over superblocks that do not actually exist in the
  // frame (off the right or lower edge).  Trim sb width/height down
  // to actual superblocks.  Note that these last superblocks on the
  // right/bottom may themselves still span the edge of the frame, but
  // they do hold at least some visible pixels.
  sb_w = sb_w.min(ts.sb_width - base_sbo.0.x);
  sb_h = sb_h.min(ts.sb_height - base_sbo.0.y);

  // We also need to know the Y visible pixel limits (the
  // sb_w/sb_h figures above can be used to determine how many
  // allocated pixels, possibly beyond the visible frame, exist).
  let crop_w =
    fi.width - ((ts.sbo.0.x + base_sbo.0.x) << SUPERBLOCK_TO_PLANE_SHIFT);
  let crop_h =
    fi.height - ((ts.sbo.0.y + base_sbo.0.y) << SUPERBLOCK_TO_PLANE_SHIFT);
  let pixel_w = crop_w.min(sb_w << SUPERBLOCK_TO_PLANE_SHIFT);
  let pixel_h = crop_h.min(sb_h << SUPERBLOCK_TO_PLANE_SHIFT);

  // Based on `RestorationState::new`
  const MAX_SB_SHIFT: usize = 4;
  const MAX_SB_SIZE: usize = 1 << MAX_SB_SHIFT;
  const MAX_LRU_SIZE: usize = MAX_SB_SIZE;

  // Static allocation relies on the "minimal LRU area for all N planes" invariant.
  // -1 marks "no CDEF index chosen yet" for a superblock.
  let mut best_index = [-1; MAX_SB_SIZE * MAX_SB_SIZE];
  let mut best_lrf =
    [[RestorationFilter::None; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];

  // due to imprecision in the reconstruction parameter solver, we
  // need to make sure we don't fall into a limit cycle.  Track our
  // best cost at LRF so that we can break if we get a solution that doesn't
  // improve at the reconstruction stage.
  // A negative value marks "no cost recorded yet".
  let mut best_lrf_cost = [[-1.0; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];

  // sub-setted region of the TileBlocks for our working frame area.
  // Note that the size of this subset is what signals CDEF as to the
  // actual coded size.
  let mut tileblocks_subset = cw.bc.blocks.subregion_mut(
    base_sbo.block_offset(0, 0).0.x,
    base_sbo.block_offset(0, 0).0.y,
    sb_w << SUPERBLOCK_TO_BLOCK_SHIFT,
    sb_h << SUPERBLOCK_TO_BLOCK_SHIFT,
  );

  // cdef doesn't run on superblocks that are completely skipped.
  // Determine which super blocks are marked as skipped so we can avoid running
  // them. If all blocks are skipped, we can avoid some of the overhead related
  // to setting up for cdef.
  // Note: `cdef_skip` is indexed with a stride of MAX_SB_SIZE.
  let mut cdef_skip = [true; MAX_SB_SIZE * MAX_SB_SIZE];
  let mut cdef_skip_all = true;
  if fi.sequence.enable_cdef {
    for sby in 0..sb_h {
      for sbx in 0..sb_w {
        let blocks = tileblocks_subset.subregion(16 * sbx, 16 * sby, 16, 16);
        // A superblock counts as skipped only if every block in it is.
        let mut skip = true;
        for y in 0..blocks.rows() {
          for block in blocks[y].iter() {
            skip &= block.skip;
          }
        }
        cdef_skip[sby * MAX_SB_SIZE + sbx] = skip;
        cdef_skip_all &= skip;
      }
    }
  }

  // Unlike cdef, loop restoration will run regardless of whether blocks are
  // skipped or not. At the same time, the most significant improvement will
  // generally be from un-skipped blocks, so lru is only performed if there are
  // un-skipped blocks.
  // This should be the same as `cdef_skip_all`, except when cdef is disabled.
  // Note: `lru_skip` is indexed with a stride of MAX_LRU_SIZE.
  let mut lru_skip_all = true;
  let mut lru_skip = [[true; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];
  if fi.sequence.enable_restoration {
    if fi.config.speed_settings.lru_on_skip {
      // Speed setting asks for LRF on skipped blocks too: nothing is skipped.
      lru_skip_all = false;
      lru_skip = [[false; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE];
    } else {
      for pli in 0..planes {
        // width, in sb, of an LRU in this plane
        let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift;
        // height, in sb, of an LRU in this plane
        let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift;
        for lru_y in 0..lru_h[pli] {
          // number of LRUs vertically
          for lru_x in 0..lru_w[pli] {
            // number of LRUs horizontally

            let loop_sbo = TileSuperBlockOffset(SuperBlockOffset {
              x: lru_x * lru_sb_w,
              y: lru_y * lru_sb_h,
            });

            // No restoration unit here: leave the entry marked as skipped.
            if !ts.restoration.has_restoration_unit(
              base_sbo + loop_sbo,
              pli,
              false,
            ) {
              continue;
            }

            // Start and size of this LRU, converted to block units.
            let start = loop_sbo.block_offset(0, 0).0;
            let size = TileSuperBlockOffset(SuperBlockOffset {
              x: lru_sb_w,
              y: lru_sb_h,
            })
            .block_offset(0, 0)
            .0;

            let blocks =
              tileblocks_subset.subregion(start.x, start.y, size.x, size.y);
            let mut skip = true;
            for y in 0..blocks.rows() {
              for block in blocks[y].iter() {
                skip &= block.skip;
              }
            }
            lru_skip[lru_y * MAX_LRU_SIZE + lru_x][pli] = skip;
            lru_skip_all &= skip;
          }
        }
      }
    }
  }

  // Return early if all blocks are skipped for lru and cdef.
  if lru_skip_all && cdef_skip_all {
    return;
  }

  // Loop filter RDO is an iterative process and we need temporary
  // scratch data to hold the results of deblocking, cdef, and the
  // loop reconstruction filter so that each can be partially updated
  // without recomputing the entire stack.  Construct
  // largest-LRU-sized frames for each, accounting for padding
  // required by deblocking, cdef and [optionally] LR.
  // Width/height are rounded up to a multiple of 8.
  let mut rec_subset = ts
    .rec
    .subregion(Area::BlockRect {
      bo: base_sbo.block_offset(0, 0).0,
      width: (pixel_w + 7) >> 3 << 3,
      height: (pixel_h + 7) >> 3 << 3,
    })
    .scratch_copy();

  // const, no need to copy, just need the subregion (but do zero the
  // origin to match the other copies/new backing frames).
  let src_subset = ts
    .input_tile
    .subregion(Area::BlockRect {
      bo: base_sbo.block_offset(0, 0).0,
      width: (pixel_w + 7) >> 3 << 3,
      height: (pixel_h + 7) >> 3 << 3,
    })
    .home();

  if deblock_p {
    // Find a good deblocking filter solution for the passed in area.
    // This is not RDO of deblocking itself, merely a solution to get
    // better results from CDEF/LRF RDO.
    let deblock_levels = deblock_filter_optimize(
      fi,
      &rec_subset.as_tile(),
      &src_subset,
      &tileblocks_subset.as_const(),
      crop_w,
      crop_h,
    );

    // Deblock the contents of our reconstruction copy.
    if deblock_levels[0] != 0 || deblock_levels[1] != 0 {
      // copy ts.deblock because we need to set some of our own values here
      let mut deblock_copy = *ts.deblock;
      deblock_copy.levels = deblock_levels;

      // finally, deblock the temp frame
      deblock_filter_frame(
        &deblock_copy,
        &mut rec_subset.as_tile_mut(),
        &tileblocks_subset.as_const(),
        crop_w,
        crop_h,
        fi.sequence.bit_depth,
        planes,
      );
    }
  }

  // Scratch frame for CDEF output; only allocated if some superblock needs CDEF.
  let mut cdef_work =
    if !cdef_skip_all { Some(rec_subset.clone()) } else { None };
  // Scratch frame for LRF output; only allocated if some LRU needs LRF.
  let mut lrf_work = if !lru_skip_all {
    Some(Frame {
      planes: {
        let new_plane = |pli: usize| {
          let PlaneConfig { xdec, ydec, width, height, .. } =
            rec_subset.planes[pli].cfg;
          Plane::new(width, height, xdec, ydec, 0, 0)
        };
        [new_plane(0), new_plane(1), new_plane(2)]
      },
    })
  } else {
    None
  };

  // Precompute directional analysis for CDEF
  let cdef_data = {
    if cdef_work.is_some() {
      Some((
        &rec_subset,
        cdef_analyze_superblock_range(
          fi,
          &rec_subset,
          &tileblocks_subset.as_const(),
          sb_w,
          sb_h,
        ),
      ))
    } else {
      None
    }
  };

  // CDEF/LRF decision iteration
  // Start with a default of CDEF 0 and RestorationFilter::None
  // Try all CDEF options for each sb with current LRF; if new CDEF+LRF choice is better, select it.
  // Then try all LRF options with current CDEFs; if new CDEFs+LRF choice is better, select it.
  // If LRF choice changed for any plane, repeat until no changes
  // Limit iterations and where we break based on speed setting (in the TODO list ;-)
  let mut cdef_change = true;
  let mut lrf_change = true;
  while cdef_change || lrf_change {
    // search for improved cdef indices, superblock by superblock, if cdef is enabled.
    if let (Some((rec_copy, cdef_dirs)), Some(cdef_ref)) =
      (&cdef_data, &mut cdef_work.as_mut())
    {
      for sby in 0..sb_h {
        for sbx in 0..sb_w {
          // determine whether this superblock can be skipped
          if cdef_skip[sby * MAX_SB_SIZE + sbx] {
            continue;
          }

          let prev_best_index = best_index[sby * sb_w + sbx];
          // Negative cost/index mean "nothing evaluated yet".
          let mut best_cost = -1.;
          let mut best_new_index = -1i8;

          /* offset of the superblock we're currently testing within the larger
          analysis area */
          let loop_sbo =
            TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby });

          /* cdef index testing loop */
          for cdef_index in 0..(1 << fi.cdef_bits) {
            let mut err = ScaledDistortion::zero();
            let mut rate = 0;

            // Filter this superblock with the candidate index into the
            // CDEF scratch frame.
            cdef_filter_superblock(
              fi,
              &rec_subset,
              &mut cdef_ref.as_tile_mut(),
              &tileblocks_subset.as_const(),
              loop_sbo,
              cdef_index,
              &cdef_dirs[sby * sb_w + sbx],
            );
            // apply LRF if any
            for pli in 0..planes {
              // We need the cropped-to-visible-frame area of this SB
              let wh =
                if fi.sequence.use_128x128_superblock { 128 } else { 64 };
              let PlaneConfig { xdec, ydec, .. } = cdef_ref.planes[pli].cfg;
              let vis_width = (wh >> xdec).min(
                (crop_w >> xdec)
                  - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).x
                    as usize,
              );
              let vis_height = (wh >> ydec).min(
                (crop_h >> ydec)
                  - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).y
                    as usize,
              );
              // which LRU are we currently testing against?
              if let (Some((lru_x, lru_y)), Some(lrf_ref)) = {
                let rp = &ts.restoration.planes[pli];
                (
                  rp.restoration_unit_offset(base_sbo, loop_sbo, false),
                  &mut lrf_work,
                )
              } {
                // We have a valid LRU, apply LRF, compute error
                match best_lrf[lru_y * lru_w[pli] + lru_x][pli] {
                  RestorationFilter::None {} => {
                    // No restoration filter selected: measure the CDEF
                    // output directly.
                    err += rdo_loop_plane_error(
                      base_sbo,
                      loop_sbo,
                      1,
                      1,
                      fi,
                      ts,
                      &tileblocks_subset.as_const(),
                      cdef_ref,
                      &src_subset,
                      pli,
                    );
                    rate += if fi.sequence.enable_restoration {
                      cw.fc.count_lrf_switchable(
                        w,
                        &ts.restoration.as_const(),
                        best_lrf[lru_y * lru_w[pli] + lru_x][pli],
                        pli,
                      )
                    } else {
                      0 // no relative cost differences to different
                        // CDEF params.  If cdef is on, it's a wash.
                    };
                  }
                  RestorationFilter::Sgrproj { set, xqd } => {
                    // only run on this single superblock
                    let loop_po =
                      loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg);
                    // todo: experiment with borrowing border pixels
                    // rather than edge-extending. Right now this is
                    // hard-clipping to the superblock boundary.
                    setup_integral_image(
                      &mut ts.integral_buffer,
                      SOLVE_IMAGE_STRIDE,
                      vis_width,
                      vis_height,
                      vis_width,
                      vis_height,
                      &cdef_ref.planes[pli].slice(loop_po),
                      &cdef_ref.planes[pli].slice(loop_po),
                    );
                    // Filter the CDEF output into the LRF scratch frame.
                    sgrproj_stripe_filter(
                      set,
                      xqd,
                      fi,
                      &ts.integral_buffer,
                      SOLVE_IMAGE_STRIDE,
                      &cdef_ref.planes[pli].slice(loop_po),
                      &mut lrf_ref.planes[pli].region_mut(Area::Rect {
                        x: loop_po.x,
                        y: loop_po.y,
                        width: vis_width,
                        height: vis_height,
                      }),
                    );
                    err += rdo_loop_plane_error(
                      base_sbo,
                      loop_sbo,
                      1,
                      1,
                      fi,
                      ts,
                      &tileblocks_subset.as_const(),
                      lrf_ref,
                      &src_subset,
                      pli,
                    );
                    rate += cw.fc.count_lrf_switchable(
                      w,
                      &ts.restoration.as_const(),
                      best_lrf[lru_y * lru_w[pli] + lru_x][pli],
                      pli,
                    );
                  }
                  RestorationFilter::Wiener { .. } => unreachable!(), // coming soon
                }
              } else {
                // No actual LRU here, compute error directly from CDEF output.
                err += rdo_loop_plane_error(
                  base_sbo,
                  loop_sbo,
                  1,
                  1,
                  fi,
                  ts,
                  &tileblocks_subset.as_const(),
                  cdef_ref,
                  &src_subset,
                  pli,
                );
                // no relative cost differences to different
                // CDEF params.  If cdef is on, it's a wash.
                // rate += 0;
              }
            }

            let cost = compute_rd_cost(fi, rate, err);
            // The first candidate is always accepted (best_cost starts negative).
            if best_cost < 0. || cost < best_cost {
              best_cost = cost;
              best_new_index = cdef_index as i8;
            }
          }

          // Did we change any preexisting choices?
          if best_new_index != prev_best_index {
            cdef_change = true;
            best_index[sby * sb_w + sbx] = best_new_index;
            tileblocks_subset.set_cdef(loop_sbo, best_new_index as u8);
          }

          // Tile view spanning the whole CDEF scratch frame.
          let mut cdef_ref_tm = TileMut::new(
            cdef_ref,
            TileRect {
              x: 0,
              y: 0,
              width: cdef_ref.planes[0].cfg.width,
              height: cdef_ref.planes[0].cfg.height,
            },
          );

          // Keep cdef output up to date; we need it for restoration
          // both below and above (padding)
          cdef_filter_superblock(
            fi,
            rec_copy,
            &mut cdef_ref_tm,
            &tileblocks_subset.as_const(),
            loop_sbo,
            best_index[sby * sb_w + sbx] as u8,
            &cdef_dirs[sby * sb_w + sbx],
          );
        }
      }
    }

    // `cdef_change` is only set again by the CDEF search above, so the
    // iteration ends as soon as a CDEF pass changes nothing.
    if !cdef_change {
      break;
    }
    cdef_change = false;
    lrf_change = false;

    // search for improved restoration filter parameters if restoration is enabled
    if let Some(lrf_ref) = &mut lrf_work.as_mut() {
      let lrf_input = if cdef_work.is_some() {
        // When CDEF is enabled, we pull from the CDEF output
        cdef_work.as_ref().unwrap()
      } else {
        // When CDEF is disabled, we pull from the [optionally
        // deblocked] reconstruction
        &rec_subset
      };
      for pli in 0..planes {
        // Nominal size of LRU in pixels before clipping to visible frame
        let unit_size = ts.restoration.planes[pli].rp_cfg.unit_size;
        // width, in sb, of an LRU in this plane
        let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift;
        // height, in sb, of an LRU in this plane
        let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift;
        let PlaneConfig { xdec, ydec, .. } = lrf_ref.planes[pli].cfg;
        for lru_y in 0..lru_h[pli] {
          // number of LRUs vertically
          for lru_x in 0..lru_w[pli] {
            // number of LRUs horizontally

            // determine whether this lru should be skipped
            if lru_skip[lru_y * MAX_LRU_SIZE + lru_x][pli] {
              continue;
            }

            let loop_sbo = TileSuperBlockOffset(SuperBlockOffset {
              x: lru_x * lru_sb_w,
              y: lru_y * lru_sb_h,
            });
            if ts.restoration.has_restoration_unit(
              base_sbo + loop_sbo,
              pli,
              false,
            ) {
              let src_plane = &src_subset.planes[pli]; // uncompressed input for reference
              let lrf_in_plane = &lrf_input.planes[pli];
              let lrf_po = loop_sbo.plane_offset(src_plane.plane_cfg);
              // Start from the filter and cost selected in earlier iterations.
              let mut best_new_lrf = best_lrf[lru_y * lru_w[pli] + lru_x][pli];
              let mut best_cost =
                best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli];

              // Check the no filter option
              {
                let err = rdo_loop_plane_error(
                  base_sbo,
                  loop_sbo,
                  lru_sb_w,
                  lru_sb_h,
                  fi,
                  ts,
                  &tileblocks_subset.as_const(),
                  lrf_input,
                  &src_subset,
                  pli,
                );
                let rate = cw.fc.count_lrf_switchable(
                  w,
                  &ts.restoration.as_const(),
                  best_new_lrf,
                  pli,
                );

                let cost = compute_rd_cost(fi, rate, err);
                // Was this choice actually an improvement?
                if best_cost < 0. || cost < best_cost {
                  best_cost = cost;
                  best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost;
                  best_new_lrf = RestorationFilter::None;
                }
              }

              // Look for a self guided filter
              // We need the cropped-to-visible-frame computation area of this LRU
              let vis_width = unit_size.min(
                (crop_w >> xdec)
                  - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).x as usize,
              );
              let vis_height = unit_size.min(
                (crop_h >> ydec)
                  - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).y as usize,
              );

              // todo: experiment with borrowing border pixels
              // rather than edge-extending. Right now this is
              // hard-clipping to the superblock boundary.
              setup_integral_image(
                &mut ts.integral_buffer,
                SOLVE_IMAGE_STRIDE,
                vis_width,
                vis_height,
                vis_width,
                vis_height,
                &lrf_in_plane.slice(lrf_po),
                &lrf_in_plane.slice(lrf_po),
              );

              // Try every self-guided parameter set allowed by the speed
              // settings.
              for &set in get_sgr_sets(fi.config.speed_settings.sgr_complexity)
              {
                let (xqd0, xqd1) = sgrproj_solve(
                  set,
                  fi,
                  &ts.integral_buffer,
                  &src_plane
                    .subregion(Area::StartingAt { x: lrf_po.x, y: lrf_po.y }),
                  &lrf_in_plane.slice(lrf_po),
                  vis_width,
                  vis_height,
                );
                let current_lrf =
                  RestorationFilter::Sgrproj { set, xqd: [xqd0, xqd1] };
                // Apply the candidate filter into the LRF scratch frame.
                if let RestorationFilter::Sgrproj { set, xqd } = current_lrf {
                  sgrproj_stripe_filter(
                    set,
                    xqd,
                    fi,
                    &ts.integral_buffer,
                    SOLVE_IMAGE_STRIDE,
                    &lrf_in_plane.slice(lrf_po),
                    &mut lrf_ref.planes[pli].region_mut(Area::Rect {
                      x: lrf_po.x,
                      y: lrf_po.y,
                      width: vis_width,
                      height: vis_height,
                    }),
                  );
                }
                let err = rdo_loop_plane_error(
                  base_sbo,
                  loop_sbo,
                  lru_sb_w,
                  lru_sb_h,
                  fi,
                  ts,
                  &tileblocks_subset.as_const(),
                  lrf_ref,
                  &src_subset,
                  pli,
                );
                let rate = cw.fc.count_lrf_switchable(
                  w,
                  &ts.restoration.as_const(),
                  current_lrf,
                  pli,
                );
                let cost = compute_rd_cost(fi, rate, err);
                if cost < best_cost {
                  best_cost = cost;
                  best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost;
                  best_new_lrf = current_lrf;
                }
              }

              // Record a changed choice, both locally and in the
              // restoration unit itself.
              if best_lrf[lru_y * lru_w[pli] + lru_x][pli]
                .notequal(best_new_lrf)
              {
                best_lrf[lru_y * lru_w[pli] + lru_x][pli] = best_new_lrf;
                lrf_change = true;
                if let Some(ru) = ts.restoration.planes[pli]
                  .restoration_unit_mut(base_sbo + loop_sbo)
                {
                  ru.filter = best_new_lrf;
                }
              }
            }
          }
        }
      }
    }
  }
}
| 2746 | |
/// Checks that `estimate_rate(0, TxSize::TX_4X4, 0)` equals the
/// `RDO_RATE_TABLE[0][0][0]` entry.
#[test]
fn estimate_rate_test() {
  let actual = estimate_rate(0, TxSize::TX_4X4, 0);
  let expected = RDO_RATE_TABLE[0][0][0];
  assert_eq!(actual, expected);
}
| 2751 | |