1 | // Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved |
2 | // Copyright (c) 2017-2022, The rav1e contributors. All rights reserved |
3 | // |
4 | // This source code is subject to the terms of the BSD 2 Clause License and |
5 | // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | // was not distributed with this source code in the LICENSE file, you can |
7 | // obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | // Media Patent License 1.0 was not distributed with this source code in the |
9 | // PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | |
11 | #![allow (non_camel_case_types)] |
12 | |
13 | use crate::api::*; |
14 | use crate::cdef::*; |
15 | use crate::context::*; |
16 | use crate::cpu_features::CpuFeatureLevel; |
17 | use crate::deblock::*; |
18 | use crate::dist::*; |
19 | use crate::ec::{Writer, WriterCounter, OD_BITRES}; |
20 | use crate::encode_block_with_modes; |
21 | use crate::encoder::{FrameInvariants, IMPORTANCE_BLOCK_SIZE}; |
22 | use crate::frame::Frame; |
23 | use crate::frame::*; |
24 | use crate::header::ReferenceMode; |
25 | use crate::lrf::*; |
26 | use crate::mc::MotionVector; |
27 | use crate::me::estimate_motion; |
28 | use crate::me::MVSamplingMode; |
29 | use crate::me::MotionSearchResult; |
30 | use crate::motion_compensate; |
31 | use crate::partition::PartitionType::*; |
32 | use crate::partition::RefType::*; |
33 | use crate::partition::*; |
34 | use crate::predict::{ |
35 | luma_ac, AngleDelta, IntraEdgeFilterParameters, IntraParam, PredictionMode, |
36 | RAV1E_INTER_COMPOUND_MODES, RAV1E_INTER_MODES_MINIMAL, RAV1E_INTRA_MODES, |
37 | }; |
38 | use crate::rdo_tables::*; |
39 | use crate::tiling::*; |
40 | use crate::transform::{TxSet, TxSize, TxType, RAV1E_TX_TYPES}; |
41 | use crate::util::{init_slice_repeat_mut, Aligned, Pixel}; |
42 | use crate::write_tx_blocks; |
43 | use crate::write_tx_tree; |
44 | use crate::Tune; |
45 | use crate::{encode_block_post_cdef, encode_block_pre_cdef}; |
46 | |
47 | use arrayvec::*; |
48 | use itertools::izip; |
49 | use std::fmt; |
50 | use std::mem::MaybeUninit; |
51 | |
52 | #[derive (Copy, Clone, PartialEq, Eq)] |
53 | pub enum RDOType { |
54 | PixelDistRealRate, |
55 | TxDistRealRate, |
56 | TxDistEstRate, |
57 | } |
58 | |
59 | impl RDOType { |
60 | #[inline ] |
61 | pub const fn needs_tx_dist(self) -> bool { |
62 | match self { |
63 | // Pixel-domain distortion and exact ec rate |
64 | RDOType::PixelDistRealRate => false, |
65 | // Tx-domain distortion and exact ec rate |
66 | RDOType::TxDistRealRate => true, |
67 | // Tx-domain distortion and txdist-based rate |
68 | RDOType::TxDistEstRate => true, |
69 | } |
70 | } |
71 | #[inline ] |
72 | pub const fn needs_coeff_rate(self) -> bool { |
73 | match self { |
74 | RDOType::PixelDistRealRate => true, |
75 | RDOType::TxDistRealRate => true, |
76 | RDOType::TxDistEstRate => false, |
77 | } |
78 | } |
79 | } |
80 | |
81 | #[derive (Clone)] |
82 | pub struct PartitionGroupParameters { |
83 | pub rd_cost: f64, |
84 | pub part_type: PartitionType, |
85 | pub part_modes: ArrayVec<PartitionParameters, 4>, |
86 | } |
87 | |
88 | #[derive (Clone, Debug)] |
89 | pub struct PartitionParameters { |
90 | pub rd_cost: f64, |
91 | pub bo: TileBlockOffset, |
92 | pub bsize: BlockSize, |
93 | pub pred_mode_luma: PredictionMode, |
94 | pub pred_mode_chroma: PredictionMode, |
95 | pub pred_cfl_params: CFLParams, |
96 | pub angle_delta: AngleDelta, |
97 | pub ref_frames: [RefType; 2], |
98 | pub mvs: [MotionVector; 2], |
99 | pub skip: bool, |
100 | pub has_coeff: bool, |
101 | pub tx_size: TxSize, |
102 | pub tx_type: TxType, |
103 | pub sidx: u8, |
104 | } |
105 | |
106 | impl Default for PartitionParameters { |
107 | fn default() -> Self { |
108 | PartitionParameters { |
109 | rd_cost: std::f64::MAX, |
110 | bo: TileBlockOffset::default(), |
111 | bsize: BlockSize::BLOCK_32X32, |
112 | pred_mode_luma: PredictionMode::default(), |
113 | pred_mode_chroma: PredictionMode::default(), |
114 | pred_cfl_params: CFLParams::default(), |
115 | angle_delta: AngleDelta::default(), |
116 | ref_frames: [RefType::INTRA_FRAME, RefType::NONE_FRAME], |
117 | mvs: [MotionVector::default(); 2], |
118 | skip: false, |
119 | has_coeff: true, |
120 | tx_size: TxSize::TX_4X4, |
121 | tx_type: TxType::DCT_DCT, |
122 | sidx: 0, |
123 | } |
124 | } |
125 | } |
126 | |
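/// Estimate the rate cost of coding a block from its transform size and a
/// fast distortion measure, by piecewise-linear interpolation of the
/// precomputed `RDO_RATE_TABLE` (indexed by quantizer bin and transform size).
///
/// Illustrative form of the interpolation, with `(x0, y0)` and `(x1, y1)` the
/// two table entries bracketing `fast_distortion` and the slope held in Q8
/// fixed point:
///
/// `rate = max(y0 + (fast_distortion - x0) * (y1 - y0) / (x1 - x0), 0)`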
127 | pub fn estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64 { |
128 | let bs_index: usize = ts as usize; |
129 | let q_bin_idx: usize = (qindex as usize) / RDO_QUANT_DIV; |
130 | let bin_idx_down: u64 = |
131 | ((fast_distortion) / RATE_EST_BIN_SIZE).min((RDO_NUM_BINS - 2) as u64); |
132 | let bin_idx_up: u64 = (bin_idx_down + 1).min((RDO_NUM_BINS - 1) as u64); |
133 | let x0: i64 = (bin_idx_down * RATE_EST_BIN_SIZE) as i64; |
134 | let x1: i64 = (bin_idx_up * RATE_EST_BIN_SIZE) as i64; |
135 | let y0: i64 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_down as usize] as i64; |
136 | let y1: i64 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_up as usize] as i64; |
137 | let slope: i64 = ((y1 - y0) << 8) / (x1 - x0); |
138 | (y0 + (((fast_distortion as i64 - x0) * slope) >> 8)).max(0) as u64 |
139 | } |
140 | |
141 | #[allow (unused)] |
142 | pub fn cdef_dist_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>( |
143 | src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, |
144 | bit_depth: usize, compute_bias: F, cpu: CpuFeatureLevel, |
145 | ) -> Distortion { |
146 | debug_assert!(src1.plane_cfg.xdec == 0); |
147 | debug_assert!(src1.plane_cfg.ydec == 0); |
148 | debug_assert!(src2.plane_cfg.xdec == 0); |
149 | debug_assert!(src2.plane_cfg.ydec == 0); |
150 | |
151 | let mut sum = Distortion::zero(); |
152 | for y in (0..h).step_by(8) { |
153 | for x in (0..w).step_by(8) { |
154 | let kernel_h = (h - y).min(8); |
155 | let kernel_w = (w - x).min(8); |
156 | let area = Area::StartingAt { x: x as isize, y: y as isize }; |
157 | |
158 | let value = RawDistortion(cdef_dist_kernel( |
159 | &src1.subregion(area), |
160 | &src2.subregion(area), |
161 | kernel_w, |
162 | kernel_h, |
163 | bit_depth, |
164 | cpu, |
165 | ) as u64); |
166 | |
167 | // cdef is always called on non-subsampled planes, so BLOCK_8X8 is |
168 | // correct here. |
169 | sum += value * compute_bias(area, BlockSize::BLOCK_8X8); |
170 | } |
171 | } |
172 | sum |
173 | } |
174 | |
175 | /// Sum of Squared Error for a wxh block |
176 | /// Currently limited to w and h of valid blocks |
177 | pub fn sse_wxh<T: Pixel, F: Fn(Area, BlockSize) -> DistortionScale>( |
178 | src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, |
179 | compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel, |
180 | ) -> Distortion { |
181 | // See get_weighted_sse in src/dist.rs. |
182 | // Provide a scale to get_weighted_sse for each square region of this size. |
183 | const CHUNK_SIZE: usize = IMPORTANCE_BLOCK_SIZE >> 1; |
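  // (With the 8x8 importance blocks referenced elsewhere in this file, this
  // works out to a chunk size of 4.)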
184 | |
  // To bias the distortion correctly, compute it over regions no larger than
  // one importance block, measured in non-subsampled (luma) plane units.
187 | let imp_block_w = CHUNK_SIZE << src1.plane_cfg.xdec; |
188 | let imp_block_h = CHUNK_SIZE << src1.plane_cfg.ydec; |
189 | |
190 | let imp_bsize = BlockSize::from_width_and_height(imp_block_w, imp_block_h); |
191 | |
192 | let n_imp_blocks_w = (w + CHUNK_SIZE - 1) / CHUNK_SIZE; |
193 | let n_imp_blocks_h = (h + CHUNK_SIZE - 1) / CHUNK_SIZE; |
194 | |
195 | // TODO: Copying biases into a buffer is slow. It would be best if biases were |
196 | // passed directly. To do this, we would need different versions of the |
197 | // weighted sse function for decimated/subsampled data. Also requires |
198 | // eliminating use of unbiased sse. |
199 | // It should also be noted that the current copy code does not auto-vectorize. |
200 | |
201 | // Copy biases into a buffer. |
202 | let mut buf_storage = Aligned::new( |
203 | [MaybeUninit::<u32>::uninit(); 128 / CHUNK_SIZE * 128 / CHUNK_SIZE], |
204 | ); |
205 | let buf_stride = n_imp_blocks_w.next_power_of_two(); |
206 | let buf = init_slice_repeat_mut( |
207 | &mut buf_storage.data[..buf_stride * n_imp_blocks_h], |
208 | 0, |
209 | ); |
210 | |
211 | for block_y in 0..n_imp_blocks_h { |
212 | for block_x in 0..n_imp_blocks_w { |
213 | let block = Area::StartingAt { |
214 | x: (block_x * CHUNK_SIZE) as isize, |
215 | y: (block_y * CHUNK_SIZE) as isize, |
216 | }; |
217 | buf[block_y * buf_stride + block_x] = compute_bias(block, imp_bsize).0; |
218 | } |
219 | } |
220 | |
221 | Distortion(get_weighted_sse( |
222 | src1, src2, buf, buf_stride, w, h, bit_depth, cpu, |
223 | )) |
224 | } |
225 | |
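/// Compute the width and height of the portion of a block that lies inside
/// the frame. For example, a 64x64 block starting at x = 1900 in a frame
/// 1920 pixels wide has a visible width of 20; a block starting entirely
/// outside the frame has a visible size of 0.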
226 | pub const fn clip_visible_bsize( |
227 | frame_w: usize, frame_h: usize, bsize: BlockSize, x: usize, y: usize, |
228 | ) -> (usize, usize) { |
229 | let blk_w: usize = bsize.width(); |
230 | let blk_h: usize = bsize.height(); |
231 | |
232 | let visible_w: usize = if x + blk_w <= frame_w { |
233 | blk_w |
234 | } else if x >= frame_w { |
235 | 0 |
236 | } else { |
237 | frame_w - x |
238 | }; |
239 | |
240 | let visible_h: usize = if y + blk_h <= frame_h { |
241 | blk_h |
242 | } else if y >= frame_h { |
243 | 0 |
244 | } else { |
245 | frame_h - y |
246 | }; |
247 | |
248 | (visible_w, visible_h) |
249 | } |
250 | |
251 | // Compute the pixel-domain distortion for an encode |
252 | fn compute_distortion<T: Pixel>( |
253 | fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize, |
254 | is_chroma_block: bool, tile_bo: TileBlockOffset, luma_only: bool, |
255 | ) -> ScaledDistortion { |
256 | let area = Area::BlockStartingAt { bo: tile_bo.0 }; |
257 | let input_region = ts.input_tile.planes[0].subregion(area); |
258 | let rec_region = ts.rec.planes[0].subregion(area); |
259 | |
  // clip a block to have visible pixels only
261 | let frame_bo = ts.to_frame_block_offset(tile_bo); |
262 | let (visible_w, visible_h) = clip_visible_bsize( |
263 | fi.width, |
264 | fi.height, |
265 | bsize, |
266 | frame_bo.0.x << MI_SIZE_LOG2, |
267 | frame_bo.0.y << MI_SIZE_LOG2, |
268 | ); |
269 | |
270 | if visible_w == 0 || visible_h == 0 { |
271 | return ScaledDistortion::zero(); |
272 | } |
273 | |
274 | let mut distortion = match fi.config.tune { |
275 | Tune::Psychovisual => cdef_dist_wxh( |
276 | &input_region, |
277 | &rec_region, |
278 | visible_w, |
279 | visible_h, |
280 | fi.sequence.bit_depth, |
281 | |bias_area, bsize| { |
282 | distortion_scale( |
283 | fi, |
284 | input_region.subregion(bias_area).frame_block_offset(), |
285 | bsize, |
286 | ) |
287 | }, |
288 | fi.cpu_feature_level, |
289 | ), |
290 | Tune::Psnr => sse_wxh( |
291 | &input_region, |
292 | &rec_region, |
293 | visible_w, |
294 | visible_h, |
295 | |bias_area, bsize| { |
296 | distortion_scale( |
297 | fi, |
298 | input_region.subregion(bias_area).frame_block_offset(), |
299 | bsize, |
300 | ) |
301 | }, |
302 | fi.sequence.bit_depth, |
303 | fi.cpu_feature_level, |
304 | ), |
305 | } * fi.dist_scale[0]; |
306 | |
307 | if is_chroma_block |
308 | && !luma_only |
309 | && fi.sequence.chroma_sampling != ChromaSampling::Cs400 |
310 | { |
311 | let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; |
312 | let chroma_w = if bsize.width() >= 8 || xdec == 0 { |
313 | (visible_w + xdec) >> xdec |
314 | } else { |
315 | (4 + visible_w + xdec) >> xdec |
316 | }; |
317 | let chroma_h = if bsize.height() >= 8 || ydec == 0 { |
318 | (visible_h + ydec) >> ydec |
319 | } else { |
320 | (4 + visible_h + ydec) >> ydec |
321 | }; |
322 | |
323 | for p in 1..3 { |
324 | let input_region = ts.input_tile.planes[p].subregion(area); |
325 | let rec_region = ts.rec.planes[p].subregion(area); |
326 | distortion += sse_wxh( |
327 | &input_region, |
328 | &rec_region, |
329 | chroma_w, |
330 | chroma_h, |
331 | |bias_area, bsize| { |
332 | distortion_scale( |
333 | fi, |
334 | input_region.subregion(bias_area).frame_block_offset(), |
335 | bsize, |
336 | ) |
337 | }, |
338 | fi.sequence.bit_depth, |
339 | fi.cpu_feature_level, |
340 | ) * fi.dist_scale[p]; |
341 | } |
342 | } |
343 | distortion |
344 | } |
345 | |
346 | // Compute the transform-domain distortion for an encode |
347 | fn compute_tx_distortion<T: Pixel>( |
348 | fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, bsize: BlockSize, |
349 | is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion, |
350 | skip: bool, luma_only: bool, |
351 | ) -> ScaledDistortion { |
352 | assert!(fi.config.tune == Tune::Psnr); |
353 | let area = Area::BlockStartingAt { bo: tile_bo.0 }; |
354 | let input_region = ts.input_tile.planes[0].subregion(area); |
355 | let rec_region = ts.rec.planes[0].subregion(area); |
356 | |
357 | let (visible_w, visible_h) = if !skip { |
358 | (bsize.width(), bsize.height()) |
359 | } else { |
360 | let frame_bo = ts.to_frame_block_offset(tile_bo); |
361 | clip_visible_bsize( |
362 | fi.width, |
363 | fi.height, |
364 | bsize, |
365 | frame_bo.0.x << MI_SIZE_LOG2, |
366 | frame_bo.0.y << MI_SIZE_LOG2, |
367 | ) |
368 | }; |
369 | |
370 | if visible_w == 0 || visible_h == 0 { |
371 | return ScaledDistortion::zero(); |
372 | } |
373 | |
374 | let mut distortion = if skip { |
375 | sse_wxh( |
376 | &input_region, |
377 | &rec_region, |
378 | visible_w, |
379 | visible_h, |
380 | |bias_area, bsize| { |
381 | distortion_scale( |
382 | fi, |
383 | input_region.subregion(bias_area).frame_block_offset(), |
384 | bsize, |
385 | ) |
386 | }, |
387 | fi.sequence.bit_depth, |
388 | fi.cpu_feature_level, |
389 | ) * fi.dist_scale[0] |
390 | } else { |
391 | tx_dist |
392 | }; |
393 | |
394 | if is_chroma_block |
395 | && !luma_only |
396 | && skip |
397 | && fi.sequence.chroma_sampling != ChromaSampling::Cs400 |
398 | { |
399 | let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; |
400 | let chroma_w = if bsize.width() >= 8 || xdec == 0 { |
401 | (visible_w + xdec) >> xdec |
402 | } else { |
403 | (4 + visible_w + xdec) >> xdec |
404 | }; |
405 | let chroma_h = if bsize.height() >= 8 || ydec == 0 { |
406 | (visible_h + ydec) >> ydec |
407 | } else { |
408 | (4 + visible_h + ydec) >> ydec |
409 | }; |
410 | |
411 | for p in 1..3 { |
412 | let input_region = ts.input_tile.planes[p].subregion(area); |
413 | let rec_region = ts.rec.planes[p].subregion(area); |
414 | distortion += sse_wxh( |
415 | &input_region, |
416 | &rec_region, |
417 | chroma_w, |
418 | chroma_h, |
419 | |bias_area, bsize| { |
420 | distortion_scale( |
421 | fi, |
422 | input_region.subregion(bias_area).frame_block_offset(), |
423 | bsize, |
424 | ) |
425 | }, |
426 | fi.sequence.bit_depth, |
427 | fi.cpu_feature_level, |
428 | ) * fi.dist_scale[p]; |
429 | } |
430 | } |
431 | distortion |
432 | } |
433 | |
/// Compute a scaling factor to multiply the distortion of a block by.
/// The factor is determined using temporal RDO.
436 | /// |
437 | /// # Panics |
438 | /// |
439 | /// - If called with `bsize` of 8x8 or smaller |
440 | /// - If the coded frame data doesn't exist on the `FrameInvariants` |
441 | pub fn distortion_scale<T: Pixel>( |
442 | fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize, |
443 | ) -> DistortionScale { |
444 | if !fi.config.temporal_rdo() { |
445 | return DistortionScale::default(); |
446 | } |
447 | // EncoderConfig::temporal_rdo() should always return false in situations |
448 | // where distortion is computed on > 8x8 blocks, so we should never hit this |
449 | // assert. |
450 | assert!(bsize <= BlockSize::BLOCK_8X8); |
451 | |
452 | let x: usize = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; |
453 | let y: usize = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; |
454 | |
455 | let coded_data: &CodedFrameData = fi.coded_frame_data.as_ref().unwrap(); |
456 | coded_data.distortion_scales[y * coded_data.w_in_imp_b + x] |
457 | } |
458 | |
459 | /// # Panics |
460 | /// |
461 | /// - If the coded frame data doesn't exist on the `FrameInvariants` |
462 | pub fn spatiotemporal_scale<T: Pixel>( |
463 | fi: &FrameInvariants<T>, frame_bo: PlaneBlockOffset, bsize: BlockSize, |
464 | ) -> DistortionScale { |
465 | if !fi.config.temporal_rdo() && fi.config.tune != Tune::Psychovisual { |
466 | return DistortionScale::default(); |
467 | } |
468 | |
469 | let coded_data = fi.coded_frame_data.as_ref().unwrap(); |
470 | |
471 | let x0 = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; |
472 | let y0 = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; |
473 | let x1 = (x0 + bsize.width_imp_b()).min(coded_data.w_in_imp_b); |
474 | let y1 = (y0 + bsize.height_imp_b()).min(coded_data.h_in_imp_b); |
475 | let den = (((x1 - x0) * (y1 - y0)) as u64) << DistortionScale::SHIFT; |
476 | |
477 | // calling this on each slice individually improves autovectorization |
478 | // compared to using `Iterator::take` |
479 | #[inline (always)] |
480 | fn take_slice<T>(slice: &[T], n: usize) -> &[T] { |
481 | slice.get(..n).unwrap_or(slice) |
482 | } |
483 | |
484 | let mut sum = 0; |
485 | for y in y0..y1 { |
486 | sum += take_slice( |
487 | &coded_data.distortion_scales[y * coded_data.w_in_imp_b..][x0..x1], |
488 | MAX_SB_IN_IMP_B, |
489 | ) |
490 | .iter() |
491 | .zip( |
492 | take_slice( |
493 | &coded_data.activity_scales[y * coded_data.w_in_imp_b..][x0..x1], |
494 | MAX_SB_IN_IMP_B, |
495 | ) |
496 | .iter(), |
497 | ) |
498 | .map(|(d, a)| d.0 as u64 * a.0 as u64) |
499 | .sum::<u64>(); |
500 | } |
501 | DistortionScale(((sum + (den >> 1)) / den) as u32) |
502 | } |
503 | |
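/// Convert mbtree-style propagation data into a distortion scale.
///
/// Worked example of the formula derived in the comment below (with
/// strength = 1.0): if `propagate_cost == intra_cost`, the ratio is 2 and the
/// resulting scale is 2^(1/3), roughly 1.26, so distortion in heavily
/// referenced areas is weighted up by about 26%.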
504 | pub fn distortion_scale_for( |
505 | propagate_cost: f64, intra_cost: f64, |
506 | ) -> DistortionScale { |
507 | // The mbtree paper \cite{mbtree} uses the following formula: |
508 | // |
509 | // QP_delta = -strength * log2(1 + (propagate_cost / intra_cost)) |
510 | // |
511 | // Since this is H.264, this corresponds to the following quantizer: |
512 | // |
513 | // Q' = Q * 2^(QP_delta/6) |
514 | // |
  // Since lambda is proportional to Q^2, this means we want to minimize:
516 | // |
517 | // D + lambda' * R |
518 | // = D + 2^(QP_delta / 3) * lambda * R |
519 | // |
520 | // If we want to keep lambda fixed, we can instead scale distortion and |
521 | // minimize: |
522 | // |
523 | // D * scale + lambda * R |
524 | // |
525 | // where: |
526 | // |
527 | // scale = 2^(QP_delta / -3) |
528 | // = (1 + (propagate_cost / intra_cost))^(strength / 3) |
529 | // |
  // The original paper empirically chooses strength = 2.0, but strength = 1.0
  // currently works best in rav1e. This may be related to the paper using
  // 16x16 blocks whereas our "importance blocks" are 8x8, although everything
  // here should be scale invariant, so the discrepancy is not fully understood.
534 | // |
535 | // @article{mbtree, |
536 | // title={A novel macroblock-tree algorithm for high-performance |
537 | // optimization of dependent video coding in H.264/AVC}, |
538 | // author={Garrett-Glaser, Jason}, |
539 | // journal={Tech. Rep.}, |
540 | // year={2009}, |
541 | // url={https://pdfs.semanticscholar.org/032f/1ab7d9db385780a02eb2d579af8303b266d2.pdf} |
542 | // } |
543 | |
544 | if intra_cost == 0. { |
545 | return DistortionScale::default(); // no scaling |
546 | } |
547 | |
548 | let strength = 1.0; // empirical, see comment above |
549 | let frac = (intra_cost + propagate_cost) / intra_cost; |
550 | frac.powf(strength / 3.0).into() |
551 | } |
552 | |
553 | /// Fixed point arithmetic version of distortion scale |
554 | #[repr (transparent)] |
555 | #[derive (Copy, Clone)] |
556 | pub struct DistortionScale(pub u32); |
557 | |
558 | #[repr (transparent)] |
559 | pub struct RawDistortion(u64); |
560 | |
561 | #[repr (transparent)] |
562 | pub struct Distortion(pub u64); |
563 | |
564 | #[repr (transparent)] |
565 | pub struct ScaledDistortion(u64); |
566 | |
567 | impl DistortionScale { |
568 | /// Bits past the radix point |
569 | const SHIFT: u32 = 14; |
570 | /// Number of bits used. Determines the max value. |
571 | /// 28 bits is quite excessive. |
572 | const BITS: u32 = 28; |
573 | /// Maximum internal value |
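  /// With `SHIFT` = 14 this corresponds to a real-valued scale just below 16384.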
574 | const MAX: u64 = (1 << Self::BITS) - 1; |
575 | |
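  /// Create a scale equal to `num / den`, rounded to Q14 fixed point and
  /// saturated to `MAX`. For example, `DistortionScale::new(3, 2)` stores
  /// `24576`, which converts back to `1.5` via `f64::from`.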
576 | #[inline ] |
577 | pub const fn new(num: u64, den: u64) -> Self { |
578 | let raw = (num << Self::SHIFT).saturating_add(den / 2) / den; |
579 | let mask = (raw <= Self::MAX) as u64; |
580 | Self((mask * raw + (1 - mask) * Self::MAX) as u32) |
581 | } |
582 | |
583 | pub fn inv_mean(slice: &[Self]) -> Self { |
584 | use crate::util::{bexp64, blog32_q11}; |
585 | let sum = slice.iter().map(|&s| blog32_q11(s.0) as i64).sum::<i64>(); |
586 | let log_inv_mean_q11 = |
587 | (Self::SHIFT << 11) as i64 - sum / slice.len() as i64; |
588 | Self( |
589 | bexp64((log_inv_mean_q11 + (Self::SHIFT << 11) as i64) << (57 - 11)) |
590 | .clamp(1, (1 << Self::BITS) - 1) as u32, |
591 | ) |
592 | } |
593 | |
594 | /// Binary logarithm in Q11 |
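  /// For example, the default scale of 1.0 (`1 << SHIFT` internally) maps to 0.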
595 | #[inline ] |
596 | pub const fn blog16(self) -> i16 { |
597 | use crate::util::blog32_q11; |
598 | (blog32_q11(self.0) - ((Self::SHIFT as i32) << 11)) as i16 |
599 | } |
600 | |
601 | /// Binary logarithm in Q57 |
602 | #[inline ] |
603 | pub const fn blog64(self) -> i64 { |
604 | use crate::util::{blog64, q57}; |
605 | blog64(self.0 as i64) - q57(Self::SHIFT as i32) |
606 | } |
607 | |
608 | /// Multiply, round and shift |
609 | /// Internal implementation, so don't use multiply trait. |
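  /// For example, a scale of 1.5 (24576 in Q14) maps a distortion of 100 to 150.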
610 | #[inline ] |
611 | pub const fn mul_u64(self, dist: u64) -> u64 { |
612 | (self.0 as u64 * dist + (1 << Self::SHIFT >> 1)) >> Self::SHIFT |
613 | } |
614 | } |
615 | |
616 | impl std::ops::Mul for DistortionScale { |
617 | type Output = Self; |
618 | |
619 | /// Multiply, round and shift |
620 | #[inline ] |
621 | fn mul(self, rhs: Self) -> Self { |
622 | Self( |
623 | (((self.0 as u64 * rhs.0 as u64) + (1 << (Self::SHIFT - 1))) |
624 | >> Self::SHIFT) |
        .clamp(1, (1 << Self::BITS) - 1) as u32,
626 | ) |
627 | } |
628 | } |
629 | |
630 | impl std::ops::MulAssign for DistortionScale { |
631 | fn mul_assign(&mut self, rhs: Self) { |
632 | *self = *self * rhs; |
633 | } |
634 | } |
635 | |
636 | // Default value for DistortionScale is a fixed point 1 |
637 | impl Default for DistortionScale { |
638 | #[inline ] |
639 | fn default() -> Self { |
640 | Self(1 << Self::SHIFT) |
641 | } |
642 | } |
643 | |
644 | impl fmt::Debug for DistortionScale { |
645 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
    write!(f, "{}", f64::from(*self))
647 | } |
648 | } |
649 | |
650 | impl From<f64> for DistortionScale { |
651 | #[inline ] |
652 | fn from(scale: f64) -> Self { |
653 | let den: u64 = 1 << (Self::SHIFT + 1); |
654 | Self::new((scale * den as f64) as u64, den) |
655 | } |
656 | } |
657 | |
658 | impl From<DistortionScale> for f64 { |
659 | #[inline ] |
660 | fn from(scale: DistortionScale) -> Self { |
661 | scale.0 as f64 / (1 << DistortionScale::SHIFT) as f64 |
662 | } |
663 | } |
664 | |
665 | impl RawDistortion { |
666 | #[inline ] |
667 | pub const fn new(dist: u64) -> Self { |
668 | Self(dist) |
669 | } |
670 | } |
671 | |
672 | impl std::ops::Mul<DistortionScale> for RawDistortion { |
673 | type Output = Distortion; |
674 | #[inline ] |
675 | fn mul(self, rhs: DistortionScale) -> Distortion { |
676 | Distortion(rhs.mul_u64(self.0)) |
677 | } |
678 | } |
679 | |
680 | impl Distortion { |
681 | #[inline ] |
682 | pub const fn zero() -> Self { |
683 | Self(0) |
684 | } |
685 | } |
686 | |
687 | impl std::ops::Mul<DistortionScale> for Distortion { |
688 | type Output = ScaledDistortion; |
689 | #[inline ] |
690 | fn mul(self, rhs: DistortionScale) -> ScaledDistortion { |
691 | ScaledDistortion(rhs.mul_u64(self.0)) |
692 | } |
693 | } |
694 | |
695 | impl std::ops::AddAssign for Distortion { |
696 | #[inline ] |
697 | fn add_assign(&mut self, other: Self) { |
698 | self.0 += other.0; |
699 | } |
700 | } |
701 | |
702 | impl ScaledDistortion { |
703 | #[inline ] |
704 | pub const fn zero() -> Self { |
705 | Self(0) |
706 | } |
707 | } |
708 | |
709 | impl std::ops::AddAssign for ScaledDistortion { |
710 | #[inline ] |
711 | fn add_assign(&mut self, other: Self) { |
712 | self.0 += other.0; |
713 | } |
714 | } |
715 | |
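/// Compute the rate-distortion cost `distortion + lambda * rate`, where
/// `rate` is measured in units of `1/(1 << OD_BITRES)` of a bit and is
/// converted to bits before being weighted by `lambda`.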
716 | pub fn compute_rd_cost<T: Pixel>( |
717 | fi: &FrameInvariants<T>, rate: u32, distortion: ScaledDistortion, |
718 | ) -> f64 { |
719 | let rate_in_bits: f64 = (rate as f64) / ((1 << OD_BITRES) as f64); |
  fi.lambda.mul_add(rate_in_bits, distortion.0 as f64)
721 | } |
722 | |
723 | pub fn rdo_tx_size_type<T: Pixel>( |
724 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
725 | cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, |
726 | luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], |
727 | skip: bool, |
728 | ) -> (TxSize, TxType) { |
729 | let is_inter = !luma_mode.is_intra(); |
730 | let mut tx_size = max_txsize_rect_lookup[bsize as usize]; |
731 | |
732 | if fi.enable_inter_txfm_split && is_inter && !skip { |
733 | tx_size = sub_tx_size_map[tx_size as usize]; // Always choose one level split size |
734 | } |
735 | |
736 | let mut best_tx_type = TxType::DCT_DCT; |
737 | let mut best_tx_size = tx_size; |
738 | let mut best_rd = std::f64::MAX; |
739 | |
740 | let do_rdo_tx_size = fi.tx_mode_select |
741 | && fi.config.speed_settings.transform.rdo_tx_decision |
742 | && !is_inter; |
743 | let rdo_tx_depth = if do_rdo_tx_size { 2 } else { 0 }; |
744 | let mut cw_checkpoint: Option<ContextWriterCheckpoint> = None; |
745 | |
746 | for _ in 0..=rdo_tx_depth { |
747 | let tx_set = get_tx_set(tx_size, is_inter, fi.use_reduced_tx_set); |
748 | |
749 | let do_rdo_tx_type = tx_set > TxSet::TX_SET_DCTONLY |
750 | && fi.config.speed_settings.transform.rdo_tx_decision |
751 | && !is_inter |
752 | && !skip; |
753 | |
754 | if !do_rdo_tx_size && !do_rdo_tx_type { |
755 | return (best_tx_size, best_tx_type); |
756 | }; |
757 | |
758 | let tx_types = |
759 | if do_rdo_tx_type { RAV1E_TX_TYPES } else { &[TxType::DCT_DCT] }; |
760 | |
761 | // Luma plane transform type decision |
762 | let (tx_type, rd_cost) = rdo_tx_type_decision( |
763 | fi, |
764 | ts, |
765 | cw, |
766 | &mut cw_checkpoint, |
767 | luma_mode, |
768 | ref_frames, |
769 | mvs, |
770 | bsize, |
771 | tile_bo, |
772 | tx_size, |
773 | tx_set, |
774 | tx_types, |
775 | best_rd, |
776 | ); |
777 | |
778 | if rd_cost < best_rd { |
779 | best_tx_size = tx_size; |
780 | best_tx_type = tx_type; |
781 | best_rd = rd_cost; |
782 | } |
783 | |
784 | debug_assert!(tx_size.width_log2() <= bsize.width_log2()); |
785 | debug_assert!(tx_size.height_log2() <= bsize.height_log2()); |
786 | debug_assert!( |
787 | tx_size.sqr() <= TxSize::TX_32X32 || tx_type == TxType::DCT_DCT |
788 | ); |
789 | |
790 | let next_tx_size = sub_tx_size_map[tx_size as usize]; |
791 | |
792 | if next_tx_size == tx_size { |
793 | break; |
794 | } else { |
795 | tx_size = next_tx_size; |
796 | }; |
797 | } |
798 | |
799 | (best_tx_size, best_tx_type) |
800 | } |
801 | |
802 | #[inline ] |
803 | const fn dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool { |
804 | let diff_row: i32 = mv.row as i32 - ref_mv.row as i32; |
805 | let diff_col: i32 = mv.col as i32 - ref_mv.col as i32; |
806 | diff_row >= MV_LOW |
807 | && diff_row <= MV_UPP |
808 | && diff_col >= MV_LOW |
809 | && diff_col <= MV_UPP |
810 | } |
811 | |
812 | #[inline ] |
813 | #[profiling::function ] |
814 | fn luma_chroma_mode_rdo<T: Pixel>( |
815 | luma_mode: PredictionMode, fi: &FrameInvariants<T>, bsize: BlockSize, |
816 | tile_bo: TileBlockOffset, ts: &mut TileStateMut<'_, T>, |
817 | cw: &mut ContextWriter, rdo_type: RDOType, |
818 | cw_checkpoint: &ContextWriterCheckpoint, best: &mut PartitionParameters, |
819 | mvs: [MotionVector; 2], ref_frames: [RefType; 2], |
820 | mode_set_chroma: &[PredictionMode], luma_mode_is_intra: bool, |
821 | mode_context: usize, mv_stack: &ArrayVec<CandidateMV, 9>, |
822 | angle_delta: AngleDelta, |
823 | ) { |
824 | let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; |
825 | |
826 | let is_chroma_block = |
827 | has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling); |
828 | |
829 | if !luma_mode_is_intra { |
830 | let ref_mvs = if mv_stack.is_empty() { |
831 | [MotionVector::default(); 2] |
832 | } else { |
833 | [mv_stack[0].this_mv, mv_stack[0].comp_mv] |
834 | }; |
835 | |
836 | if (luma_mode == PredictionMode::NEWMV |
837 | || luma_mode == PredictionMode::NEW_NEWMV |
838 | || luma_mode == PredictionMode::NEW_NEARESTMV) |
839 | && !dmv_in_range(mvs[0], ref_mvs[0]) |
840 | { |
841 | return; |
842 | } |
843 | |
844 | if (luma_mode == PredictionMode::NEW_NEWMV |
845 | || luma_mode == PredictionMode::NEAREST_NEWMV) |
846 | && !dmv_in_range(mvs[1], ref_mvs[1]) |
847 | { |
848 | return; |
849 | } |
850 | } |
851 | |
852 | // Find the best chroma prediction mode for the current luma prediction mode |
853 | let mut chroma_rdo = |skip: bool| -> bool { |
854 | use crate::segmentation::select_segment; |
855 | |
856 | let mut zero_distortion = false; |
857 | |
858 | for sidx in select_segment(fi, ts, tile_bo, bsize, skip) { |
859 | cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, sidx); |
860 | |
861 | let (tx_size, tx_type) = rdo_tx_size_type( |
862 | fi, ts, cw, bsize, tile_bo, luma_mode, ref_frames, mvs, skip, |
863 | ); |
864 | for &chroma_mode in mode_set_chroma.iter() { |
865 | let wr = &mut WriterCounter::new(); |
866 | let tell = wr.tell_frac(); |
867 | |
868 | if bsize >= BlockSize::BLOCK_8X8 && bsize.is_sqr() { |
869 | cw.write_partition( |
870 | wr, |
871 | tile_bo, |
872 | PartitionType::PARTITION_NONE, |
873 | bsize, |
874 | ); |
875 | } |
876 | |
877 | // TODO(yushin): luma and chroma would have different decision based on chroma format |
878 | let need_recon_pixel = |
879 | luma_mode_is_intra && tx_size.block_size() != bsize; |
880 | |
881 | encode_block_pre_cdef(&fi.sequence, ts, cw, wr, bsize, tile_bo, skip); |
882 | let (has_coeff, tx_dist) = encode_block_post_cdef( |
883 | fi, |
884 | ts, |
885 | cw, |
886 | wr, |
887 | luma_mode, |
888 | chroma_mode, |
889 | angle_delta, |
890 | ref_frames, |
891 | mvs, |
892 | bsize, |
893 | tile_bo, |
894 | skip, |
895 | CFLParams::default(), |
896 | tx_size, |
897 | tx_type, |
898 | mode_context, |
899 | mv_stack, |
900 | rdo_type, |
901 | need_recon_pixel, |
902 | None, |
903 | ); |
904 | |
905 | let rate = wr.tell_frac() - tell; |
906 | let distortion = if fi.use_tx_domain_distortion && !need_recon_pixel { |
907 | compute_tx_distortion( |
908 | fi, |
909 | ts, |
910 | bsize, |
911 | is_chroma_block, |
912 | tile_bo, |
913 | tx_dist, |
914 | skip, |
915 | false, |
916 | ) |
917 | } else { |
918 | compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false) |
919 | }; |
920 | let is_zero_dist = distortion.0 == 0; |
921 | let rd = compute_rd_cost(fi, rate, distortion); |
922 | if rd < best.rd_cost { |
923 | //if rd < best.rd_cost || luma_mode == PredictionMode::NEW_NEWMV { |
924 | best.rd_cost = rd; |
925 | best.pred_mode_luma = luma_mode; |
926 | best.pred_mode_chroma = chroma_mode; |
927 | best.angle_delta = angle_delta; |
928 | best.ref_frames = ref_frames; |
929 | best.mvs = mvs; |
930 | best.skip = skip; |
931 | best.has_coeff = has_coeff; |
932 | best.tx_size = tx_size; |
933 | best.tx_type = tx_type; |
934 | best.sidx = sidx; |
935 | zero_distortion = is_zero_dist; |
936 | } |
937 | |
938 | cw.rollback(cw_checkpoint); |
939 | } |
940 | } |
941 | |
942 | zero_distortion |
943 | }; |
944 | |
945 | // Don't skip when using intra modes |
946 | let zero_distortion = |
947 | if !luma_mode_is_intra { chroma_rdo(true) } else { false }; |
948 | // early skip |
949 | if !zero_distortion { |
950 | chroma_rdo(false); |
951 | } |
952 | } |
953 | |
954 | /// RDO-based mode decision |
955 | /// |
956 | /// # Panics |
957 | /// |
958 | /// - If the best RD found is negative. |
959 | /// This should never happen and indicates a development error. |
960 | #[profiling::function ] |
961 | pub fn rdo_mode_decision<T: Pixel>( |
962 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
963 | cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, |
964 | inter_cfg: &InterConfig, |
965 | ) -> PartitionParameters { |
966 | let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; |
967 | let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); |
968 | |
969 | let rdo_type = if fi.use_tx_domain_rate { |
970 | RDOType::TxDistEstRate |
971 | } else if fi.use_tx_domain_distortion { |
972 | RDOType::TxDistRealRate |
973 | } else { |
974 | RDOType::PixelDistRealRate |
975 | }; |
976 | |
977 | let mut best = if fi.frame_type.has_inter() { |
978 | assert!(fi.frame_type != FrameType::KEY); |
979 | |
980 | inter_frame_rdo_mode_decision( |
981 | fi, |
982 | ts, |
983 | cw, |
984 | bsize, |
985 | tile_bo, |
986 | inter_cfg, |
987 | &cw_checkpoint, |
988 | rdo_type, |
989 | ) |
990 | } else { |
991 | PartitionParameters::default() |
992 | }; |
993 | |
994 | let is_chroma_block = |
995 | has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling); |
996 | |
997 | if !best.skip { |
998 | best = intra_frame_rdo_mode_decision( |
999 | fi, |
1000 | ts, |
1001 | cw, |
1002 | bsize, |
1003 | tile_bo, |
1004 | &cw_checkpoint, |
1005 | rdo_type, |
1006 | best, |
1007 | is_chroma_block, |
1008 | ); |
1009 | } |
1010 | |
1011 | if best.pred_mode_luma.is_intra() && is_chroma_block && bsize.cfl_allowed() { |
1012 | cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, best.sidx); |
1013 | |
1014 | let chroma_mode = PredictionMode::UV_CFL_PRED; |
1015 | let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); |
1016 | let mut wr = WriterCounter::new(); |
1017 | let angle_delta = AngleDelta { y: best.angle_delta.y, uv: 0 }; |
1018 | |
1019 | write_tx_blocks( |
1020 | fi, |
1021 | ts, |
1022 | cw, |
1023 | &mut wr, |
1024 | best.pred_mode_luma, |
1025 | best.pred_mode_luma, |
1026 | angle_delta, |
1027 | tile_bo, |
1028 | bsize, |
1029 | best.tx_size, |
1030 | best.tx_type, |
1031 | false, |
1032 | CFLParams::default(), |
1033 | true, |
1034 | rdo_type, |
1035 | true, |
1036 | ); |
1037 | cw.rollback(&cw_checkpoint); |
1038 | if fi.sequence.chroma_sampling != ChromaSampling::Cs400 { |
1039 | if let Some(cfl) = rdo_cfl_alpha(ts, tile_bo, bsize, best.tx_size, fi) { |
1040 | let mut wr = WriterCounter::new(); |
1041 | let tell = wr.tell_frac(); |
1042 | |
1043 | encode_block_pre_cdef( |
1044 | &fi.sequence, |
1045 | ts, |
1046 | cw, |
1047 | &mut wr, |
1048 | bsize, |
1049 | tile_bo, |
1050 | best.skip, |
1051 | ); |
1052 | let (has_coeff, _) = encode_block_post_cdef( |
1053 | fi, |
1054 | ts, |
1055 | cw, |
1056 | &mut wr, |
1057 | best.pred_mode_luma, |
1058 | chroma_mode, |
1059 | angle_delta, |
1060 | best.ref_frames, |
1061 | best.mvs, |
1062 | bsize, |
1063 | tile_bo, |
1064 | best.skip, |
1065 | cfl, |
1066 | best.tx_size, |
1067 | best.tx_type, |
1068 | 0, |
1069 | &[], |
1070 | rdo_type, |
1071 | true, // For CFL, luma should be always reconstructed. |
1072 | None, |
1073 | ); |
1074 | |
1075 | let rate = wr.tell_frac() - tell; |
1076 | |
1077 | // For CFL, tx-domain distortion is not an option. |
1078 | let distortion = |
1079 | compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false); |
1080 | let rd = compute_rd_cost(fi, rate, distortion); |
1081 | if rd < best.rd_cost { |
1082 | best.rd_cost = rd; |
1083 | best.pred_mode_chroma = chroma_mode; |
1084 | best.angle_delta = angle_delta; |
1085 | best.has_coeff = has_coeff; |
1086 | best.pred_cfl_params = cfl; |
1087 | } |
1088 | |
1089 | cw.rollback(&cw_checkpoint); |
1090 | } |
1091 | } |
1092 | } |
1093 | |
1094 | cw.bc.blocks.set_mode(tile_bo, bsize, best.pred_mode_luma); |
1095 | cw.bc.blocks.set_ref_frames(tile_bo, bsize, best.ref_frames); |
1096 | cw.bc.blocks.set_motion_vectors(tile_bo, bsize, best.mvs); |
1097 | |
1098 | assert!(best.rd_cost >= 0_f64); |
1099 | |
1100 | PartitionParameters { |
1101 | bo: tile_bo, |
1102 | bsize, |
1103 | pred_mode_luma: best.pred_mode_luma, |
1104 | pred_mode_chroma: best.pred_mode_chroma, |
1105 | pred_cfl_params: best.pred_cfl_params, |
1106 | angle_delta: best.angle_delta, |
1107 | ref_frames: best.ref_frames, |
1108 | mvs: best.mvs, |
1109 | rd_cost: best.rd_cost, |
1110 | skip: best.skip, |
1111 | has_coeff: best.has_coeff, |
1112 | tx_size: best.tx_size, |
1113 | tx_type: best.tx_type, |
1114 | sidx: best.sidx, |
1115 | } |
1116 | } |
1117 | |
1118 | #[profiling::function ] |
1119 | fn inter_frame_rdo_mode_decision<T: Pixel>( |
1120 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
1121 | cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, |
1122 | inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint, |
1123 | rdo_type: RDOType, |
1124 | ) -> PartitionParameters { |
1125 | let mut best = PartitionParameters::default(); |
1126 | |
1127 | // we can never have more than 7 reference frame sets |
1128 | let mut ref_frames_set = ArrayVec::<_, 7>::new(); |
1129 | // again, max of 7 ref slots |
1130 | let mut ref_slot_set = ArrayVec::<_, 7>::new(); |
1131 | // our implementation never returns more than 3 at the moment |
1132 | let mut mvs_from_me = ArrayVec::<_, 3>::new(); |
1133 | let mut fwdref = None; |
1134 | let mut bwdref = None; |
1135 | |
1136 | for i in inter_cfg.allowed_ref_frames().iter().copied() { |
1137 | // Don't search LAST3 since it's used only for probs |
1138 | if i == LAST3_FRAME { |
1139 | continue; |
1140 | } |
1141 | |
1142 | if !ref_slot_set.contains(&fi.ref_frames[i.to_index()]) { |
1143 | if fwdref.is_none() && i.is_fwd_ref() { |
1144 | fwdref = Some(ref_frames_set.len()); |
1145 | } |
1146 | if bwdref.is_none() && i.is_bwd_ref() { |
1147 | bwdref = Some(ref_frames_set.len()); |
1148 | } |
1149 | ref_frames_set.push([i, NONE_FRAME]); |
1150 | let slot_idx = fi.ref_frames[i.to_index()]; |
1151 | ref_slot_set.push(slot_idx); |
1152 | } |
1153 | } |
1154 | assert!(!ref_frames_set.is_empty()); |
1155 | |
1156 | let mut inter_mode_set = ArrayVec::<(PredictionMode, usize), 20>::new(); |
1157 | let mut mvs_set = ArrayVec::<[MotionVector; 2], 20>::new(); |
1158 | let mut satds = ArrayVec::<u32, 20>::new(); |
1159 | let mut mv_stacks = ArrayVec::<_, 20>::new(); |
1160 | let mut mode_contexts = ArrayVec::<_, 7>::new(); |
1161 | |
1162 | for (i, &ref_frames) in ref_frames_set.iter().enumerate() { |
1163 | let mut mv_stack = ArrayVec::<CandidateMV, 9>::new(); |
1164 | mode_contexts.push(cw.find_mvrefs( |
1165 | tile_bo, |
1166 | ref_frames, |
1167 | &mut mv_stack, |
1168 | bsize, |
1169 | fi, |
1170 | false, |
1171 | )); |
1172 | |
1173 | let mut pmv = [MotionVector::default(); 2]; |
1174 | if !mv_stack.is_empty() { |
1175 | pmv[0] = mv_stack[0].this_mv; |
1176 | } |
1177 | if mv_stack.len() > 1 { |
1178 | pmv[1] = mv_stack[1].this_mv; |
1179 | } |
1180 | |
1181 | let res = estimate_motion( |
1182 | fi, |
1183 | ts, |
1184 | bsize.width(), |
1185 | bsize.height(), |
1186 | tile_bo, |
1187 | ref_frames[0], |
1188 | Some(pmv), |
1189 | MVSamplingMode::CORNER { right: true, bottom: true }, |
1190 | false, |
1191 | 0, |
1192 | None, |
1193 | ) |
1194 | .unwrap_or_else(MotionSearchResult::empty); |
1195 | let b_me = res.mv; |
1196 | |
1197 | mvs_from_me.push([b_me, MotionVector::default()]); |
1198 | |
1199 | for &x in RAV1E_INTER_MODES_MINIMAL { |
1200 | inter_mode_set.push((x, i)); |
1201 | } |
1202 | if !mv_stack.is_empty() { |
1203 | inter_mode_set.push((PredictionMode::NEAR0MV, i)); |
1204 | } |
1205 | if mv_stack.len() >= 2 { |
1206 | inter_mode_set.push((PredictionMode::GLOBALMV, i)); |
1207 | } |
1208 | let include_near_mvs = fi.config.speed_settings.motion.include_near_mvs; |
1209 | if include_near_mvs { |
1210 | if mv_stack.len() >= 3 { |
1211 | inter_mode_set.push((PredictionMode::NEAR1MV, i)); |
1212 | } |
1213 | if mv_stack.len() >= 4 { |
1214 | inter_mode_set.push((PredictionMode::NEAR2MV, i)); |
1215 | } |
1216 | } |
1217 | let same_row_col = |x: &CandidateMV| { |
1218 | x.this_mv.row == mvs_from_me[i][0].row |
1219 | && x.this_mv.col == mvs_from_me[i][0].col |
1220 | }; |
1221 | if !mv_stack |
1222 | .iter() |
1223 | .take(if include_near_mvs { 4 } else { 2 }) |
1224 | .any(same_row_col) |
1225 | && (mvs_from_me[i][0].row != 0 || mvs_from_me[i][0].col != 0) |
1226 | { |
1227 | inter_mode_set.push((PredictionMode::NEWMV, i)); |
1228 | } |
1229 | |
1230 | mv_stacks.push(mv_stack); |
1231 | } |
1232 | |
1233 | let sz = bsize.width_mi().min(bsize.height_mi()); |
1234 | |
1235 | // To use non single reference modes, block width and height must be greater than 4. |
1236 | if fi.reference_mode != ReferenceMode::SINGLE && sz >= 2 { |
1237 | // Adding compound candidate |
1238 | if let Some(r0) = fwdref { |
1239 | if let Some(r1) = bwdref { |
1240 | let ref_frames = [ref_frames_set[r0][0], ref_frames_set[r1][0]]; |
1241 | ref_frames_set.push(ref_frames); |
1242 | let mv0 = mvs_from_me[r0][0]; |
1243 | let mv1 = mvs_from_me[r1][0]; |
1244 | mvs_from_me.push([mv0, mv1]); |
1245 | let mut mv_stack = ArrayVec::<CandidateMV, 9>::new(); |
1246 | mode_contexts.push(cw.find_mvrefs( |
1247 | tile_bo, |
1248 | ref_frames, |
1249 | &mut mv_stack, |
1250 | bsize, |
1251 | fi, |
1252 | true, |
1253 | )); |
1254 | for &x in RAV1E_INTER_COMPOUND_MODES { |
1255 | // exclude any NEAR mode based on speed setting |
1256 | if fi.config.speed_settings.motion.include_near_mvs |
1257 | || !x.has_nearmv() |
1258 | { |
1259 | let mv_stack_idx = ref_frames_set.len() - 1; |
1260 | // exclude NEAR modes if the mv_stack is too short |
1261 | if !(x.has_nearmv() && x.ref_mv_idx() >= mv_stack.len()) { |
1262 | inter_mode_set.push((x, mv_stack_idx)); |
1263 | } |
1264 | } |
1265 | } |
1266 | mv_stacks.push(mv_stack); |
1267 | } |
1268 | } |
1269 | } |
1270 | |
1271 | let num_modes_rdo = if fi.config.speed_settings.prediction.prediction_modes |
1272 | >= PredictionModesSetting::ComplexAll |
1273 | { |
1274 | inter_mode_set.len() |
1275 | } else { |
1276 | 9 // This number is determined by AWCY test |
1277 | }; |
1278 | |
1279 | inter_mode_set.iter().for_each(|&(luma_mode, i)| { |
1280 | let mvs = match luma_mode { |
1281 | PredictionMode::NEWMV | PredictionMode::NEW_NEWMV => mvs_from_me[i], |
1282 | PredictionMode::NEARESTMV | PredictionMode::NEAREST_NEARESTMV => { |
1283 | if !mv_stacks[i].is_empty() { |
1284 | [mv_stacks[i][0].this_mv, mv_stacks[i][0].comp_mv] |
1285 | } else { |
1286 | [MotionVector::default(); 2] |
1287 | } |
1288 | } |
1289 | PredictionMode::NEAR0MV | PredictionMode::NEAR_NEAR0MV => { |
1290 | if mv_stacks[i].len() > 1 { |
1291 | [mv_stacks[i][1].this_mv, mv_stacks[i][1].comp_mv] |
1292 | } else { |
1293 | [MotionVector::default(); 2] |
1294 | } |
1295 | } |
1296 | PredictionMode::NEAR1MV |
1297 | | PredictionMode::NEAR2MV |
1298 | | PredictionMode::NEAR_NEAR1MV |
1299 | | PredictionMode::NEAR_NEAR2MV => [ |
1300 | mv_stacks[i][luma_mode.ref_mv_idx()].this_mv, |
1301 | mv_stacks[i][luma_mode.ref_mv_idx()].comp_mv, |
1302 | ], |
1303 | PredictionMode::NEAREST_NEWMV => { |
1304 | [mv_stacks[i][0].this_mv, mvs_from_me[i][1]] |
1305 | } |
1306 | PredictionMode::NEW_NEARESTMV => { |
1307 | [mvs_from_me[i][0], mv_stacks[i][0].comp_mv] |
1308 | } |
1309 | PredictionMode::GLOBALMV | PredictionMode::GLOBAL_GLOBALMV => { |
1310 | [MotionVector::default(); 2] |
1311 | } |
1312 | _ => { |
1313 | unimplemented!(); |
1314 | } |
1315 | }; |
1316 | mvs_set.push(mvs); |
1317 | |
1318 | // Calculate SATD for each mode |
1319 | if num_modes_rdo != inter_mode_set.len() { |
1320 | let tile_rect = ts.tile_rect(); |
1321 | let rec = &mut ts.rec.planes[0]; |
1322 | let po = tile_bo.plane_offset(rec.plane_cfg); |
1323 | let mut rec_region = |
1324 | rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 }); |
1325 | |
1326 | luma_mode.predict_inter( |
1327 | fi, |
1328 | tile_rect, |
1329 | 0, |
1330 | po, |
1331 | &mut rec_region, |
1332 | bsize.width(), |
1333 | bsize.height(), |
1334 | ref_frames_set[i], |
1335 | mvs, |
1336 | &mut ts.inter_compound_buffers, |
1337 | ); |
1338 | |
1339 | let plane_org = ts.input_tile.planes[0] |
1340 | .subregion(Area::BlockStartingAt { bo: tile_bo.0 }); |
1341 | let plane_ref = rec_region.as_const(); |
1342 | |
1343 | let satd = get_satd( |
1344 | &plane_org, |
1345 | &plane_ref, |
1346 | bsize.width(), |
1347 | bsize.height(), |
1348 | fi.sequence.bit_depth, |
1349 | fi.cpu_feature_level, |
1350 | ); |
1351 | satds.push(satd); |
1352 | } else { |
1353 | satds.push(0); |
1354 | } |
1355 | }); |
1356 | |
1357 | let mut sorted = |
1358 | izip!(inter_mode_set, mvs_set, satds).collect::<ArrayVec<_, 20>>(); |
1359 | if num_modes_rdo != sorted.len() { |
1360 | sorted.sort_by_key(|((_mode, _i), _mvs, satd)| *satd); |
1361 | } |
1362 | |
1363 | sorted.iter().take(num_modes_rdo).for_each( |
1364 | |&((luma_mode, i), mvs, _satd)| { |
1365 | let mode_set_chroma = ArrayVec::from([luma_mode]); |
1366 | |
1367 | luma_chroma_mode_rdo( |
1368 | luma_mode, |
1369 | fi, |
1370 | bsize, |
1371 | tile_bo, |
1372 | ts, |
1373 | cw, |
1374 | rdo_type, |
1375 | cw_checkpoint, |
1376 | &mut best, |
1377 | mvs, |
1378 | ref_frames_set[i], |
1379 | &mode_set_chroma, |
1380 | false, |
1381 | mode_contexts[i], |
1382 | &mv_stacks[i], |
1383 | AngleDelta::default(), |
1384 | ); |
1385 | }, |
1386 | ); |
1387 | |
1388 | best |
1389 | } |
1390 | |
1391 | #[profiling::function ] |
1392 | fn intra_frame_rdo_mode_decision<T: Pixel>( |
1393 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
1394 | cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, |
1395 | cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType, |
1396 | mut best: PartitionParameters, is_chroma_block: bool, |
1397 | ) -> PartitionParameters { |
1398 | let mut modes = ArrayVec::<_, INTRA_MODES>::new(); |
1399 | |
1400 | // Reduce number of prediction modes at higher speed levels |
1401 | let num_modes_rdo = if (fi.frame_type == FrameType::KEY |
1402 | && fi.config.speed_settings.prediction.prediction_modes |
1403 | >= PredictionModesSetting::ComplexKeyframes) |
1404 | || (fi.frame_type.has_inter() |
1405 | && fi.config.speed_settings.prediction.prediction_modes |
1406 | >= PredictionModesSetting::ComplexAll) |
1407 | { |
1408 | 7 |
1409 | } else { |
1410 | 3 |
1411 | }; |
1412 | |
1413 | let intra_mode_set = RAV1E_INTRA_MODES; |
1414 | |
1415 | // Find mode with lowest rate cost |
1416 | { |
1417 | use crate::ec::cdf_to_pdf; |
1418 | |
1419 | let probs_all = cdf_to_pdf(if fi.frame_type.has_inter() { |
1420 | cw.get_cdf_intra_mode(bsize) |
1421 | } else { |
1422 | cw.get_cdf_intra_mode_kf(tile_bo) |
1423 | }); |
1424 | |
1425 | modes.try_extend_from_slice(intra_mode_set).unwrap(); |
1426 | modes.sort_by_key(|&a| !probs_all[a as usize]); |
1427 | } |
1428 | |
1429 | // If tx partition (i.e. fi.tx_mode_select) is enabled, the below intra prediction screening |
1430 | // may be improved by emulating prediction for each tx block. |
1431 | { |
1432 | let satds = { |
1433 | // FIXME: If tx partition is used, this whole sads block should be fixed |
1434 | let tx_size = bsize.tx_size(); |
1435 | let mut edge_buf = Aligned::uninit_array(); |
1436 | let edge_buf = { |
1437 | let rec = &ts.rec.planes[0].as_const(); |
1438 | let po = tile_bo.plane_offset(rec.plane_cfg); |
1439 | // FIXME: If tx partition is used, get_intra_edges() should be called for each tx block |
1440 | get_intra_edges( |
1441 | &mut edge_buf, |
1442 | rec, |
1443 | tile_bo, |
1444 | 0, |
1445 | 0, |
1446 | bsize, |
1447 | po, |
1448 | tx_size, |
1449 | fi.sequence.bit_depth, |
1450 | None, |
1451 | fi.sequence.enable_intra_edge_filter, |
1452 | IntraParam::None, |
1453 | ) |
1454 | }; |
1455 | |
1456 | let ief_params = if fi.sequence.enable_intra_edge_filter { |
1457 | let above_block_info = ts.above_block_info(tile_bo, 0, 0); |
1458 | let left_block_info = ts.left_block_info(tile_bo, 0, 0); |
1459 | Some(IntraEdgeFilterParameters::new( |
1460 | 0, |
1461 | above_block_info, |
1462 | left_block_info, |
1463 | )) |
1464 | } else { |
1465 | None |
1466 | }; |
1467 | |
1468 | let mut satds_all = [0; INTRA_MODES]; |
1469 | for &luma_mode in modes.iter().skip(num_modes_rdo / 2) { |
1470 | let tile_rect = ts.tile_rect(); |
1471 | let rec = &mut ts.rec.planes[0]; |
1472 | let mut rec_region = |
1473 | rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 }); |
1474 | // FIXME: If tx partition is used, luma_mode.predict_intra() should be called for each tx block |
1475 | luma_mode.predict_intra( |
1476 | tile_rect, |
1477 | &mut rec_region, |
1478 | tx_size, |
1479 | fi.sequence.bit_depth, |
1480 | &[0i16; 2], |
1481 | IntraParam::None, |
1482 | if luma_mode.is_directional() { ief_params } else { None }, |
1483 | &edge_buf, |
1484 | fi.cpu_feature_level, |
1485 | ); |
1486 | |
1487 | let plane_org = ts.input_tile.planes[0] |
1488 | .subregion(Area::BlockStartingAt { bo: tile_bo.0 }); |
1489 | let plane_ref = rec_region.as_const(); |
1490 | |
1491 | satds_all[luma_mode as usize] = get_satd( |
1492 | &plane_org, |
1493 | &plane_ref, |
1494 | tx_size.width(), |
1495 | tx_size.height(), |
1496 | fi.sequence.bit_depth, |
1497 | fi.cpu_feature_level, |
1498 | ); |
1499 | } |
1500 | satds_all |
1501 | }; |
1502 | |
1503 | modes[num_modes_rdo / 2..].sort_by_key(|&a| satds[a as usize]); |
1504 | } |
1505 | |
1506 | debug_assert!(num_modes_rdo >= 1); |
1507 | |
1508 | modes.iter().take(num_modes_rdo).for_each(|&luma_mode| { |
1509 | let mvs = [MotionVector::default(); 2]; |
1510 | let ref_frames = [INTRA_FRAME, NONE_FRAME]; |
1511 | let mut mode_set_chroma = ArrayVec::<_, 2>::new(); |
1512 | mode_set_chroma.push(luma_mode); |
1513 | if is_chroma_block && luma_mode != PredictionMode::DC_PRED { |
1514 | mode_set_chroma.push(PredictionMode::DC_PRED); |
1515 | } |
1516 | luma_chroma_mode_rdo( |
1517 | luma_mode, |
1518 | fi, |
1519 | bsize, |
1520 | tile_bo, |
1521 | ts, |
1522 | cw, |
1523 | rdo_type, |
1524 | cw_checkpoint, |
1525 | &mut best, |
1526 | mvs, |
1527 | ref_frames, |
1528 | &mode_set_chroma, |
1529 | true, |
1530 | 0, |
1531 | &ArrayVec::<CandidateMV, 9>::new(), |
1532 | AngleDelta::default(), |
1533 | ); |
1534 | }); |
1535 | |
1536 | if fi.config.speed_settings.prediction.fine_directional_intra |
1537 | && bsize >= BlockSize::BLOCK_8X8 |
1538 | { |
1539 | // Find the best angle delta for the current best prediction mode |
1540 | let luma_deltas = best.pred_mode_luma.angle_delta_count(); |
1541 | let chroma_deltas = best.pred_mode_chroma.angle_delta_count(); |
1542 | |
1543 | let mvs = [MotionVector::default(); 2]; |
1544 | let ref_frames = [INTRA_FRAME, NONE_FRAME]; |
1545 | let mode_set_chroma = [best.pred_mode_chroma]; |
1546 | let mv_stack = ArrayVec::<_, 9>::new(); |
1547 | let mut best_angle_delta = best.angle_delta; |
1548 | let mut angle_delta_rdo = |y, uv| -> AngleDelta { |
1549 | if best.angle_delta.y != y || best.angle_delta.uv != uv { |
1550 | luma_chroma_mode_rdo( |
1551 | best.pred_mode_luma, |
1552 | fi, |
1553 | bsize, |
1554 | tile_bo, |
1555 | ts, |
1556 | cw, |
1557 | rdo_type, |
1558 | cw_checkpoint, |
1559 | &mut best, |
1560 | mvs, |
1561 | ref_frames, |
1562 | &mode_set_chroma, |
1563 | true, |
1564 | 0, |
1565 | &mv_stack, |
1566 | AngleDelta { y, uv }, |
1567 | ); |
1568 | } |
1569 | best.angle_delta |
1570 | }; |
1571 | |
1572 | for i in 0..luma_deltas { |
1573 | let angle_delta_y = |
1574 | if luma_deltas == 1 { 0 } else { i - MAX_ANGLE_DELTA as i8 }; |
1575 | best_angle_delta = angle_delta_rdo(angle_delta_y, best_angle_delta.uv); |
1576 | } |
1577 | for j in 0..chroma_deltas { |
1578 | let angle_delta_uv = |
1579 | if chroma_deltas == 1 { 0 } else { j - MAX_ANGLE_DELTA as i8 }; |
1580 | best_angle_delta = angle_delta_rdo(best_angle_delta.y, angle_delta_uv); |
1581 | } |
1582 | } |
1583 | |
1584 | best |
1585 | } |
1586 | |
1587 | /// # Panics |
1588 | /// |
1589 | /// - If the block size is invalid for subsampling. |
1590 | #[profiling::function ] |
1591 | pub fn rdo_cfl_alpha<T: Pixel>( |
1592 | ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize, |
1593 | luma_tx_size: TxSize, fi: &FrameInvariants<T>, |
1594 | ) -> Option<CFLParams> { |
1595 | let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; |
1596 | let uv_tx_size = bsize.largest_chroma_tx_size(xdec, ydec); |
1597 | debug_assert!( |
1598 | bsize.subsampled_size(xdec, ydec).unwrap() == uv_tx_size.block_size() |
1599 | ); |
1600 | |
1601 | let frame_bo = ts.to_frame_block_offset(tile_bo); |
1602 | let (visible_tx_w, visible_tx_h) = clip_visible_bsize( |
1603 | (fi.width + xdec) >> xdec, |
1604 | (fi.height + ydec) >> ydec, |
1605 | uv_tx_size.block_size(), |
1606 | (frame_bo.0.x << MI_SIZE_LOG2) >> xdec, |
1607 | (frame_bo.0.y << MI_SIZE_LOG2) >> ydec, |
1608 | ); |
1609 | |
1610 | if visible_tx_w == 0 || visible_tx_h == 0 { |
1611 | return None; |
1612 | }; |
1613 | let mut ac = Aligned::<[MaybeUninit<i16>; 32 * 32]>::uninit_array(); |
1614 | let ac = luma_ac(&mut ac.data, ts, tile_bo, bsize, luma_tx_size, fi); |
1615 | let best_alpha: ArrayVec<i16, 2> = (1..3) |
1616 | .map(|p| { |
1617 | let &PlaneConfig { xdec, ydec, .. } = ts.rec.planes[p].plane_cfg; |
1618 | let tile_rect = ts.tile_rect().decimated(xdec, ydec); |
1619 | let rec = &mut ts.rec.planes[p]; |
1620 | let input = &ts.input_tile.planes[p]; |
1621 | let po = tile_bo.plane_offset(rec.plane_cfg); |
1622 | let mut edge_buf = Aligned::uninit_array(); |
1623 | let edge_buf = get_intra_edges( |
1624 | &mut edge_buf, |
1625 | &rec.as_const(), |
1626 | tile_bo, |
1627 | 0, |
1628 | 0, |
1629 | bsize, |
1630 | po, |
1631 | uv_tx_size, |
1632 | fi.sequence.bit_depth, |
1633 | Some(PredictionMode::UV_CFL_PRED), |
1634 | fi.sequence.enable_intra_edge_filter, |
1635 | IntraParam::None, |
1636 | ); |
1637 | let mut alpha_cost = |alpha: i16| -> u64 { |
1638 | let mut rec_region = |
1639 | rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 }); |
1640 | PredictionMode::UV_CFL_PRED.predict_intra( |
1641 | tile_rect, |
1642 | &mut rec_region, |
1643 | uv_tx_size, |
1644 | fi.sequence.bit_depth, |
1645 | ac, |
1646 | IntraParam::Alpha(alpha), |
1647 | None, |
1648 | &edge_buf, |
1649 | fi.cpu_feature_level, |
1650 | ); |
1651 | sse_wxh( |
1652 | &input.subregion(Area::BlockStartingAt { bo: tile_bo.0 }), |
1653 | &rec_region.as_const(), |
1654 | visible_tx_w, |
1655 | visible_tx_h, |
1656 | |_, _| DistortionScale::default(), // We're not doing RDO here. |
1657 | fi.sequence.bit_depth, |
1658 | fi.cpu_feature_level, |
1659 | ) |
1660 | .0 |
1661 | }; |
1662 | let mut best = (alpha_cost(0), 0); |
1663 | let mut count = 2; |
1664 | for alpha in 1i16..=16i16 { |
1665 | let cost = (alpha_cost(alpha), alpha_cost(-alpha)); |
1666 | if cost.0 < best.0 { |
1667 | best = (cost.0, alpha); |
1668 | count += 2; |
1669 | } |
1670 | if cost.1 < best.0 { |
1671 | best = (cost.1, -alpha); |
1672 | count += 2; |
1673 | } |
1674 | if count < alpha { |
1675 | break; |
1676 | } |
1677 | } |
1678 | best.1 |
1679 | }) |
1680 | .collect(); |
1681 | |
1682 | if best_alpha[0] == 0 && best_alpha[1] == 0 { |
1683 | None |
1684 | } else { |
1685 | Some(CFLParams::from_alpha(best_alpha[0], best_alpha[1])) |
1686 | } |
1687 | } |
1688 | |
1689 | /// RDO-based transform type decision |
1690 | /// If `cw_checkpoint` is `None`, a checkpoint for cw's (`ContextWriter`) current |
1691 | /// state is created and stored for later use. |
1692 | /// |
1693 | /// # Panics |
1694 | /// |
1695 | /// - If a writer checkpoint is never created before or within the function. |
1696 | /// This should never happen and indicates a development error. |
1697 | /// - If the best RD found is negative. |
1698 | /// This should never happen and indicates a development error. |
1699 | pub fn rdo_tx_type_decision<T: Pixel>( |
1700 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
1701 | cw: &mut ContextWriter, cw_checkpoint: &mut Option<ContextWriterCheckpoint>, |
1702 | mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], |
1703 | bsize: BlockSize, tile_bo: TileBlockOffset, tx_size: TxSize, tx_set: TxSet, |
1704 | tx_types: &[TxType], cur_best_rd: f64, |
1705 | ) -> (TxType, f64) { |
1706 | let mut best_type = TxType::DCT_DCT; |
1707 | let mut best_rd = std::f64::MAX; |
1708 | |
1709 | let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; |
1710 | let is_chroma_block = |
1711 | has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling); |
1712 | |
1713 | let is_inter = !mode.is_intra(); |
1714 | |
1715 | if cw_checkpoint.is_none() { |
    // Only run on the first call.
    // This prevents creating multiple checkpoints for our own copy of cw.
1718 | *cw_checkpoint = |
1719 | Some(cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling)); |
1720 | } |
1721 | |
1722 | let rdo_type = if fi.use_tx_domain_distortion { |
1723 | RDOType::TxDistRealRate |
1724 | } else { |
1725 | RDOType::PixelDistRealRate |
1726 | }; |
1727 | let need_recon_pixel = tx_size.block_size() != bsize && !is_inter; |
1728 | |
1729 | let mut first_iteration = true; |
1730 | for &tx_type in tx_types { |
1731 | // Skip unsupported transform types |
1732 | if av1_tx_used[tx_set as usize][tx_type as usize] == 0 { |
1733 | continue; |
1734 | } |
1735 | |
1736 | if is_inter { |
1737 | motion_compensate( |
1738 | fi, ts, cw, mode, ref_frames, mvs, bsize, tile_bo, true, |
1739 | ); |
1740 | } |
1741 | |
1742 | let mut wr = WriterCounter::new(); |
1743 | let tell = wr.tell_frac(); |
1744 | let (_, tx_dist) = if is_inter { |
1745 | write_tx_tree( |
1746 | fi, |
1747 | ts, |
1748 | cw, |
1749 | &mut wr, |
1750 | mode, |
1751 | 0, |
1752 | tile_bo, |
1753 | bsize, |
1754 | tx_size, |
1755 | tx_type, |
1756 | false, |
1757 | true, |
1758 | rdo_type, |
1759 | need_recon_pixel, |
1760 | ) |
1761 | } else { |
1762 | write_tx_blocks( |
1763 | fi, |
1764 | ts, |
1765 | cw, |
1766 | &mut wr, |
1767 | mode, |
1768 | mode, |
1769 | AngleDelta::default(), |
1770 | tile_bo, |
1771 | bsize, |
1772 | tx_size, |
1773 | tx_type, |
1774 | false, |
1775 | CFLParams::default(), // Unused. |
1776 | true, |
1777 | rdo_type, |
1778 | need_recon_pixel, |
1779 | ) |
1780 | }; |
1781 | |
1782 | let rate = wr.tell_frac() - tell; |
1783 | let distortion = if fi.use_tx_domain_distortion { |
1784 | compute_tx_distortion( |
1785 | fi, |
1786 | ts, |
1787 | bsize, |
1788 | is_chroma_block, |
1789 | tile_bo, |
1790 | tx_dist, |
1791 | false, |
1792 | true, |
1793 | ) |
1794 | } else { |
1795 | compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, true) |
1796 | }; |
1797 | cw.rollback(cw_checkpoint.as_ref().unwrap()); |
1798 | |
1799 | let rd = compute_rd_cost(fi, rate, distortion); |
1800 | |
1801 | if first_iteration { |
      // We use an optimization to exit early after testing the first
      // transform type if its cost is already higher than the existing best.
      // The idea is that if this transform size is not better than the
      // previous size, it is not worth testing the remaining types for it.
1806 | if rd > cur_best_rd { |
1807 | break; |
1808 | } |
1809 | first_iteration = false; |
1810 | } |
1811 | |
1812 | if rd < best_rd { |
1813 | best_rd = rd; |
1814 | best_type = tx_type; |
1815 | } |
1816 | } |
1817 | |
1818 | assert!(best_rd >= 0_f64); |
1819 | |
1820 | (best_type, best_rd) |
1821 | } |
1822 | |
1823 | pub fn get_sub_partitions( |
1824 | four_partitions: &[TileBlockOffset; 4], partition: PartitionType, |
1825 | ) -> ArrayVec<TileBlockOffset, 4> { |
  let mut partition_offsets = ArrayVec::<TileBlockOffset, 4>::new();
1827 | |
  partition_offsets.push(four_partitions[0]);
1829 | |
1830 | if partition == PARTITION_NONE { |
1831 | return partition_offsets; |
1832 | } |
1833 | if partition == PARTITION_VERT || partition == PARTITION_SPLIT { |
    partition_offsets.push(four_partitions[1]);
1835 | }; |
1836 | if partition == PARTITION_HORZ || partition == PARTITION_SPLIT { |
    partition_offsets.push(four_partitions[2]);
1838 | }; |
1839 | if partition == PARTITION_SPLIT { |
    partition_offsets.push(four_partitions[3]);
1841 | }; |
1842 | |
1843 | partition_offsets |
1844 | } |
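
// A minimal sanity-check sketch (illustrative only; the offsets below are
// arbitrary example values): PARTITION_NONE keeps just the top-left offset,
// VERT/HORZ keep two offsets, and PARTITION_SPLIT returns all four quadrants.
#[test]
fn get_sub_partitions_count_test() {
  let four_partitions = [
    TileBlockOffset(BlockOffset { x: 0, y: 0 }),
    TileBlockOffset(BlockOffset { x: 4, y: 0 }),
    TileBlockOffset(BlockOffset { x: 0, y: 4 }),
    TileBlockOffset(BlockOffset { x: 4, y: 4 }),
  ];
  assert_eq!(get_sub_partitions(&four_partitions, PARTITION_NONE).len(), 1);
  assert_eq!(get_sub_partitions(&four_partitions, PARTITION_VERT).len(), 2);
  assert_eq!(get_sub_partitions(&four_partitions, PARTITION_HORZ).len(), 2);
  assert_eq!(get_sub_partitions(&four_partitions, PARTITION_SPLIT).len(), 4);
}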
1845 | |
#[inline(always)]
1847 | fn rdo_partition_none<T: Pixel>( |
1848 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
1849 | cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, |
1850 | inter_cfg: &InterConfig, child_modes: &mut ArrayVec<PartitionParameters, 4>, |
1851 | ) -> f64 { |
1852 | debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height); |
1853 | |
  let mode = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg);
  let cost = mode.rd_cost;
1856 | |
  child_modes.push(mode);
1858 | |
1859 | cost |
1860 | } |
1861 | |
1862 | // VERTICAL, HORIZONTAL or simple SPLIT |
#[inline(always)]
1864 | fn rdo_partition_simple<T: Pixel, W: Writer>( |
1865 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
1866 | cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, |
1867 | bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, |
1868 | partition: PartitionType, rdo_type: RDOType, best_rd: f64, |
1869 | child_modes: &mut ArrayVec<PartitionParameters, 4>, |
1870 | ) -> Option<f64> { |
1871 | debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height); |
1872 | let subsize = bsize.subsize(partition).unwrap(); |
1873 | |
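  // The partition symbol itself is only coded for blocks of 8x8 and larger,
  // so charge its rate here (with zero distortion) in that case.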
1874 | let cost = if bsize >= BlockSize::BLOCK_8X8 { |
1875 | let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef }; |
1876 | let tell = w.tell_frac(); |
1877 | cw.write_partition(w, tile_bo, partition, bsize); |
1878 | compute_rd_cost(fi, w.tell_frac() - tell, ScaledDistortion::zero()) |
1879 | } else { |
1880 | 0.0 |
1881 | }; |
1882 | |
1883 | let hbsw = subsize.width_mi(); // Half the block size width in blocks |
1884 | let hbsh = subsize.height_mi(); // Half the block size height in blocks |
1885 | let four_partitions = [ |
1886 | tile_bo, |
1887 | TileBlockOffset(BlockOffset { x: tile_bo.0.x + hbsw, y: tile_bo.0.y }), |
1888 | TileBlockOffset(BlockOffset { x: tile_bo.0.x, y: tile_bo.0.y + hbsh }), |
1889 | TileBlockOffset(BlockOffset { |
1890 | x: tile_bo.0.x + hbsw, |
1891 | y: tile_bo.0.y + hbsh, |
1892 | }), |
1893 | ]; |
1894 | |
1895 | let partitions = get_sub_partitions(&four_partitions, partition); |
1896 | |
1897 | let mut rd_cost_sum = 0.0; |
1898 | |
1899 | for offset in partitions { |
1900 | let hbs = subsize.width_mi() >> 1; |
1901 | let has_cols = offset.0.x + hbs < ts.mi_width; |
1902 | let has_rows = offset.0.y + hbs < ts.mi_height; |
1903 | |
1904 | if has_cols && has_rows { |
1905 | let mode_decision = |
1906 | rdo_mode_decision(fi, ts, cw, subsize, offset, inter_cfg); |
1907 | |
1908 | rd_cost_sum += mode_decision.rd_cost; |
1909 | |
1910 | if fi.enable_early_exit && rd_cost_sum > best_rd { |
1911 | return None; |
1912 | } |
1913 | if subsize >= BlockSize::BLOCK_8X8 && subsize.is_sqr() { |
1914 | let w: &mut W = |
1915 | if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef }; |
1916 | cw.write_partition(w, offset, PartitionType::PARTITION_NONE, subsize); |
1917 | } |
1918 | encode_block_with_modes( |
1919 | fi, |
1920 | ts, |
1921 | cw, |
1922 | w_pre_cdef, |
1923 | w_post_cdef, |
1924 | subsize, |
1925 | offset, |
1926 | &mode_decision, |
1927 | rdo_type, |
1928 | None, |
1929 | ); |
1930 | child_modes.push(mode_decision); |
1931 | } else { |
1932 | //rd_cost_sum += std::f64::MAX; |
1933 | return None; |
1934 | } |
1935 | } |
1936 | |
1937 | Some(cost + rd_cost_sum) |
1938 | } |
1939 | |
1940 | /// RDO-based single level partitioning decision |
1941 | /// |
1942 | /// # Panics |
1943 | /// |
1944 | /// - If the best RD found is negative. |
1945 | /// This should never happen, and indicates a development error. |
#[profiling::function]
1947 | pub fn rdo_partition_decision<T: Pixel, W: Writer>( |
1948 | fi: &FrameInvariants<T>, ts: &mut TileStateMut<'_, T>, |
1949 | cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, |
1950 | bsize: BlockSize, tile_bo: TileBlockOffset, |
1951 | cached_block: &PartitionGroupParameters, partition_types: &[PartitionType], |
1952 | rdo_type: RDOType, inter_cfg: &InterConfig, |
1953 | ) -> PartitionGroupParameters { |
1954 | let mut best_partition = cached_block.part_type; |
1955 | let mut best_rd = cached_block.rd_cost; |
1956 | let mut best_pred_modes = cached_block.part_modes.clone(); |
1957 | |
1958 | let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); |
1959 | let w_pre_checkpoint = w_pre_cdef.checkpoint(); |
1960 | let w_post_checkpoint = w_post_cdef.checkpoint(); |
1961 | |
1962 | for &partition in partition_types { |
1963 | // Do not re-encode results we already have |
1964 | if partition == cached_block.part_type { |
1965 | continue; |
1966 | } |
1967 | |
1968 | let mut child_modes = ArrayVec::<_, 4>::new(); |
1969 | |
1970 | let cost = match partition { |
1971 | PARTITION_NONE if bsize <= BlockSize::BLOCK_64X64 => { |
1972 | Some(rdo_partition_none( |
1973 | fi, |
1974 | ts, |
1975 | cw, |
1976 | bsize, |
1977 | tile_bo, |
1978 | inter_cfg, |
1979 | &mut child_modes, |
1980 | )) |
1981 | } |
1982 | PARTITION_SPLIT | PARTITION_HORZ | PARTITION_VERT => { |
1983 | rdo_partition_simple( |
1984 | fi, |
1985 | ts, |
1986 | cw, |
1987 | w_pre_cdef, |
1988 | w_post_cdef, |
1989 | bsize, |
1990 | tile_bo, |
1991 | inter_cfg, |
1992 | partition, |
1993 | rdo_type, |
1994 | best_rd, |
1995 | &mut child_modes, |
1996 | ) |
1997 | } |
1998 | _ => { |
1999 | unreachable!(); |
2000 | } |
2001 | }; |
2002 | |
2003 | if let Some(rd) = cost { |
2004 | if rd < best_rd { |
2005 | best_rd = rd; |
2006 | best_partition = partition; |
2007 | best_pred_modes = child_modes.clone(); |
2008 | } |
2009 | } |
2010 | cw.rollback(&cw_checkpoint); |
2011 | w_pre_cdef.rollback(&w_pre_checkpoint); |
2012 | w_post_cdef.rollback(&w_post_checkpoint); |
2013 | } |
2014 | |
2015 | assert!(best_rd >= 0_f64); |
2016 | |
2017 | PartitionGroupParameters { |
2018 | rd_cost: best_rd, |
2019 | part_type: best_partition, |
2020 | part_modes: best_pred_modes, |
2021 | } |
2022 | } |
2023 | |
#[profiling::function]
2025 | fn rdo_loop_plane_error<T: Pixel>( |
2026 | base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset, |
2027 | sb_w: usize, sb_h: usize, fi: &FrameInvariants<T>, ts: &TileStateMut<'_, T>, |
2028 | blocks: &TileBlocks<'_>, test: &Frame<T>, src: &Tile<'_, T>, pli: usize, |
2029 | ) -> ScaledDistortion { |
2030 | let sb_w_blocks = |
2031 | if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_w; |
2032 | let sb_h_blocks = |
2033 | if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_h; |
  // Each direction block is 8x8 in luma, potentially smaller if subsampled
  // in chroma; accumulate only over in-frame, unpadded pixels.
2036 | let mut err = Distortion::zero(); |
2037 | for by in 0..sb_h_blocks { |
2038 | for bx in 0..sb_w_blocks { |
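      // `block_offset` addresses 4x4 MI units, so step by 2 to advance one
      // 8x8 error block at a time.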
2039 | let loop_bo = offset_sbo.block_offset(bx << 1, by << 1); |
2040 | if loop_bo.0.x < blocks.cols() && loop_bo.0.y < blocks.rows() { |
2041 | let src_plane = &src.planes[pli]; |
2042 | let test_plane = &test.planes[pli]; |
2043 | let PlaneConfig { xdec, ydec, .. } = *src_plane.plane_cfg; |
2044 | debug_assert_eq!(xdec, test_plane.cfg.xdec); |
2045 | debug_assert_eq!(ydec, test_plane.cfg.ydec); |
2046 | |
2047 | // Unfortunately, our distortion biases are only available via |
2048 | // Frame-absolute addressing, so we need a block offset |
2049 | // relative to the full frame origin (not the tile or analysis |
2050 | // area) |
2051 | let frame_bo = (base_sbo + offset_sbo).block_offset(bx << 1, by << 1); |
2052 | let bias = distortion_scale( |
2053 | fi, |
2054 | ts.to_frame_block_offset(frame_bo), |
2055 | BlockSize::BLOCK_8X8, |
2056 | ); |
2057 | |
2058 | let src_region = |
2059 | src_plane.subregion(Area::BlockStartingAt { bo: loop_bo.0 }); |
2060 | let test_region = |
2061 | test_plane.region(Area::BlockStartingAt { bo: loop_bo.0 }); |
2062 | |
2063 | err += if pli == 0 { |
        // For loop filters, we intentionally use cdef_dist even with
2065 | // `--tune Psnr`. Using SSE instead gives no PSNR gain but has a |
2066 | // significant negative impact on other metrics and visual quality. |
2067 | RawDistortion(cdef_dist_kernel( |
2068 | &src_region, |
2069 | &test_region, |
2070 | 8, |
2071 | 8, |
2072 | fi.sequence.bit_depth, |
2073 | fi.cpu_feature_level, |
2074 | ) as u64) |
2075 | * bias |
2076 | } else { |
2077 | sse_wxh( |
2078 | &src_region, |
2079 | &test_region, |
2080 | 8 >> xdec, |
2081 | 8 >> ydec, |
2082 | |_, _| bias, |
2083 | fi.sequence.bit_depth, |
2084 | fi.cpu_feature_level, |
2085 | ) |
2086 | }; |
2087 | } |
2088 | } |
2089 | } |
2090 | err * fi.dist_scale[pli] |
2091 | } |
2092 | |
2093 | /// Passed in a superblock offset representing the upper left corner of |
2094 | /// the LRU area we're optimizing. This area covers the largest LRU in |
2095 | /// any of the present planes, but may consist of a number of |
2096 | /// superblocks and full, smaller LRUs in the other planes |
2097 | /// |
2098 | /// # Panics |
2099 | /// |
2100 | /// - If both CDEF and LRF are disabled. |
#[profiling::function]
2102 | pub fn rdo_loop_decision<T: Pixel, W: Writer>( |
2103 | base_sbo: TileSuperBlockOffset, fi: &FrameInvariants<T>, |
2104 | ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W, |
2105 | deblock_p: bool, |
2106 | ) { |
2107 | let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 { |
2108 | 1 |
2109 | } else { |
2110 | MAX_PLANES |
2111 | }; |
2112 | assert!(fi.sequence.enable_cdef || fi.sequence.enable_restoration); |
2113 | // Determine area of optimization: Which plane has the largest LRUs? |
2114 | // How many LRUs for each? |
2115 | let mut sb_w = 1; // how many superblocks wide the largest LRU |
2116 | // is/how many SBs we're processing (same thing) |
  let mut sb_h = 1; // how many superblocks tall the largest LRU
  // is/how many SBs we're processing (same thing)
2119 | let mut lru_w = [0; MAX_PLANES]; // how many LRUs we're processing |
2120 | let mut lru_h = [0; MAX_PLANES]; // how many LRUs we're processing |
2121 | for pli in 0..planes { |
2122 | let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift; |
2123 | let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift; |
2124 | if sb_w < (1 << sb_h_shift) { |
2125 | sb_w = 1 << sb_h_shift; |
2126 | } |
2127 | if sb_h < (1 << sb_v_shift) { |
2128 | sb_h = 1 << sb_v_shift; |
2129 | } |
2130 | } |
2131 | for pli in 0..planes { |
2132 | let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift; |
2133 | let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift; |
2134 | lru_w[pli] = sb_w / (1 << sb_h_shift); |
2135 | lru_h[pli] = sb_h / (1 << sb_v_shift); |
2136 | } |
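  // For example: if the luma LRUs span 2x2 superblocks (sb_h_shift ==
  // sb_v_shift == 1) while the chroma LRUs span a single superblock, then
  // sb_w == sb_h == 2 and the analysis area holds one luma LRU but a 2x2
  // grid of chroma LRUs.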
2137 | |
2138 | // The superblock width/height determinations may be calling for us |
2139 | // to compute over superblocks that do not actually exist in the |
2140 | // frame (off the right or lower edge). Trim sb width/height down |
2141 | // to actual superblocks. Note that these last superblocks on the |
2142 | // right/bottom may themselves still span the edge of the frame, but |
2143 | // they do hold at least some visible pixels. |
2144 | sb_w = sb_w.min(ts.sb_width - base_sbo.0.x); |
2145 | sb_h = sb_h.min(ts.sb_height - base_sbo.0.y); |
2146 | |
  // We also need to know the Y visible pixel limits (the
2148 | // sb_w/sb_h figures above can be used to determine how many |
2149 | // allocated pixels, possibly beyond the visible frame, exist). |
2150 | let crop_w = |
2151 | fi.width - ((ts.sbo.0.x + base_sbo.0.x) << SUPERBLOCK_TO_PLANE_SHIFT); |
2152 | let crop_h = |
2153 | fi.height - ((ts.sbo.0.y + base_sbo.0.y) << SUPERBLOCK_TO_PLANE_SHIFT); |
2154 | let pixel_w = crop_w.min(sb_w << SUPERBLOCK_TO_PLANE_SHIFT); |
2155 | let pixel_h = crop_h.min(sb_h << SUPERBLOCK_TO_PLANE_SHIFT); |
2156 | |
2157 | // Based on `RestorationState::new` |
2158 | const MAX_SB_SHIFT: usize = 4; |
2159 | const MAX_SB_SIZE: usize = 1 << MAX_SB_SHIFT; |
2160 | const MAX_LRU_SIZE: usize = MAX_SB_SIZE; |
2161 | |
2162 | // Static allocation relies on the "minimal LRU area for all N planes" invariant. |
2163 | let mut best_index = [-1; MAX_SB_SIZE * MAX_SB_SIZE]; |
2164 | let mut best_lrf = |
2165 | [[RestorationFilter::None; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE]; |
2166 | |
  // Due to imprecision in the reconstruction parameter solver, we
  // need to make sure we don't fall into a limit cycle. Track our
  // best LRF cost so that we can break if a new solution does not
  // improve at the reconstruction stage.
2171 | let mut best_lrf_cost = [[-1.0; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE]; |
2172 | |
  // Subsetted region of the TileBlocks for our working frame area.
2174 | // Note that the size of this subset is what signals CDEF as to the |
2175 | // actual coded size. |
2176 | let mut tileblocks_subset = cw.bc.blocks.subregion_mut( |
2177 | base_sbo.block_offset(0, 0).0.x, |
2178 | base_sbo.block_offset(0, 0).0.y, |
2179 | sb_w << SUPERBLOCK_TO_BLOCK_SHIFT, |
2180 | sb_h << SUPERBLOCK_TO_BLOCK_SHIFT, |
2181 | ); |
2182 | |
  // CDEF doesn't run on superblocks that are completely skipped.
  // Determine which superblocks are marked as skipped so we can avoid running
  // them. If all blocks are skipped, we can avoid some of the overhead of
  // setting up for CDEF.
2187 | let mut cdef_skip = [true; MAX_SB_SIZE * MAX_SB_SIZE]; |
2188 | let mut cdef_skip_all = true; |
2189 | if fi.sequence.enable_cdef { |
2190 | for sby in 0..sb_h { |
2191 | for sbx in 0..sb_w { |
2192 | let blocks = tileblocks_subset.subregion(16 * sbx, 16 * sby, 16, 16); |
2193 | let mut skip = true; |
2194 | for y in 0..blocks.rows() { |
2195 | for block in blocks[y].iter() { |
2196 | skip &= block.skip; |
2197 | } |
2198 | } |
2199 | cdef_skip[sby * MAX_SB_SIZE + sbx] = skip; |
2200 | cdef_skip_all &= skip; |
2201 | } |
2202 | } |
2203 | } |
2204 | |
  // Unlike CDEF, loop restoration runs regardless of whether blocks are
  // skipped or not. At the same time, the most significant improvement will
  // generally come from un-skipped blocks, so LRU optimization is only
  // performed if there are un-skipped blocks.
  // This should be the same as `cdef_skip_all`, except when CDEF is disabled.
2210 | let mut lru_skip_all = true; |
2211 | let mut lru_skip = [[true; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE]; |
2212 | if fi.sequence.enable_restoration { |
2213 | if fi.config.speed_settings.lru_on_skip { |
2214 | lru_skip_all = false; |
2215 | lru_skip = [[false; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE]; |
2216 | } else { |
2217 | for pli in 0..planes { |
2218 | // width, in sb, of an LRU in this plane |
2219 | let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift; |
2220 | // height, in sb, of an LRU in this plane |
2221 | let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift; |
2222 | for lru_y in 0..lru_h[pli] { |
2223 | // number of LRUs vertically |
2224 | for lru_x in 0..lru_w[pli] { |
2225 | // number of LRUs horizontally |
2226 | |
2227 | let loop_sbo = TileSuperBlockOffset(SuperBlockOffset { |
2228 | x: lru_x * lru_sb_w, |
2229 | y: lru_y * lru_sb_h, |
2230 | }); |
2231 | |
2232 | if !ts.restoration.has_restoration_unit( |
2233 | base_sbo + loop_sbo, |
2234 | pli, |
2235 | false, |
2236 | ) { |
2237 | continue; |
2238 | } |
2239 | |
2240 | let start = loop_sbo.block_offset(0, 0).0; |
2241 | let size = TileSuperBlockOffset(SuperBlockOffset { |
2242 | x: lru_sb_w, |
2243 | y: lru_sb_h, |
2244 | }) |
2245 | .block_offset(0, 0) |
2246 | .0; |
2247 | |
2248 | let blocks = |
2249 | tileblocks_subset.subregion(start.x, start.y, size.x, size.y); |
2250 | let mut skip = true; |
2251 | for y in 0..blocks.rows() { |
2252 | for block in blocks[y].iter() { |
2253 | skip &= block.skip; |
2254 | } |
2255 | } |
2256 | lru_skip[lru_y * MAX_LRU_SIZE + lru_x][pli] = skip; |
2257 | lru_skip_all &= skip; |
2258 | } |
2259 | } |
2260 | } |
2261 | } |
2262 | } |
2263 | |
2264 | // Return early if all blocks are skipped for lru and cdef. |
2265 | if lru_skip_all && cdef_skip_all { |
2266 | return; |
2267 | } |
2268 | |
2269 | // Loop filter RDO is an iterative process and we need temporary |
2270 | // scratch data to hold the results of deblocking, cdef, and the |
2271 | // loop reconstruction filter so that each can be partially updated |
2272 | // without recomputing the entire stack. Construct |
2273 | // largest-LRU-sized frames for each, accounting for padding |
2274 | // required by deblocking, cdef and [optionally] LR. |
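  // The `(pixel_w + 7) >> 3 << 3` expressions below round the visible area
  // up to a whole number of 8x8 blocks.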
2275 | let mut rec_subset = ts |
2276 | .rec |
2277 | .subregion(Area::BlockRect { |
2278 | bo: base_sbo.block_offset(0, 0).0, |
2279 | width: (pixel_w + 7) >> 3 << 3, |
2280 | height: (pixel_h + 7) >> 3 << 3, |
2281 | }) |
2282 | .scratch_copy(); |
2283 | |
2284 | // const, no need to copy, just need the subregion (but do zero the |
2285 | // origin to match the other copies/new backing frames). |
2286 | let src_subset = ts |
2287 | .input_tile |
2288 | .subregion(Area::BlockRect { |
2289 | bo: base_sbo.block_offset(0, 0).0, |
2290 | width: (pixel_w + 7) >> 3 << 3, |
2291 | height: (pixel_h + 7) >> 3 << 3, |
2292 | }) |
2293 | .home(); |
2294 | |
2295 | if deblock_p { |
2296 | // Find a good deblocking filter solution for the passed in area. |
2297 | // This is not RDO of deblocking itself, merely a solution to get |
2298 | // better results from CDEF/LRF RDO. |
2299 | let deblock_levels = deblock_filter_optimize( |
2300 | fi, |
2301 | &rec_subset.as_tile(), |
2302 | &src_subset, |
2303 | &tileblocks_subset.as_const(), |
2304 | crop_w, |
2305 | crop_h, |
2306 | ); |
2307 | |
2308 | // Deblock the contents of our reconstruction copy. |
2309 | if deblock_levels[0] != 0 || deblock_levels[1] != 0 { |
2310 | // copy ts.deblock because we need to set some of our own values here |
2311 | let mut deblock_copy = *ts.deblock; |
2312 | deblock_copy.levels = deblock_levels; |
2313 | |
2314 | // finally, deblock the temp frame |
2315 | deblock_filter_frame( |
2316 | &deblock_copy, |
2317 | &mut rec_subset.as_tile_mut(), |
2318 | &tileblocks_subset.as_const(), |
2319 | crop_w, |
2320 | crop_h, |
2321 | fi.sequence.bit_depth, |
2322 | planes, |
2323 | ); |
2324 | } |
2325 | } |
2326 | |
2327 | let mut cdef_work = |
2328 | if !cdef_skip_all { Some(rec_subset.clone()) } else { None }; |
2329 | let mut lrf_work = if !lru_skip_all { |
2330 | Some(Frame { |
2331 | planes: { |
2332 | let new_plane = |pli: usize| { |
2333 | let PlaneConfig { xdec, ydec, width, height, .. } = |
2334 | rec_subset.planes[pli].cfg; |
2335 | Plane::new(width, height, xdec, ydec, 0, 0) |
2336 | }; |
2337 | [new_plane(0), new_plane(1), new_plane(2)] |
2338 | }, |
2339 | }) |
2340 | } else { |
2341 | None |
2342 | }; |
2343 | |
2344 | // Precompute directional analysis for CDEF |
2345 | let cdef_data = { |
2346 | if cdef_work.is_some() { |
2347 | Some(( |
2348 | &rec_subset, |
2349 | cdef_analyze_superblock_range( |
2350 | fi, |
2351 | &rec_subset, |
2352 | &tileblocks_subset.as_const(), |
2353 | sb_w, |
2354 | sb_h, |
2355 | ), |
2356 | )) |
2357 | } else { |
2358 | None |
2359 | } |
2360 | }; |
2361 | |
2362 | // CDEF/LRF decision iteration |
2363 | // Start with a default of CDEF 0 and RestorationFilter::None |
2364 | // Try all CDEF options for each sb with current LRF; if new CDEF+LRF choice is better, select it. |
2365 | // Then try all LRF options with current CDEFs; if new CDEFs+LRF choice is better, select it. |
2366 | // If LRF choice changed for any plane, repeat until no changes |
2367 | // Limit iterations and where we break based on speed setting (in the TODO list ;-) |
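  // The two flags below drive that iteration: each pass re-runs the CDEF
  // search against the current LRF choices, stops once no CDEF index changes,
  // and otherwise re-solves the LRF parameters against the updated CDEF
  // output before trying again.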
2368 | let mut cdef_change = true; |
2369 | let mut lrf_change = true; |
2370 | while cdef_change || lrf_change { |
2371 | // search for improved cdef indices, superblock by superblock, if cdef is enabled. |
2372 | if let (Some((rec_copy, cdef_dirs)), Some(cdef_ref)) = |
2373 | (&cdef_data, &mut cdef_work.as_mut()) |
2374 | { |
2375 | for sby in 0..sb_h { |
2376 | for sbx in 0..sb_w { |
2377 | // determine whether this superblock can be skipped |
2378 | if cdef_skip[sby * MAX_SB_SIZE + sbx] { |
2379 | continue; |
2380 | } |
2381 | |
2382 | let prev_best_index = best_index[sby * sb_w + sbx]; |
2383 | let mut best_cost = -1.; |
2384 | let mut best_new_index = -1i8; |
2385 | |
2386 | /* offset of the superblock we're currently testing within the larger |
2387 | analysis area */ |
2388 | let loop_sbo = |
2389 | TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby }); |
2390 | |
2391 | /* cdef index testing loop */ |
2392 | for cdef_index in 0..(1 << fi.cdef_bits) { |
2393 | let mut err = ScaledDistortion::zero(); |
2394 | let mut rate = 0; |
2395 | |
2396 | cdef_filter_superblock( |
2397 | fi, |
2398 | &rec_subset, |
2399 | &mut cdef_ref.as_tile_mut(), |
2400 | &tileblocks_subset.as_const(), |
2401 | loop_sbo, |
2402 | cdef_index, |
2403 | &cdef_dirs[sby * sb_w + sbx], |
2404 | ); |
2405 | // apply LRF if any |
2406 | for pli in 0..planes { |
2407 | // We need the cropped-to-visible-frame area of this SB |
2408 | let wh = |
2409 | if fi.sequence.use_128x128_superblock { 128 } else { 64 }; |
2410 | let PlaneConfig { xdec, ydec, .. } = cdef_ref.planes[pli].cfg; |
2411 | let vis_width = (wh >> xdec).min( |
2412 | (crop_w >> xdec) |
2413 | - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).x |
2414 | as usize, |
2415 | ); |
2416 | let vis_height = (wh >> ydec).min( |
2417 | (crop_h >> ydec) |
2418 | - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).y |
2419 | as usize, |
2420 | ); |
2421 | // which LRU are we currently testing against? |
2422 | if let (Some((lru_x, lru_y)), Some(lrf_ref)) = { |
2423 | let rp = &ts.restoration.planes[pli]; |
2424 | ( |
2425 | rp.restoration_unit_offset(base_sbo, loop_sbo, false), |
2426 | &mut lrf_work, |
2427 | ) |
2428 | } { |
2429 | // We have a valid LRU, apply LRF, compute error |
2430 | match best_lrf[lru_y * lru_w[pli] + lru_x][pli] { |
2431 | RestorationFilter::None {} => { |
2432 | err += rdo_loop_plane_error( |
2433 | base_sbo, |
2434 | loop_sbo, |
2435 | 1, |
2436 | 1, |
2437 | fi, |
2438 | ts, |
2439 | &tileblocks_subset.as_const(), |
2440 | cdef_ref, |
2441 | &src_subset, |
2442 | pli, |
2443 | ); |
2444 | rate += if fi.sequence.enable_restoration { |
2445 | cw.fc.count_lrf_switchable( |
2446 | w, |
2447 | &ts.restoration.as_const(), |
2448 | best_lrf[lru_y * lru_w[pli] + lru_x][pli], |
2449 | pli, |
2450 | ) |
2451 | } else { |
                      0 // no relative cost differences between
                        // CDEF params. If CDEF is on, it's a wash.
2454 | }; |
2455 | } |
2456 | RestorationFilter::Sgrproj { set, xqd } => { |
2457 | // only run on this single superblock |
2458 | let loop_po = |
2459 | loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg); |
2460 | // todo: experiment with borrowing border pixels |
2461 | // rather than edge-extending. Right now this is |
2462 | // hard-clipping to the superblock boundary. |
2463 | setup_integral_image( |
2464 | &mut ts.integral_buffer, |
2465 | SOLVE_IMAGE_STRIDE, |
2466 | vis_width, |
2467 | vis_height, |
2468 | vis_width, |
2469 | vis_height, |
2470 | &cdef_ref.planes[pli].slice(loop_po), |
2471 | &cdef_ref.planes[pli].slice(loop_po), |
2472 | ); |
2473 | sgrproj_stripe_filter( |
2474 | set, |
2475 | xqd, |
2476 | fi, |
2477 | &ts.integral_buffer, |
2478 | SOLVE_IMAGE_STRIDE, |
2479 | &cdef_ref.planes[pli].slice(loop_po), |
2480 | &mut lrf_ref.planes[pli].region_mut(Area::Rect { |
2481 | x: loop_po.x, |
2482 | y: loop_po.y, |
2483 | width: vis_width, |
2484 | height: vis_height, |
2485 | }), |
2486 | ); |
2487 | err += rdo_loop_plane_error( |
2488 | base_sbo, |
2489 | loop_sbo, |
2490 | 1, |
2491 | 1, |
2492 | fi, |
2493 | ts, |
2494 | &tileblocks_subset.as_const(), |
2495 | lrf_ref, |
2496 | &src_subset, |
2497 | pli, |
2498 | ); |
2499 | rate += cw.fc.count_lrf_switchable( |
2500 | w, |
2501 | &ts.restoration.as_const(), |
2502 | best_lrf[lru_y * lru_w[pli] + lru_x][pli], |
2503 | pli, |
2504 | ); |
2505 | } |
2506 | RestorationFilter::Wiener { .. } => unreachable!(), // coming soon |
2507 | } |
2508 | } else { |
2509 | // No actual LRU here, compute error directly from CDEF output. |
2510 | err += rdo_loop_plane_error( |
2511 | base_sbo, |
2512 | loop_sbo, |
2513 | 1, |
2514 | 1, |
2515 | fi, |
2516 | ts, |
2517 | &tileblocks_subset.as_const(), |
2518 | cdef_ref, |
2519 | &src_subset, |
2520 | pli, |
2521 | ); |
                    // no relative cost differences between
                    // CDEF params. If CDEF is on, it's a wash.
2524 | // rate += 0; |
2525 | } |
2526 | } |
2527 | |
2528 | let cost = compute_rd_cost(fi, rate, err); |
2529 | if best_cost < 0. || cost < best_cost { |
2530 | best_cost = cost; |
2531 | best_new_index = cdef_index as i8; |
2532 | } |
2533 | } |
2534 | |
2535 | // Did we change any preexisting choices? |
2536 | if best_new_index != prev_best_index { |
2537 | cdef_change = true; |
2538 | best_index[sby * sb_w + sbx] = best_new_index; |
2539 | tileblocks_subset.set_cdef(loop_sbo, best_new_index as u8); |
2540 | } |
2541 | |
2542 | let mut cdef_ref_tm = TileMut::new( |
2543 | cdef_ref, |
2544 | TileRect { |
2545 | x: 0, |
2546 | y: 0, |
2547 | width: cdef_ref.planes[0].cfg.width, |
2548 | height: cdef_ref.planes[0].cfg.height, |
2549 | }, |
2550 | ); |
2551 | |
2552 | // Keep cdef output up to date; we need it for restoration |
2553 | // both below and above (padding) |
2554 | cdef_filter_superblock( |
2555 | fi, |
2556 | rec_copy, |
2557 | &mut cdef_ref_tm, |
2558 | &tileblocks_subset.as_const(), |
2559 | loop_sbo, |
2560 | best_index[sby * sb_w + sbx] as u8, |
2561 | &cdef_dirs[sby * sb_w + sbx], |
2562 | ); |
2563 | } |
2564 | } |
2565 | } |
2566 | |
2567 | if !cdef_change { |
2568 | break; |
2569 | } |
2570 | cdef_change = false; |
2571 | lrf_change = false; |
2572 | |
2573 | // search for improved restoration filter parameters if restoration is enabled |
2574 | if let Some(lrf_ref) = &mut lrf_work.as_mut() { |
2575 | let lrf_input = if cdef_work.is_some() { |
2576 | // When CDEF is enabled, we pull from the CDEF output |
2577 | cdef_work.as_ref().unwrap() |
2578 | } else { |
2579 | // When CDEF is disabled, we pull from the [optionally |
2580 | // deblocked] reconstruction |
2581 | &rec_subset |
2582 | }; |
2583 | for pli in 0..planes { |
2584 | // Nominal size of LRU in pixels before clipping to visible frame |
2585 | let unit_size = ts.restoration.planes[pli].rp_cfg.unit_size; |
2586 | // width, in sb, of an LRU in this plane |
2587 | let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift; |
2588 | // height, in sb, of an LRU in this plane |
2589 | let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift; |
2590 | let PlaneConfig { xdec, ydec, .. } = lrf_ref.planes[pli].cfg; |
2591 | for lru_y in 0..lru_h[pli] { |
2592 | // number of LRUs vertically |
2593 | for lru_x in 0..lru_w[pli] { |
2594 | // number of LRUs horizontally |
2595 | |
2596 | // determine whether this lru should be skipped |
2597 | if lru_skip[lru_y * MAX_LRU_SIZE + lru_x][pli] { |
2598 | continue; |
2599 | } |
2600 | |
2601 | let loop_sbo = TileSuperBlockOffset(SuperBlockOffset { |
2602 | x: lru_x * lru_sb_w, |
2603 | y: lru_y * lru_sb_h, |
2604 | }); |
2605 | if ts.restoration.has_restoration_unit( |
2606 | base_sbo + loop_sbo, |
2607 | pli, |
2608 | false, |
2609 | ) { |
2610 | let src_plane = &src_subset.planes[pli]; // uncompressed input for reference |
2611 | let lrf_in_plane = &lrf_input.planes[pli]; |
2612 | let lrf_po = loop_sbo.plane_offset(src_plane.plane_cfg); |
2613 | let mut best_new_lrf = best_lrf[lru_y * lru_w[pli] + lru_x][pli]; |
2614 | let mut best_cost = |
2615 | best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli]; |
2616 | |
2617 | // Check the no filter option |
2618 | { |
2619 | let err = rdo_loop_plane_error( |
2620 | base_sbo, |
2621 | loop_sbo, |
2622 | lru_sb_w, |
2623 | lru_sb_h, |
2624 | fi, |
2625 | ts, |
2626 | &tileblocks_subset.as_const(), |
2627 | lrf_input, |
2628 | &src_subset, |
2629 | pli, |
2630 | ); |
2631 | let rate = cw.fc.count_lrf_switchable( |
2632 | w, |
2633 | &ts.restoration.as_const(), |
2634 | best_new_lrf, |
2635 | pli, |
2636 | ); |
2637 | |
2638 | let cost = compute_rd_cost(fi, rate, err); |
2639 | // Was this choice actually an improvement? |
2640 | if best_cost < 0. || cost < best_cost { |
2641 | best_cost = cost; |
2642 | best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost; |
2643 | best_new_lrf = RestorationFilter::None; |
2644 | } |
2645 | } |
2646 | |
2647 | // Look for a self guided filter |
2648 | // We need the cropped-to-visible-frame computation area of this LRU |
2649 | let vis_width = unit_size.min( |
2650 | (crop_w >> xdec) |
2651 | - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).x as usize, |
2652 | ); |
2653 | let vis_height = unit_size.min( |
2654 | (crop_h >> ydec) |
2655 | - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).y as usize, |
2656 | ); |
2657 | |
2658 | // todo: experiment with borrowing border pixels |
2659 | // rather than edge-extending. Right now this is |
2660 | // hard-clipping to the superblock boundary. |
2661 | setup_integral_image( |
2662 | &mut ts.integral_buffer, |
2663 | SOLVE_IMAGE_STRIDE, |
2664 | vis_width, |
2665 | vis_height, |
2666 | vis_width, |
2667 | vis_height, |
2668 | &lrf_in_plane.slice(lrf_po), |
2669 | &lrf_in_plane.slice(lrf_po), |
2670 | ); |
2671 | |
2672 | for &set in get_sgr_sets(fi.config.speed_settings.sgr_complexity) |
2673 | { |
2674 | let (xqd0, xqd1) = sgrproj_solve( |
2675 | set, |
2676 | fi, |
2677 | &ts.integral_buffer, |
2678 | &src_plane |
2679 | .subregion(Area::StartingAt { x: lrf_po.x, y: lrf_po.y }), |
2680 | &lrf_in_plane.slice(lrf_po), |
2681 | vis_width, |
2682 | vis_height, |
2683 | ); |
2684 | let current_lrf = |
2685 | RestorationFilter::Sgrproj { set, xqd: [xqd0, xqd1] }; |
2686 | if let RestorationFilter::Sgrproj { set, xqd } = current_lrf { |
2687 | sgrproj_stripe_filter( |
2688 | set, |
2689 | xqd, |
2690 | fi, |
2691 | &ts.integral_buffer, |
2692 | SOLVE_IMAGE_STRIDE, |
2693 | &lrf_in_plane.slice(lrf_po), |
2694 | &mut lrf_ref.planes[pli].region_mut(Area::Rect { |
2695 | x: lrf_po.x, |
2696 | y: lrf_po.y, |
2697 | width: vis_width, |
2698 | height: vis_height, |
2699 | }), |
2700 | ); |
2701 | } |
2702 | let err = rdo_loop_plane_error( |
2703 | base_sbo, |
2704 | loop_sbo, |
2705 | lru_sb_w, |
2706 | lru_sb_h, |
2707 | fi, |
2708 | ts, |
2709 | &tileblocks_subset.as_const(), |
2710 | lrf_ref, |
2711 | &src_subset, |
2712 | pli, |
2713 | ); |
2714 | let rate = cw.fc.count_lrf_switchable( |
2715 | w, |
2716 | &ts.restoration.as_const(), |
2717 | current_lrf, |
2718 | pli, |
2719 | ); |
2720 | let cost = compute_rd_cost(fi, rate, err); |
2721 | if cost < best_cost { |
2722 | best_cost = cost; |
2723 | best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost; |
2724 | best_new_lrf = current_lrf; |
2725 | } |
2726 | } |
2727 | |
2728 | if best_lrf[lru_y * lru_w[pli] + lru_x][pli] |
2729 | .notequal(best_new_lrf) |
2730 | { |
2731 | best_lrf[lru_y * lru_w[pli] + lru_x][pli] = best_new_lrf; |
2732 | lrf_change = true; |
2733 | if let Some(ru) = ts.restoration.planes[pli] |
2734 | .restoration_unit_mut(base_sbo + loop_sbo) |
2735 | { |
2736 | ru.filter = best_new_lrf; |
2737 | } |
2738 | } |
2739 | } |
2740 | } |
2741 | } |
2742 | } |
2743 | } |
2744 | } |
2745 | } |
2746 | |
#[test]
2748 | fn estimate_rate_test() { |
2749 | assert_eq!(estimate_rate(0, TxSize::TX_4X4, 0), RDO_RATE_TABLE[0][0][0]); |
2750 | } |
2751 | |