1 | // Copyright (c) 2019-2022, The rav1e contributors. All rights reserved |
2 | // |
3 | // This source code is subject to the terms of the BSD 2 Clause License and |
4 | // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
5 | // was not distributed with this source code in the LICENSE file, you can |
6 | // obtain it at www.aomedia.org/license/software. If the Alliance for Open |
7 | // Media Patent License 1.0 was not distributed with this source code in the |
8 | // PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
9 | |
// Select the motion-compensation implementation at compile time: re-export
// the hand-written assembly bindings when the target supports them,
// otherwise fall back to the portable Rust implementations in the `rust`
// module below.
cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::mc::*;
  } else if #[cfg(asm_neon)] {
    pub use crate::asm::aarch64::mc::*;
  } else {
    pub use self::rust::*;
  }
}
19 | |
20 | use crate::cpu_features::CpuFeatureLevel; |
21 | use crate::frame::*; |
22 | use crate::tiling::*; |
23 | use crate::util::*; |
24 | |
25 | use simd_helpers::cold_for_target_arch; |
26 | use std::ops; |
27 | |
/// A motion vector with components stored in units of one eighth of a
/// pixel (see [`MotionVector::quantize_to_fullpel`], which strips the
/// three fractional bits).
///
/// `row` is the vertical displacement and `col` the horizontal one.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct MotionVector {
  pub row: i16,
  pub col: i16,
}
33 | |
34 | impl MotionVector { |
35 | #[inline ] |
36 | pub const fn quantize_to_fullpel(self) -> Self { |
37 | Self { row: (self.row / 8) * 8, col: (self.col / 8) * 8 } |
38 | } |
39 | |
40 | #[inline ] |
41 | pub const fn is_zero(self) -> bool { |
42 | self.row == 0 && self.col == 0 |
43 | } |
44 | |
45 | #[inline ] |
46 | pub const fn is_valid(self) -> bool { |
47 | use crate::context::{MV_LOW, MV_UPP}; |
48 | ((MV_LOW as i16) < self.row && self.row < (MV_UPP as i16)) |
49 | && ((MV_LOW as i16) < self.col && self.col < (MV_UPP as i16)) |
50 | } |
51 | } |
52 | |
53 | impl ops::Mul<i16> for MotionVector { |
54 | type Output = MotionVector; |
55 | |
56 | #[inline ] |
57 | fn mul(self, rhs: i16) -> MotionVector { |
58 | MotionVector { row: self.row * rhs, col: self.col * rhs } |
59 | } |
60 | } |
61 | |
62 | impl ops::Mul<u16> for MotionVector { |
63 | type Output = MotionVector; |
64 | |
65 | #[inline ] |
66 | fn mul(self, rhs: u16) -> MotionVector { |
67 | MotionVector { row: self.row * rhs as i16, col: self.col * rhs as i16 } |
68 | } |
69 | } |
70 | |
71 | impl ops::Shr<u8> for MotionVector { |
72 | type Output = MotionVector; |
73 | |
74 | #[inline ] |
75 | fn shr(self, rhs: u8) -> MotionVector { |
76 | MotionVector { row: self.row >> rhs, col: self.col >> rhs } |
77 | } |
78 | } |
79 | |
80 | impl ops::Shl<u8> for MotionVector { |
81 | type Output = MotionVector; |
82 | |
83 | #[inline ] |
84 | fn shl(self, rhs: u8) -> MotionVector { |
85 | MotionVector { row: self.row << rhs, col: self.col << rhs } |
86 | } |
87 | } |
88 | |
89 | impl ops::Add<MotionVector> for MotionVector { |
90 | type Output = MotionVector; |
91 | |
92 | #[inline ] |
93 | fn add(self, rhs: MotionVector) -> MotionVector { |
94 | MotionVector { row: self.row + rhs.row, col: self.col + rhs.col } |
95 | } |
96 | } |
97 | |
/// Interpolation filter family used for sub-pixel motion compensation.
///
/// The explicit discriminants double as indices into the first dimension
/// of `SUBPEL_FILTERS` (see `rust::get_filter`).
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)]
#[allow(unused)]
pub enum FilterMode {
  REGULAR = 0,
  SMOOTH = 1,
  SHARP = 2,
  BILINEAR = 3,
  SWITCHABLE = 4,
}
107 | |
/// Number of taps in each sub-pel interpolation filter row.
pub const SUBPEL_FILTER_SIZE: usize = 8;

// Sub-pixel interpolation coefficient tables, indexed as
// `[filter set][fractional position 0..=15][tap]`.
//
// Every row of taps sums to 128, i.e. the filters carry 7 fractional bits
// of precision. `rust::get_filter` selects sets 0..=3 directly from the
// `FilterMode` discriminant for dimensions > 4 (or for BILINEAR), and
// falls back to the short-tap sets 4 and 5 for small dimensions
// (4 for REGULAR, 5 otherwise).
const SUBPEL_FILTERS: [[[i32; SUBPEL_FILTER_SIZE]; 16]; 6] = [
  // Set 0: REGULAR (8-tap)
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 2, -6, 126, 8, -2, 0, 0],
    [0, 2, -10, 122, 18, -4, 0, 0],
    [0, 2, -12, 116, 28, -8, 2, 0],
    [0, 2, -14, 110, 38, -10, 2, 0],
    [0, 2, -14, 102, 48, -12, 2, 0],
    [0, 2, -16, 94, 58, -12, 2, 0],
    [0, 2, -14, 84, 66, -12, 2, 0],
    [0, 2, -14, 76, 76, -14, 2, 0],
    [0, 2, -12, 66, 84, -14, 2, 0],
    [0, 2, -12, 58, 94, -16, 2, 0],
    [0, 2, -12, 48, 102, -14, 2, 0],
    [0, 2, -10, 38, 110, -14, 2, 0],
    [0, 2, -8, 28, 116, -12, 2, 0],
    [0, 0, -4, 18, 122, -10, 2, 0],
    [0, 0, -2, 8, 126, -6, 2, 0],
  ],
  // Set 1: SMOOTH (8-tap)
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 2, 28, 62, 34, 2, 0, 0],
    [0, 0, 26, 62, 36, 4, 0, 0],
    [0, 0, 22, 62, 40, 4, 0, 0],
    [0, 0, 20, 60, 42, 6, 0, 0],
    [0, 0, 18, 58, 44, 8, 0, 0],
    [0, 0, 16, 56, 46, 10, 0, 0],
    [0, -2, 16, 54, 48, 12, 0, 0],
    [0, -2, 14, 52, 52, 14, -2, 0],
    [0, 0, 12, 48, 54, 16, -2, 0],
    [0, 0, 10, 46, 56, 16, 0, 0],
    [0, 0, 8, 44, 58, 18, 0, 0],
    [0, 0, 6, 42, 60, 20, 0, 0],
    [0, 0, 4, 40, 62, 22, 0, 0],
    [0, 0, 4, 36, 62, 26, 0, 0],
    [0, 0, 2, 34, 62, 28, 2, 0],
  ],
  // Set 2: SHARP (8-tap, uses all eight taps)
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [-2, 2, -6, 126, 8, -2, 2, 0],
    [-2, 6, -12, 124, 16, -6, 4, -2],
    [-2, 8, -18, 120, 26, -10, 6, -2],
    [-4, 10, -22, 116, 38, -14, 6, -2],
    [-4, 10, -22, 108, 48, -18, 8, -2],
    [-4, 10, -24, 100, 60, -20, 8, -2],
    [-4, 10, -24, 90, 70, -22, 10, -2],
    [-4, 12, -24, 80, 80, -24, 12, -4],
    [-2, 10, -22, 70, 90, -24, 10, -4],
    [-2, 8, -20, 60, 100, -24, 10, -4],
    [-2, 8, -18, 48, 108, -22, 10, -4],
    [-2, 6, -14, 38, 116, -22, 10, -4],
    [-2, 6, -10, 26, 120, -18, 8, -2],
    [-2, 4, -6, 16, 124, -12, 6, -2],
    [0, 2, -2, 8, 126, -6, 2, -2],
  ],
  // Set 3: BILINEAR (two non-zero taps per row)
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 0, 0, 120, 8, 0, 0, 0],
    [0, 0, 0, 112, 16, 0, 0, 0],
    [0, 0, 0, 104, 24, 0, 0, 0],
    [0, 0, 0, 96, 32, 0, 0, 0],
    [0, 0, 0, 88, 40, 0, 0, 0],
    [0, 0, 0, 80, 48, 0, 0, 0],
    [0, 0, 0, 72, 56, 0, 0, 0],
    [0, 0, 0, 64, 64, 0, 0, 0],
    [0, 0, 0, 56, 72, 0, 0, 0],
    [0, 0, 0, 48, 80, 0, 0, 0],
    [0, 0, 0, 40, 88, 0, 0, 0],
    [0, 0, 0, 32, 96, 0, 0, 0],
    [0, 0, 0, 24, 104, 0, 0, 0],
    [0, 0, 0, 16, 112, 0, 0, 0],
    [0, 0, 0, 8, 120, 0, 0, 0],
  ],
  // Set 4: short-tap REGULAR, used for small block dimensions
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 0, -4, 126, 8, -2, 0, 0],
    [0, 0, -8, 122, 18, -4, 0, 0],
    [0, 0, -10, 116, 28, -6, 0, 0],
    [0, 0, -12, 110, 38, -8, 0, 0],
    [0, 0, -12, 102, 48, -10, 0, 0],
    [0, 0, -14, 94, 58, -10, 0, 0],
    [0, 0, -12, 84, 66, -10, 0, 0],
    [0, 0, -12, 76, 76, -12, 0, 0],
    [0, 0, -10, 66, 84, -12, 0, 0],
    [0, 0, -10, 58, 94, -14, 0, 0],
    [0, 0, -10, 48, 102, -12, 0, 0],
    [0, 0, -8, 38, 110, -12, 0, 0],
    [0, 0, -6, 28, 116, -10, 0, 0],
    [0, 0, -4, 18, 122, -8, 0, 0],
    [0, 0, -2, 8, 126, -4, 0, 0],
  ],
  // Set 5: short-tap SMOOTH, used for small block dimensions
  [
    [0, 0, 0, 128, 0, 0, 0, 0],
    [0, 0, 30, 62, 34, 2, 0, 0],
    [0, 0, 26, 62, 36, 4, 0, 0],
    [0, 0, 22, 62, 40, 4, 0, 0],
    [0, 0, 20, 60, 42, 6, 0, 0],
    [0, 0, 18, 58, 44, 8, 0, 0],
    [0, 0, 16, 56, 46, 10, 0, 0],
    [0, 0, 14, 54, 48, 12, 0, 0],
    [0, 0, 12, 52, 52, 12, 0, 0],
    [0, 0, 12, 48, 54, 14, 0, 0],
    [0, 0, 10, 46, 56, 16, 0, 0],
    [0, 0, 8, 44, 58, 18, 0, 0],
    [0, 0, 6, 42, 60, 20, 0, 0],
    [0, 0, 4, 40, 62, 22, 0, 0],
    [0, 0, 4, 36, 62, 26, 0, 0],
    [0, 0, 2, 34, 62, 30, 0, 0],
  ],
];
220 | |
221 | pub(crate) mod rust { |
222 | use super::*; |
223 | use num_traits::*; |
224 | |
/// Applies one 8-tap FIR filter at `src`, sampling one input element every
/// `stride` elements, and returns the raw (unshifted, unclamped) 32-bit
/// accumulator.
///
/// # Safety
///
/// `src.add(i * stride)` is dereferenced for every `i` in `0..8`, so the
/// caller must guarantee that `src` points into an allocation with at
/// least `1 + 7 * stride` valid elements starting at `src`.
unsafe fn run_filter<T: AsPrimitive<i32>>(
  src: *const T, stride: usize, filter: [i32; 8],
) -> i32 {
  filter
    .iter()
    .enumerate()
    .map(|(i, f)| {
      let p = src.add(i * stride);
      f * (*p).as_()
    })
    .sum::<i32>()
}
237 | |
238 | fn get_filter( |
239 | mode: FilterMode, frac: i32, length: usize, |
240 | ) -> [i32; SUBPEL_FILTER_SIZE] { |
241 | let filter_idx = if mode == FilterMode::BILINEAR || length > 4 { |
242 | mode as usize |
243 | } else { |
244 | (mode as usize).min(1) + 4 |
245 | }; |
246 | SUBPEL_FILTERS[filter_idx][frac as usize] |
247 | } |
248 | |
/// Produces one motion-compensated prediction block directly into `dst`.
///
/// `src` must already be positioned at the block's top-left full-pel
/// sample. `col_frac` / `row_frac` select the fractional-position rows of
/// the filters chosen per direction by `mode_x` / `mode_y` (a frac of 0
/// means no filtering is needed in that direction). Output samples are
/// clamped to `[0, (1 << bit_depth) - 1]`.
///
/// The match dispatches on which directions need filtering:
/// * `(0, 0)` — plain row-by-row copy
/// * `(0, _)` — vertical-only filtering
/// * `(_, 0)` — horizontal-only filtering
/// * `(_, _)` — separable 2-D filtering through an i16 scratch buffer,
///   processed in groups of up to 8 columns
#[cold_for_target_arch("x86_64")]
pub fn put_8tap<T: Pixel>(
  dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize,
  height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode,
  mode_y: FilterMode, bit_depth: usize, _cpu: CpuFeatureLevel,
) {
  // The assembly only supports even heights and valid uncropped widths
  assert_eq!(height & 1, 0);
  assert!(width.is_power_of_two() && (2..=128).contains(&width));

  let ref_stride = src.plane.cfg.stride;
  let y_filter = get_filter(mode_y, row_frac, height);
  let x_filter = get_filter(mode_x, col_frac, width);
  let max_sample_val = (1 << bit_depth) - 1;
  // 12-bit samples leave less headroom in the i32 accumulator, so keep
  // fewer extra precision bits between the two filter passes.
  let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
  match (col_frac, row_frac) {
    (0, 0) => {
      // No sub-pel offset in either direction: a straight copy.
      for r in 0..height {
        let src_slice = &src[r];
        let dst_slice = &mut dst[r];
        dst_slice[..width].copy_from_slice(&src_slice[..width]);
      }
    }
    (0, _) => {
      // Vertical-only: back the source up 3 rows so the 8-tap window is
      // centered on the output row, then apply the full 7-bit shift in
      // a single rounding step.
      let offset_slice = src.go_up(3);
      for r in 0..height {
        let src_slice = &offset_slice[r];
        let dst_slice = &mut dst[r];
        for c in 0..width {
          dst_slice[c] = T::cast_from(
            round_shift(
              // SAFETY: We pass this a raw pointer, but it's created from a
              // checked slice, so we are safe.
              unsafe {
                run_filter(src_slice[c..].as_ptr(), ref_stride, y_filter)
              },
              7,
            )
            .clamp(0, max_sample_val),
          );
        }
      }
    }
    (_, 0) => {
      // Horizontal-only: back the source up 3 columns. The 7-bit shift is
      // split into two rounding stages, presumably so the rounding matches
      // the two-pass (_, _) path — confirm against the reference decoder.
      let offset_slice = src.go_left(3);
      for r in 0..height {
        let src_slice = &offset_slice[r];
        let dst_slice = &mut dst[r];
        for c in 0..width {
          dst_slice[c] = T::cast_from(
            round_shift(
              round_shift(
                // SAFETY: We pass this a raw pointer, but it's created from a
                // checked slice, so we are safe.
                unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) },
                7 - intermediate_bits,
              ),
              intermediate_bits,
            )
            .clamp(0, max_sample_val),
          );
        }
      }
    }
    (_, _) => {
      // Separable 2-D case: horizontal pass into an i16 scratch buffer
      // (8 columns wide, max height 128 plus 7 rows of filter support),
      // then a vertical pass over the buffer.
      let mut intermediate: [i16; 8 * (128 + 7)] = [0; 8 * (128 + 7)];

      let offset_slice = src.go_left(3).go_up(3);
      for cg in (0..width).step_by(8) {
        // Horizontal pass over `height + 7` rows (extra rows feed the
        // vertical filter's taps), keeping `intermediate_bits` of extra
        // precision for the second pass.
        for r in 0..height + 7 {
          let src_slice = &offset_slice[r];
          for c in cg..(cg + 8).min(width) {
            intermediate[8 * r + (c - cg)] = round_shift(
              // SAFETY: We pass this a raw pointer, but it's created from a
              // checked slice, so we are safe.
              unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) },
              7 - intermediate_bits,
            ) as i16;
          }
        }

        // Vertical pass: filter down the scratch buffer (stride 8) and
        // remove both the 7 filter bits and the extra precision bits.
        for r in 0..height {
          let dst_slice = &mut dst[r];
          for c in cg..(cg + 8).min(width) {
            dst_slice[c] = T::cast_from(
              round_shift(
                // SAFETY: We pass this a raw pointer, but it's created from a
                // checked slice, so we are safe.
                unsafe {
                  run_filter(
                    intermediate[8 * r + c - cg..].as_ptr(),
                    8,
                    y_filter,
                  )
                },
                7 + intermediate_bits,
              )
              .clamp(0, max_sample_val),
            );
          }
        }
      }
    }
  }
}
354 | |
// HBD output interval is [-20588, 36956] (10-bit), [-20602, 36983] (12-bit)
// Subtract PREP_BIAS to ensure result fits in i16 and matches dav1d assembly
const PREP_BIAS: i32 = 8192;

/// Filters one prediction block into the intermediate `tmp` buffer instead
/// of a pixel plane, keeping `intermediate_bits` of extra precision.
///
/// The output is a `width * height` row-major i16 buffer; for high bit
/// depths `PREP_BIAS` is subtracted so values fit in i16 (see the constant
/// above). `mc_avg` later combines two such buffers, undoing the bias and
/// the extra precision.
///
/// The dispatch mirrors `put_8tap`:
/// * `(0, 0)` — copy, shifted up by `intermediate_bits`
/// * `(0, _)` — vertical-only filtering
/// * `(_, 0)` — horizontal-only filtering
/// * `(_, _)` — separable 2-D filtering via an i16 scratch buffer in
///   groups of up to 8 columns
#[cold_for_target_arch("x86_64")]
pub fn prep_8tap<T: Pixel>(
  tmp: &mut [i16], src: PlaneSlice<'_, T>, width: usize, height: usize,
  col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode,
  bit_depth: usize, _cpu: CpuFeatureLevel,
) {
  // The assembly only supports even heights and valid uncropped widths
  assert_eq!(height & 1, 0);
  assert!(width.is_power_of_two() && (2..=128).contains(&width));

  let ref_stride = src.plane.cfg.stride;
  let y_filter = get_filter(mode_y, row_frac, height);
  let x_filter = get_filter(mode_x, col_frac, width);
  // 12-bit samples leave less headroom in the i32 accumulator, so keep
  // fewer extra precision bits.
  let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
  // 8-bit output already fits in i16 without the bias.
  let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS };
  match (col_frac, row_frac) {
    (0, 0) => {
      // No filtering: scale samples up to intermediate precision.
      for r in 0..height {
        let src_slice = &src[r];
        for c in 0..width {
          tmp[r * width + c] = (i16::cast_from(src_slice[c])
            << intermediate_bits)
            - prep_bias as i16;
        }
      }
    }
    (0, _) => {
      // Vertical-only: back up 3 rows to center the 8-tap window; keep
      // `intermediate_bits` of extra precision in the output.
      let offset_slice = src.go_up(3);
      for r in 0..height {
        let src_slice = &offset_slice[r];
        for c in 0..width {
          tmp[r * width + c] = (round_shift(
            // SAFETY: We pass this a raw pointer, but it's created from a
            // checked slice, so we are safe.
            unsafe {
              run_filter(src_slice[c..].as_ptr(), ref_stride, y_filter)
            },
            7 - intermediate_bits,
          ) - prep_bias) as i16;
        }
      }
    }
    (_, 0) => {
      // Horizontal-only: back up 3 columns, same precision handling.
      let offset_slice = src.go_left(3);
      for r in 0..height {
        let src_slice = &offset_slice[r];
        for c in 0..width {
          tmp[r * width + c] = (round_shift(
            // SAFETY: We pass this a raw pointer, but it's created from a
            // checked slice, so we are safe.
            unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) },
            7 - intermediate_bits,
          ) - prep_bias) as i16;
        }
      }
    }
    (_, _) => {
      // Separable 2-D case: horizontal pass into an i16 scratch buffer
      // (8 columns wide, max height 128 plus 7 rows of filter support),
      // then a vertical pass over the buffer.
      let mut intermediate: [i16; 8 * (128 + 7)] = [0; 8 * (128 + 7)];

      let offset_slice = src.go_left(3).go_up(3);
      for cg in (0..width).step_by(8) {
        // Horizontal pass over `height + 7` rows; the extra rows feed the
        // vertical filter's taps.
        for r in 0..height + 7 {
          let src_slice = &offset_slice[r];
          for c in cg..(cg + 8).min(width) {
            intermediate[8 * r + (c - cg)] = round_shift(
              // SAFETY: We pass this a raw pointer, but it's created from a
              // checked slice, so we are safe.
              unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) },
              7 - intermediate_bits,
            ) as i16;
          }
        }

        // Vertical pass: only the 7 filter bits are removed here, leaving
        // `intermediate_bits` of precision in `tmp` for `mc_avg`.
        for r in 0..height {
          for c in cg..(cg + 8).min(width) {
            tmp[r * width + c] = (round_shift(
              // SAFETY: We pass this a raw pointer, but it's created from a
              // checked slice, so we are safe.
              unsafe {
                run_filter(
                  intermediate[8 * r + c - cg..].as_ptr(),
                  8,
                  y_filter,
                )
              },
              7,
            ) - prep_bias) as i16;
          }
        }
      }
    }
  }
}
452 | |
453 | #[cold_for_target_arch ("x86_64" )] |
454 | pub fn mc_avg<T: Pixel>( |
455 | dst: &mut PlaneRegionMut<'_, T>, tmp1: &[i16], tmp2: &[i16], width: usize, |
456 | height: usize, bit_depth: usize, _cpu: CpuFeatureLevel, |
457 | ) { |
458 | // The assembly only supports even heights and valid uncropped widths |
459 | assert_eq!(height & 1, 0); |
460 | assert!(width.is_power_of_two() && (2..=128).contains(&width)); |
461 | |
462 | let max_sample_val = (1 << bit_depth) - 1; |
463 | let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 }; |
464 | let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS * 2 }; |
465 | for r in 0..height { |
466 | let dst_slice = &mut dst[r]; |
467 | for c in 0..width { |
468 | dst_slice[c] = T::cast_from( |
469 | round_shift( |
470 | tmp1[r * width + c] as i32 |
471 | + tmp2[r * width + c] as i32 |
472 | + prep_bias, |
473 | intermediate_bits + 1, |
474 | ) |
475 | .clamp(0, max_sample_val), |
476 | ); |
477 | } |
478 | } |
479 | } |
480 | } |
481 | |