1 | // Copyright (c) 2018-2022, The rav1e contributors. All rights reserved |
2 | // |
3 | // This source code is subject to the terms of the BSD 2 Clause License and |
4 | // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
5 | // was not distributed with this source code in the LICENSE file, you can |
6 | // obtain it at www.aomedia.org/license/software. If the Alliance for Open |
7 | // Media Patent License 1.0 was not distributed with this source code in the |
8 | // PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
9 | |
10 | use crate::cpu_features::CpuFeatureLevel; |
11 | use crate::util::*; |
12 | |
13 | use super::TxType; |
14 | |
// Select the forward-transform backend at compile time: hand-written
// x86_64 assembly when built with nasm, NEON assembly on aarch64, and
// otherwise the portable Rust fallback defined in `self::rust` below.
cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::transform::forward::*;
  } else if #[cfg(asm_neon)] {
    pub use crate::asm::aarch64::transform::forward::*;
  } else {
    pub use self::rust::*;
  }
}
24 | |
25 | pub mod rust { |
26 | use super::*; |
27 | use std::mem::MaybeUninit; |
28 | |
29 | use crate::transform::forward_shared::*; |
30 | use crate::transform::{av1_round_shift_array, valid_av1_transform, TxSize}; |
31 | use simd_helpers::cold_for_target_arch; |
32 | |
  // Signature shared by every 1-D forward transform kernel: transforms a
  // single row or column of coefficients in place.
  type TxfmFunc = fn(&mut [i32]);

  // NOTE(review): presumably expands to the 1-D transform kernels and the
  // `get_func` dispatcher used by `forward_transform` below — confirm
  // against `crate::transform::forward_shared`.
  impl_1d_tx!();
36 | |
37 | impl TxOperations for i32 { |
38 | fn zero() -> Self { |
39 | 0 |
40 | } |
41 | |
42 | fn tx_mul<const SHIFT: i32>(self, mul: i32) -> Self { |
43 | ((self * mul) + (1 << SHIFT >> 1)) >> SHIFT |
44 | } |
45 | |
46 | fn rshift1(self) -> Self { |
47 | (self + i32::from(self < 0)) >> 1 |
48 | } |
49 | |
50 | fn add(self, b: Self) -> Self { |
51 | self + b |
52 | } |
53 | |
54 | fn sub(self, b: Self) -> Self { |
55 | self - b |
56 | } |
57 | |
58 | fn add_avg(self, b: Self) -> Self { |
59 | (self + b) >> 1 |
60 | } |
61 | |
62 | fn sub_avg(self, b: Self) -> Self { |
63 | (self - b) >> 1 |
64 | } |
65 | } |
66 | |
67 | /// # Panics |
68 | /// |
69 | /// - If called with an invalid combination of `tx_size` and `tx_type` |
70 | #[cold_for_target_arch ("x86_64" )] |
71 | pub fn forward_transform<T: Coefficient>( |
72 | input: &[i16], output: &mut [MaybeUninit<T>], stride: usize, |
73 | tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel, |
74 | ) { |
75 | assert!(valid_av1_transform(tx_size, tx_type)); |
76 | |
77 | // Note when assigning txfm_size_col, we use the txfm_size from the |
78 | // row configuration and vice versa. This is intentionally done to |
79 | // accurately perform rectangular transforms. When the transform is |
80 | // rectangular, the number of columns will be the same as the |
81 | // txfm_size stored in the row cfg struct. It will make no difference |
82 | // for square transforms. |
83 | let txfm_size_col = tx_size.width(); |
84 | let txfm_size_row = tx_size.height(); |
85 | |
86 | let mut buf = Aligned::<[MaybeUninit<i32>; 64 * 64]>::uninit_array(); |
87 | let buf = &mut buf.data[..txfm_size_col * txfm_size_row]; |
88 | |
89 | let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd); |
90 | |
91 | let txfm_func_col = get_func(cfg.txfm_type_col); |
92 | let txfm_func_row = get_func(cfg.txfm_type_row); |
93 | |
94 | // Columns |
95 | for c in 0..txfm_size_col { |
96 | let mut col_coeffs = Aligned::<[MaybeUninit<i32>; 64]>::uninit_array(); |
97 | let col_coeffs = &mut col_coeffs.data[..txfm_size_row]; |
98 | if cfg.ud_flip { |
99 | // flip upside down |
100 | for r in 0..txfm_size_row { |
101 | col_coeffs[r] |
102 | .write((input[(txfm_size_row - r - 1) * stride + c]).into()); |
103 | } |
104 | } else { |
105 | for r in 0..txfm_size_row { |
106 | col_coeffs[r].write((input[r * stride + c]).into()); |
107 | } |
108 | } |
109 | // SAFETY: The loops above have initialized all txfm_size_row elements |
110 | let col_coeffs = unsafe { slice_assume_init_mut(col_coeffs) }; |
111 | |
112 | av1_round_shift_array(col_coeffs, txfm_size_row, -cfg.shift[0]); |
113 | txfm_func_col(col_coeffs); |
114 | av1_round_shift_array(col_coeffs, txfm_size_row, -cfg.shift[1]); |
115 | if cfg.lr_flip { |
116 | for r in 0..txfm_size_row { |
117 | // flip from left to right |
118 | buf[r * txfm_size_col + (txfm_size_col - c - 1)] |
119 | .write(col_coeffs[r]); |
120 | } |
121 | } else { |
122 | for r in 0..txfm_size_row { |
123 | buf[r * txfm_size_col + c].write(col_coeffs[r]); |
124 | } |
125 | } |
126 | } |
127 | // SAFETY: The loops above have initialized the entire buf |
128 | let buf = unsafe { slice_assume_init_mut(buf) }; |
129 | |
130 | // Rows |
131 | for (r, row_coeffs) in buf.chunks_exact_mut(txfm_size_col).enumerate() { |
132 | txfm_func_row(row_coeffs); |
133 | av1_round_shift_array(row_coeffs, txfm_size_col, -cfg.shift[2]); |
134 | |
135 | // Store output in at most 32x32 chunks so that the first 32x32 |
136 | // coefficients are stored first. When we don't have 64 rows, there is no |
137 | // change in order. With 64 rows, the chunks are in this order |
138 | // - First 32 rows and first 32 cols |
139 | // - Last 32 rows and first 32 cols |
140 | // - First 32 rows and last 32 cols |
141 | // - Last 32 rows and last 32 cols |
142 | |
143 | // Output is grouped into 32x32 chunks so a stride of at most 32 is |
144 | // used for each chunk. |
145 | let output_stride = txfm_size_row.min(32); |
146 | |
147 | // Split the first 32 rows from the last 32 rows |
148 | let output = &mut output |
149 | [(r >= 32) as usize * output_stride * txfm_size_col.min(32)..]; |
150 | |
151 | for cg in (0..txfm_size_col).step_by(32) { |
152 | // Split the first 32 cols from the last 32 cols |
153 | let output = &mut output[txfm_size_row * cg..]; |
154 | |
155 | for c in 0..txfm_size_col.min(32) { |
156 | output[c * output_stride + (r & 31)] |
157 | .write(T::cast_from(row_coeffs[c + cg])); |
158 | } |
159 | } |
160 | } |
161 | } |
162 | } |
163 | |