1// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved
2//
3// This source code is subject to the terms of the BSD 2 Clause License and
4// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5// was not distributed with this source code in the LICENSE file, you can
6// obtain it at www.aomedia.org/license/software. If the Alliance for Open
7// Media Patent License 1.0 was not distributed with this source code in the
8// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9
10use crate::cpu_features::CpuFeatureLevel;
11use crate::util::*;
12
13use super::TxType;
14
// Select the forward-transform implementation at compile time: hand-written
// x86-64 (nasm) or aarch64 NEON assembly when the corresponding cfg is set,
// otherwise the portable Rust fallback defined in the `rust` module below.
cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::transform::forward::*;
  } else if #[cfg(asm_neon)] {
    pub use crate::asm::aarch64::transform::forward::*;
  } else {
    pub use self::rust::*;
  }
}
24
25pub mod rust {
26 use super::*;
27 use std::mem::MaybeUninit;
28
29 use crate::transform::forward_shared::*;
30 use crate::transform::{av1_round_shift_array, valid_av1_transform, TxSize};
31 use simd_helpers::cold_for_target_arch;
32
  /// Signature of a 1-D transform kernel operating in place on a slice of
  /// intermediate `i32` coefficients.
  type TxfmFunc = fn(&mut [i32]);

  // Expands the shared 1-D forward-transform kernels; presumably this also
  // provides the `get_func` lookup used by `forward_transform` below (the
  // macro is defined in `transform::forward_shared` — confirm there).
  impl_1d_tx!();
36
37 impl TxOperations for i32 {
38 fn zero() -> Self {
39 0
40 }
41
42 fn tx_mul<const SHIFT: i32>(self, mul: i32) -> Self {
43 ((self * mul) + (1 << SHIFT >> 1)) >> SHIFT
44 }
45
46 fn rshift1(self) -> Self {
47 (self + i32::from(self < 0)) >> 1
48 }
49
50 fn add(self, b: Self) -> Self {
51 self + b
52 }
53
54 fn sub(self, b: Self) -> Self {
55 self - b
56 }
57
58 fn add_avg(self, b: Self) -> Self {
59 (self + b) >> 1
60 }
61
62 fn sub_avg(self, b: Self) -> Self {
63 (self - b) >> 1
64 }
65 }
66
  /// Computes the 2-D forward transform of a `tx_size` block of residuals.
  ///
  /// `input` holds `i16` residuals with row pitch `stride`; the resulting
  /// coefficients are converted to `T` and written into `output`. Exactly
  /// `tx_size.width() * tx_size.height()` elements of `output` are written
  /// (grouped in at-most-32x32 chunks — see the comment in the row pass).
  /// `bd` is the bit depth used to configure the transform stages; `_cpu` is
  /// unused in this pure-Rust fallback and exists only to match the asm
  /// entry points' signature.
  ///
  /// # Panics
  ///
  /// - If called with an invalid combination of `tx_size` and `tx_type`
  #[cold_for_target_arch("x86_64")]
  pub fn forward_transform<T: Coefficient>(
    input: &[i16], output: &mut [MaybeUninit<T>], stride: usize,
    tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel,
  ) {
    assert!(valid_av1_transform(tx_size, tx_type));

    // Note when assigning txfm_size_col, we use the txfm_size from the
    // row configuration and vice versa. This is intentionally done to
    // accurately perform rectangular transforms. When the transform is
    // rectangular, the number of columns will be the same as the
    // txfm_size stored in the row cfg struct. It will make no difference
    // for square transforms.
    let txfm_size_col = tx_size.width();
    let txfm_size_row = tx_size.height();

    // Intermediate buffer sized for the largest transform (64x64); only the
    // txfm_size_col * txfm_size_row prefix is used and initialized.
    let mut buf = Aligned::<[MaybeUninit<i32>; 64 * 64]>::uninit_array();
    let buf = &mut buf.data[..txfm_size_col * txfm_size_row];

    // Per-transform configuration: the 1-D kernel types, the per-stage
    // shifts, and whether the input/output must be flipped.
    let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd);

    let txfm_func_col = get_func(cfg.txfm_type_col);
    let txfm_func_row = get_func(cfg.txfm_type_row);

    // Columns
    for c in 0..txfm_size_col {
      let mut col_coeffs = Aligned::<[MaybeUninit<i32>; 64]>::uninit_array();
      let col_coeffs = &mut col_coeffs.data[..txfm_size_row];
      if cfg.ud_flip {
        // flip upside down
        for r in 0..txfm_size_row {
          col_coeffs[r]
            .write((input[(txfm_size_row - r - 1) * stride + c]).into());
        }
      } else {
        for r in 0..txfm_size_row {
          col_coeffs[r].write((input[r * stride + c]).into());
        }
      }
      // SAFETY: The loops above have initialized all txfm_size_row elements
      let col_coeffs = unsafe { slice_assume_init_mut(col_coeffs) };

      // Pre-shift, 1-D column transform, post-shift. Shifts are negated
      // here; presumably cfg.shift stores them with the opposite sign
      // convention expected by av1_round_shift_array — confirm there.
      av1_round_shift_array(col_coeffs, txfm_size_row, -cfg.shift[0]);
      txfm_func_col(col_coeffs);
      av1_round_shift_array(col_coeffs, txfm_size_row, -cfg.shift[1]);
      // Transpose the column results into `buf` row-major so the row pass
      // can operate on contiguous rows.
      if cfg.lr_flip {
        for r in 0..txfm_size_row {
          // flip from left to right
          buf[r * txfm_size_col + (txfm_size_col - c - 1)]
            .write(col_coeffs[r]);
        }
      } else {
        for r in 0..txfm_size_row {
          buf[r * txfm_size_col + c].write(col_coeffs[r]);
        }
      }
    }
    // SAFETY: The loops above have initialized the entire buf
    let buf = unsafe { slice_assume_init_mut(buf) };

    // Rows
    for (r, row_coeffs) in buf.chunks_exact_mut(txfm_size_col).enumerate() {
      txfm_func_row(row_coeffs);
      av1_round_shift_array(row_coeffs, txfm_size_col, -cfg.shift[2]);

      // Store output in at most 32x32 chunks so that the first 32x32
      // coefficients are stored first. When we don't have 64 rows, there is no
      // change in order. With 64 rows, the chunks are in this order
      // - First 32 rows and first 32 cols
      // - Last 32 rows and first 32 cols
      // - First 32 rows and last 32 cols
      // - Last 32 rows and last 32 cols

      // Output is grouped into 32x32 chunks so a stride of at most 32 is
      // used for each chunk.
      let output_stride = txfm_size_row.min(32);

      // Split the first 32 rows from the last 32 rows
      let output = &mut output
        [(r >= 32) as usize * output_stride * txfm_size_col.min(32)..];

      for cg in (0..txfm_size_col).step_by(32) {
        // Split the first 32 cols from the last 32 cols
        let output = &mut output[txfm_size_row * cg..];

        // Within a chunk, coefficients are stored transposed: column-major
        // with pitch `output_stride` (`r & 31` is the row inside the chunk).
        for c in 0..txfm_size_col.min(32) {
          output[c * output_stride + (r & 31)]
            .write(T::cast_from(row_coeffs[c + cg]));
        }
      }
    }
  }
162}
163