| 1 | // Copyright (c) 2018-2022, The rav1e contributors. All rights reserved |
| 2 | // |
| 3 | // This source code is subject to the terms of the BSD 2 Clause License and |
| 4 | // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 5 | // was not distributed with this source code in the LICENSE file, you can |
| 6 | // obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 7 | // Media Patent License 1.0 was not distributed with this source code in the |
| 8 | // PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| 9 | |
| 10 | use crate::cpu_features::CpuFeatureLevel; |
| 11 | use crate::util::*; |
| 12 | |
| 13 | use super::TxType; |
| 14 | |
| 15 | cfg_if::cfg_if! { |
| 16 | if #[cfg(nasm_x86_64)] { |
| 17 | pub use crate::asm::x86::transform::forward::*; |
| 18 | } else if #[cfg(asm_neon)] { |
| 19 | pub use crate::asm::aarch64::transform::forward::*; |
| 20 | } else { |
| 21 | pub use self::rust::*; |
| 22 | } |
| 23 | } |
| 24 | |
| 25 | pub mod rust { |
| 26 | use super::*; |
| 27 | use std::mem::MaybeUninit; |
| 28 | |
| 29 | use crate::transform::forward_shared::*; |
| 30 | use crate::transform::{av1_round_shift_array, valid_av1_transform, TxSize}; |
| 31 | use simd_helpers::cold_for_target_arch; |
| 32 | |
| 33 | type TxfmFunc = fn(&mut [i32]); |
| 34 | |
| 35 | impl_1d_tx!(); |
| 36 | |
| 37 | impl TxOperations for i32 { |
| 38 | fn zero() -> Self { |
| 39 | 0 |
| 40 | } |
| 41 | |
| 42 | fn tx_mul<const SHIFT: i32>(self, mul: i32) -> Self { |
| 43 | ((self * mul) + (1 << SHIFT >> 1)) >> SHIFT |
| 44 | } |
| 45 | |
| 46 | fn rshift1(self) -> Self { |
| 47 | (self + i32::from(self < 0)) >> 1 |
| 48 | } |
| 49 | |
| 50 | fn add(self, b: Self) -> Self { |
| 51 | self + b |
| 52 | } |
| 53 | |
| 54 | fn sub(self, b: Self) -> Self { |
| 55 | self - b |
| 56 | } |
| 57 | |
| 58 | fn add_avg(self, b: Self) -> Self { |
| 59 | (self + b) >> 1 |
| 60 | } |
| 61 | |
| 62 | fn sub_avg(self, b: Self) -> Self { |
| 63 | (self - b) >> 1 |
| 64 | } |
| 65 | } |
| 66 | |
| 67 | /// # Panics |
| 68 | /// |
| 69 | /// - If called with an invalid combination of `tx_size` and `tx_type` |
| 70 | #[cold_for_target_arch ("x86_64" )] |
| 71 | pub fn forward_transform<T: Coefficient>( |
| 72 | input: &[i16], output: &mut [MaybeUninit<T>], stride: usize, |
| 73 | tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel, |
| 74 | ) { |
| 75 | assert!(valid_av1_transform(tx_size, tx_type)); |
| 76 | |
| 77 | // Note when assigning txfm_size_col, we use the txfm_size from the |
| 78 | // row configuration and vice versa. This is intentionally done to |
| 79 | // accurately perform rectangular transforms. When the transform is |
| 80 | // rectangular, the number of columns will be the same as the |
| 81 | // txfm_size stored in the row cfg struct. It will make no difference |
| 82 | // for square transforms. |
| 83 | let txfm_size_col = tx_size.width(); |
| 84 | let txfm_size_row = tx_size.height(); |
| 85 | |
| 86 | let mut buf = Aligned::<[MaybeUninit<i32>; 64 * 64]>::uninit_array(); |
| 87 | let buf = &mut buf.data[..txfm_size_col * txfm_size_row]; |
| 88 | |
| 89 | let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd); |
| 90 | |
| 91 | let txfm_func_col = get_func(cfg.txfm_type_col); |
| 92 | let txfm_func_row = get_func(cfg.txfm_type_row); |
| 93 | |
| 94 | // Columns |
| 95 | for c in 0..txfm_size_col { |
| 96 | let mut col_coeffs = Aligned::<[MaybeUninit<i32>; 64]>::uninit_array(); |
| 97 | let col_coeffs = &mut col_coeffs.data[..txfm_size_row]; |
| 98 | if cfg.ud_flip { |
| 99 | // flip upside down |
| 100 | for r in 0..txfm_size_row { |
| 101 | col_coeffs[r] |
| 102 | .write((input[(txfm_size_row - r - 1) * stride + c]).into()); |
| 103 | } |
| 104 | } else { |
| 105 | for r in 0..txfm_size_row { |
| 106 | col_coeffs[r].write((input[r * stride + c]).into()); |
| 107 | } |
| 108 | } |
| 109 | // SAFETY: The loops above have initialized all txfm_size_row elements |
| 110 | let col_coeffs = unsafe { slice_assume_init_mut(col_coeffs) }; |
| 111 | |
| 112 | av1_round_shift_array(col_coeffs, txfm_size_row, -cfg.shift[0]); |
| 113 | txfm_func_col(col_coeffs); |
| 114 | av1_round_shift_array(col_coeffs, txfm_size_row, -cfg.shift[1]); |
| 115 | if cfg.lr_flip { |
| 116 | for r in 0..txfm_size_row { |
| 117 | // flip from left to right |
| 118 | buf[r * txfm_size_col + (txfm_size_col - c - 1)] |
| 119 | .write(col_coeffs[r]); |
| 120 | } |
| 121 | } else { |
| 122 | for r in 0..txfm_size_row { |
| 123 | buf[r * txfm_size_col + c].write(col_coeffs[r]); |
| 124 | } |
| 125 | } |
| 126 | } |
| 127 | // SAFETY: The loops above have initialized the entire buf |
| 128 | let buf = unsafe { slice_assume_init_mut(buf) }; |
| 129 | |
| 130 | // Rows |
| 131 | for (r, row_coeffs) in buf.chunks_exact_mut(txfm_size_col).enumerate() { |
| 132 | txfm_func_row(row_coeffs); |
| 133 | av1_round_shift_array(row_coeffs, txfm_size_col, -cfg.shift[2]); |
| 134 | |
| 135 | // Store output in at most 32x32 chunks so that the first 32x32 |
| 136 | // coefficients are stored first. When we don't have 64 rows, there is no |
| 137 | // change in order. With 64 rows, the chunks are in this order |
| 138 | // - First 32 rows and first 32 cols |
| 139 | // - Last 32 rows and first 32 cols |
| 140 | // - First 32 rows and last 32 cols |
| 141 | // - Last 32 rows and last 32 cols |
| 142 | |
| 143 | // Output is grouped into 32x32 chunks so a stride of at most 32 is |
| 144 | // used for each chunk. |
| 145 | let output_stride = txfm_size_row.min(32); |
| 146 | |
| 147 | // Split the first 32 rows from the last 32 rows |
| 148 | let output = &mut output |
| 149 | [(r >= 32) as usize * output_stride * txfm_size_col.min(32)..]; |
| 150 | |
| 151 | for cg in (0..txfm_size_col).step_by(32) { |
| 152 | // Split the first 32 cols from the last 32 cols |
| 153 | let output = &mut output[txfm_size_row * cg..]; |
| 154 | |
| 155 | for c in 0..txfm_size_col.min(32) { |
| 156 | output[c * output_stride + (r & 31)] |
| 157 | .write(T::cast_from(row_coeffs[c + cg])); |
| 158 | } |
| 159 | } |
| 160 | } |
| 161 | } |
| 162 | } |
| 163 | |