1// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved
2//
3// This source code is subject to the terms of the BSD 2 Clause License and
4// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5// was not distributed with this source code in the LICENSE file, you can
6// obtain it at www.aomedia.org/license/software. If the Alliance for Open
7// Media Patent License 1.0 was not distributed with this source code in the
8// PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9
10cfg_if::cfg_if! {
11 if #[cfg(nasm_x86_64)] {
12 pub use crate::asm::x86::transform::inverse::*;
13 } else if #[cfg(asm_neon)] {
14 pub use crate::asm::aarch64::transform::inverse::*;
15 } else {
16 pub use self::rust::*;
17 }
18}
19
20use crate::tiling::PlaneRegionMut;
21use crate::util::*;
22
23// TODO: move 1d txfm code to rust module.
24
25use super::clamp_value;
26use super::consts::*;
27use super::get_1d_tx_types;
28use super::get_rect_tx_log_ratio;
29use super::half_btf;
30use super::TxSize;
31use super::TxType;
32
/// Inverse 4-point Walsh-Hadamard transform.
///
/// Reads the first four items of `input` and writes four results to the
/// start of `output`; any further items of either slice are left untouched.
/// `_range` is ignored.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 4 items.
pub fn av1_iwht4(input: &[i32], output: &mut [i32], _range: usize) {
  assert!(input.len() >= 4);
  assert!(output.len() >= 4);

  // <https://aomediacodec.github.io/av1-spec/#inverse-walsh-hadamard-transform-process>
  let (x0, x1, x2, x3) = (input[0], input[1], input[2], input[3]);

  let s0 = x0 + x1;
  let s2 = x2 - x3;
  // `>>` on `i32` is an arithmetic shift, so negative differences round
  // towards negative infinity.
  let s4 = (s0 - s2) >> 1;
  let s3 = s4 - x3;
  let s1 = s4 - x1;

  output[0] = s0 - s3;
  output[1] = s3;
  output[2] = s1;
  output[3] = s2 + s1;
}
55
/// Cosine table in fixed point with [`INV_COS_BIT`] fractional bits:
/// entry `i` is `cos(i * pi / 128)` scaled by 4096 and rounded, so
/// `COSPI_INV[0]` is 4096 (1.0) and `COSPI_INV[32]` is 2896 (`cos(pi / 4)`).
static COSPI_INV: [i32; 64] = [
  4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973, 3948,
  3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564, 3513, 3461,
  3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896, 2824, 2751, 2675,
  2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019, 1931, 1842, 1751, 1660,
  1567, 1474, 1380, 1285, 1189, 1092, 995, 897, 799, 700, 601, 501, 401, 301,
  201, 101,
];

/// Sine table used by the 4-point inverse ADST, in the same 12-bit fixed
/// point: entry `k` is `sin(k * pi / 9) * 2 * sqrt(2) / 3` scaled by 4096
/// and rounded.
static SINPI_INV: [i32; 5] = [0, 1321, 2482, 3344, 3803];

/// Number of fractional bits in [`COSPI_INV`] (4096 == 1 << 12); passed as
/// the shift argument to `half_btf` throughout this file.
const INV_COS_BIT: usize = 12;
68
69/// # Panics
70///
71/// - If `input` or `output` have fewer than 4 items.
72pub fn av1_idct4(input: &[i32], output: &mut [i32], range: usize) {
73 assert!(input.len() >= 4);
74 assert!(output.len() >= 4);
75
76 // stage 1
77 let stg1: [i32; 4] = [input[0], input[2], input[1], input[3]];
78
79 // stage 2
80 let stg2: [i32; 4] = [
81 half_btf(w0:COSPI_INV[32], in0:stg1[0], w1:COSPI_INV[32], in1:stg1[1], INV_COS_BIT),
82 half_btf(w0:COSPI_INV[32], in0:stg1[0], -COSPI_INV[32], in1:stg1[1], INV_COS_BIT),
83 half_btf(w0:COSPI_INV[48], in0:stg1[2], -COSPI_INV[16], in1:stg1[3], INV_COS_BIT),
84 half_btf(w0:COSPI_INV[16], in0:stg1[2], w1:COSPI_INV[48], in1:stg1[3], INV_COS_BIT),
85 ];
86
87 // stage 3
88 output[0] = clamp_value(value:stg2[0] + stg2[3], bit:range);
89 output[1] = clamp_value(value:stg2[1] + stg2[2], bit:range);
90 output[2] = clamp_value(value:stg2[1] - stg2[2], bit:range);
91 output[3] = clamp_value(value:stg2[0] - stg2[3], bit:range);
92}
93
94pub fn av1_iflipadst4(input: &[i32], output: &mut [i32], range: usize) {
95 av1_iadst4(input, output, range);
96 output[..4].reverse();
97}
98
99/// # Panics
100///
101/// - If `input` or `output` have fewer than 4 items.
102#[inline(always)]
103pub fn av1_iadst4(input: &[i32], output: &mut [i32], _range: usize) {
104 assert!(input.len() >= 4);
105 assert!(output.len() >= 4);
106
107 let bit = 12;
108
109 let x0 = input[0];
110 let x1 = input[1];
111 let x2 = input[2];
112 let x3 = input[3];
113
114 // stage 1
115 let s0 = SINPI_INV[1] * x0;
116 let s1 = SINPI_INV[2] * x0;
117 let s2 = SINPI_INV[3] * x1;
118 let s3 = SINPI_INV[4] * x2;
119 let s4 = SINPI_INV[1] * x2;
120 let s5 = SINPI_INV[2] * x3;
121 let s6 = SINPI_INV[4] * x3;
122
123 // stage 2
124 let s7 = (x0 - x2) + x3;
125
126 // stage 3
127 let s0 = s0 + s3;
128 let s1 = s1 - s4;
129 let s3 = s2;
130 let s2 = SINPI_INV[3] * s7;
131
132 // stage 4
133 let s0 = s0 + s5;
134 let s1 = s1 - s6;
135
136 // stage 5
137 let x0 = s0 + s3;
138 let x1 = s1 + s3;
139 let x2 = s2;
140 let x3 = s0 + s1;
141
142 // stage 6
143 let x3 = x3 - s3;
144
145 output[0] = round_shift(x0, bit);
146 output[1] = round_shift(x1, bit);
147 output[2] = round_shift(x2, bit);
148 output[3] = round_shift(x3, bit);
149}
150
151pub fn av1_iidentity4(input: &[i32], output: &mut [i32], _range: usize) {
152 outputimpl Iterator[..4]
153 .iter_mut()
154 .zip(input[..4].iter())
155 .for_each(|(outp: &mut i32, inp: &i32)| *outp = round_shift(value:SQRT2 * *inp, bit:12));
156}
157
158/// # Panics
159///
160/// - If `input` or `output` have fewer than 8 items.
161pub fn av1_idct8(input: &[i32], output: &mut [i32], range: usize) {
162 assert!(input.len() >= 8);
163 assert!(output.len() >= 8);
164
165 // call idct4
166 let temp_in = [input[0], input[2], input[4], input[6]];
167 let mut temp_out: [i32; 4] = [0; 4];
168 av1_idct4(&temp_in, &mut temp_out, range);
169
170 // stage 0
171
172 // stage 1
173 let stg1 = [input[1], input[5], input[3], input[7]];
174
175 // stage 2
176 let stg2 = [
177 half_btf(COSPI_INV[56], stg1[0], -COSPI_INV[8], stg1[3], INV_COS_BIT),
178 half_btf(COSPI_INV[24], stg1[1], -COSPI_INV[40], stg1[2], INV_COS_BIT),
179 half_btf(COSPI_INV[40], stg1[1], COSPI_INV[24], stg1[2], INV_COS_BIT),
180 half_btf(COSPI_INV[8], stg1[0], COSPI_INV[56], stg1[3], INV_COS_BIT),
181 ];
182
183 // stage 3
184 let stg3 = [
185 clamp_value(stg2[0] + stg2[1], range),
186 clamp_value(stg2[0] - stg2[1], range),
187 clamp_value(-stg2[2] + stg2[3], range),
188 clamp_value(stg2[2] + stg2[3], range),
189 ];
190
191 // stage 4
192 let stg4 = [
193 stg3[0],
194 half_btf(-COSPI_INV[32], stg3[1], COSPI_INV[32], stg3[2], INV_COS_BIT),
195 half_btf(COSPI_INV[32], stg3[1], COSPI_INV[32], stg3[2], INV_COS_BIT),
196 stg3[3],
197 ];
198
199 // stage 5
200 output[0] = clamp_value(temp_out[0] + stg4[3], range);
201 output[1] = clamp_value(temp_out[1] + stg4[2], range);
202 output[2] = clamp_value(temp_out[2] + stg4[1], range);
203 output[3] = clamp_value(temp_out[3] + stg4[0], range);
204 output[4] = clamp_value(temp_out[3] - stg4[0], range);
205 output[5] = clamp_value(temp_out[2] - stg4[1], range);
206 output[6] = clamp_value(temp_out[1] - stg4[2], range);
207 output[7] = clamp_value(temp_out[0] - stg4[3], range);
208}
209
210pub fn av1_iflipadst8(input: &[i32], output: &mut [i32], range: usize) {
211 av1_iadst8(input, output, range);
212 output[..8].reverse();
213}
214
215/// # Panics
216///
217/// - If `input` or `output` have fewer than 8 items.
218#[inline(always)]
219pub fn av1_iadst8(input: &[i32], output: &mut [i32], range: usize) {
220 assert!(input.len() >= 8);
221 assert!(output.len() >= 8);
222
223 // stage 1
224 let stg1 = [
225 input[7], input[0], input[5], input[2], input[3], input[4], input[1],
226 input[6],
227 ];
228
229 // stage 2
230 let stg2 = [
231 half_btf(COSPI_INV[4], stg1[0], COSPI_INV[60], stg1[1], INV_COS_BIT),
232 half_btf(COSPI_INV[60], stg1[0], -COSPI_INV[4], stg1[1], INV_COS_BIT),
233 half_btf(COSPI_INV[20], stg1[2], COSPI_INV[44], stg1[3], INV_COS_BIT),
234 half_btf(COSPI_INV[44], stg1[2], -COSPI_INV[20], stg1[3], INV_COS_BIT),
235 half_btf(COSPI_INV[36], stg1[4], COSPI_INV[28], stg1[5], INV_COS_BIT),
236 half_btf(COSPI_INV[28], stg1[4], -COSPI_INV[36], stg1[5], INV_COS_BIT),
237 half_btf(COSPI_INV[52], stg1[6], COSPI_INV[12], stg1[7], INV_COS_BIT),
238 half_btf(COSPI_INV[12], stg1[6], -COSPI_INV[52], stg1[7], INV_COS_BIT),
239 ];
240
241 // stage 3
242 let stg3 = [
243 clamp_value(stg2[0] + stg2[4], range),
244 clamp_value(stg2[1] + stg2[5], range),
245 clamp_value(stg2[2] + stg2[6], range),
246 clamp_value(stg2[3] + stg2[7], range),
247 clamp_value(stg2[0] - stg2[4], range),
248 clamp_value(stg2[1] - stg2[5], range),
249 clamp_value(stg2[2] - stg2[6], range),
250 clamp_value(stg2[3] - stg2[7], range),
251 ];
252
253 // stage 4
254 let stg4 = [
255 stg3[0],
256 stg3[1],
257 stg3[2],
258 stg3[3],
259 half_btf(COSPI_INV[16], stg3[4], COSPI_INV[48], stg3[5], INV_COS_BIT),
260 half_btf(COSPI_INV[48], stg3[4], -COSPI_INV[16], stg3[5], INV_COS_BIT),
261 half_btf(-COSPI_INV[48], stg3[6], COSPI_INV[16], stg3[7], INV_COS_BIT),
262 half_btf(COSPI_INV[16], stg3[6], COSPI_INV[48], stg3[7], INV_COS_BIT),
263 ];
264
265 // stage 5
266 let stg5 = [
267 clamp_value(stg4[0] + stg4[2], range),
268 clamp_value(stg4[1] + stg4[3], range),
269 clamp_value(stg4[0] - stg4[2], range),
270 clamp_value(stg4[1] - stg4[3], range),
271 clamp_value(stg4[4] + stg4[6], range),
272 clamp_value(stg4[5] + stg4[7], range),
273 clamp_value(stg4[4] - stg4[6], range),
274 clamp_value(stg4[5] - stg4[7], range),
275 ];
276
277 // stage 6
278 let stg6 = [
279 stg5[0],
280 stg5[1],
281 half_btf(COSPI_INV[32], stg5[2], COSPI_INV[32], stg5[3], INV_COS_BIT),
282 half_btf(COSPI_INV[32], stg5[2], -COSPI_INV[32], stg5[3], INV_COS_BIT),
283 stg5[4],
284 stg5[5],
285 half_btf(COSPI_INV[32], stg5[6], COSPI_INV[32], stg5[7], INV_COS_BIT),
286 half_btf(COSPI_INV[32], stg5[6], -COSPI_INV[32], stg5[7], INV_COS_BIT),
287 ];
288
289 // stage 7
290 output[0] = stg6[0];
291 output[1] = -stg6[4];
292 output[2] = stg6[6];
293 output[3] = -stg6[2];
294 output[4] = stg6[3];
295 output[5] = -stg6[7];
296 output[6] = stg6[5];
297 output[7] = -stg6[1];
298}
299
/// Inverse 8-point identity transform: each of the first eight outputs is
/// twice the matching input. Items past index 7 are left untouched and
/// `_range` is ignored.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 8 items (slice indexing).
pub fn av1_iidentity8(input: &[i32], output: &mut [i32], _range: usize) {
  output[..8]
    .iter_mut()
    .zip(input[..8].iter())
    .for_each(|(outp, inp)| *outp = 2 * *inp);
}
306
307fn av1_idct16(input: &[i32], output: &mut [i32], range: usize) {
308 assert!(input.len() >= 16);
309 assert!(output.len() >= 16);
310
311 // call idct8
312 let temp_in = [
313 input[0], input[2], input[4], input[6], input[8], input[10], input[12],
314 input[14],
315 ];
316 let mut temp_out: [i32; 8] = [0; 8];
317 av1_idct8(&temp_in, &mut temp_out, range);
318
319 // stage 1
320 let stg1 = [
321 input[1], input[9], input[5], input[13], input[3], input[11], input[7],
322 input[15],
323 ];
324
325 // stage 2
326 let stg2 = [
327 half_btf(COSPI_INV[60], stg1[0], -COSPI_INV[4], stg1[7], INV_COS_BIT),
328 half_btf(COSPI_INV[28], stg1[1], -COSPI_INV[36], stg1[6], INV_COS_BIT),
329 half_btf(COSPI_INV[44], stg1[2], -COSPI_INV[20], stg1[5], INV_COS_BIT),
330 half_btf(COSPI_INV[12], stg1[3], -COSPI_INV[52], stg1[4], INV_COS_BIT),
331 half_btf(COSPI_INV[52], stg1[3], COSPI_INV[12], stg1[4], INV_COS_BIT),
332 half_btf(COSPI_INV[20], stg1[2], COSPI_INV[44], stg1[5], INV_COS_BIT),
333 half_btf(COSPI_INV[36], stg1[1], COSPI_INV[28], stg1[6], INV_COS_BIT),
334 half_btf(COSPI_INV[4], stg1[0], COSPI_INV[60], stg1[7], INV_COS_BIT),
335 ];
336
337 // stage 3
338 let stg3 = [
339 clamp_value(stg2[0] + stg2[1], range),
340 clamp_value(stg2[0] - stg2[1], range),
341 clamp_value(-stg2[2] + stg2[3], range),
342 clamp_value(stg2[2] + stg2[3], range),
343 clamp_value(stg2[4] + stg2[5], range),
344 clamp_value(stg2[4] - stg2[5], range),
345 clamp_value(-stg2[6] + stg2[7], range),
346 clamp_value(stg2[6] + stg2[7], range),
347 ];
348
349 // stage 4
350 let stg4 = [
351 stg3[0],
352 half_btf(-COSPI_INV[16], stg3[1], COSPI_INV[48], stg3[6], INV_COS_BIT),
353 half_btf(-COSPI_INV[48], stg3[2], -COSPI_INV[16], stg3[5], INV_COS_BIT),
354 stg3[3],
355 stg3[4],
356 half_btf(-COSPI_INV[16], stg3[2], COSPI_INV[48], stg3[5], INV_COS_BIT),
357 half_btf(COSPI_INV[48], stg3[1], COSPI_INV[16], stg3[6], INV_COS_BIT),
358 stg3[7],
359 ];
360
361 // stage 5
362 let stg5 = [
363 clamp_value(stg4[0] + stg4[3], range),
364 clamp_value(stg4[1] + stg4[2], range),
365 clamp_value(stg4[1] - stg4[2], range),
366 clamp_value(stg4[0] - stg4[3], range),
367 clamp_value(-stg4[4] + stg4[7], range),
368 clamp_value(-stg4[5] + stg4[6], range),
369 clamp_value(stg4[5] + stg4[6], range),
370 clamp_value(stg4[4] + stg4[7], range),
371 ];
372
373 // stage 6
374 let stg6 = [
375 stg5[0],
376 stg5[1],
377 half_btf(-COSPI_INV[32], stg5[2], COSPI_INV[32], stg5[5], INV_COS_BIT),
378 half_btf(-COSPI_INV[32], stg5[3], COSPI_INV[32], stg5[4], INV_COS_BIT),
379 half_btf(COSPI_INV[32], stg5[3], COSPI_INV[32], stg5[4], INV_COS_BIT),
380 half_btf(COSPI_INV[32], stg5[2], COSPI_INV[32], stg5[5], INV_COS_BIT),
381 stg5[6],
382 stg5[7],
383 ];
384
385 // stage 7
386 output[0] = clamp_value(temp_out[0] + stg6[7], range);
387 output[1] = clamp_value(temp_out[1] + stg6[6], range);
388 output[2] = clamp_value(temp_out[2] + stg6[5], range);
389 output[3] = clamp_value(temp_out[3] + stg6[4], range);
390 output[4] = clamp_value(temp_out[4] + stg6[3], range);
391 output[5] = clamp_value(temp_out[5] + stg6[2], range);
392 output[6] = clamp_value(temp_out[6] + stg6[1], range);
393 output[7] = clamp_value(temp_out[7] + stg6[0], range);
394 output[8] = clamp_value(temp_out[7] - stg6[0], range);
395 output[9] = clamp_value(temp_out[6] - stg6[1], range);
396 output[10] = clamp_value(temp_out[5] - stg6[2], range);
397 output[11] = clamp_value(temp_out[4] - stg6[3], range);
398 output[12] = clamp_value(temp_out[3] - stg6[4], range);
399 output[13] = clamp_value(temp_out[2] - stg6[5], range);
400 output[14] = clamp_value(temp_out[1] - stg6[6], range);
401 output[15] = clamp_value(temp_out[0] - stg6[7], range);
402}
403
404pub fn av1_iflipadst16(input: &[i32], output: &mut [i32], range: usize) {
405 av1_iadst16(input, output, range);
406 output[..16].reverse();
407}
408
409#[inline(always)]
410fn av1_iadst16(input: &[i32], output: &mut [i32], range: usize) {
411 assert!(input.len() >= 16);
412 assert!(output.len() >= 16);
413
414 // stage 1
415 let stg1 = [
416 input[15], input[0], input[13], input[2], input[11], input[4], input[9],
417 input[6], input[7], input[8], input[5], input[10], input[3], input[12],
418 input[1], input[14],
419 ];
420
421 // stage 2
422 let stg2 = [
423 half_btf(COSPI_INV[2], stg1[0], COSPI_INV[62], stg1[1], INV_COS_BIT),
424 half_btf(COSPI_INV[62], stg1[0], -COSPI_INV[2], stg1[1], INV_COS_BIT),
425 half_btf(COSPI_INV[10], stg1[2], COSPI_INV[54], stg1[3], INV_COS_BIT),
426 half_btf(COSPI_INV[54], stg1[2], -COSPI_INV[10], stg1[3], INV_COS_BIT),
427 half_btf(COSPI_INV[18], stg1[4], COSPI_INV[46], stg1[5], INV_COS_BIT),
428 half_btf(COSPI_INV[46], stg1[4], -COSPI_INV[18], stg1[5], INV_COS_BIT),
429 half_btf(COSPI_INV[26], stg1[6], COSPI_INV[38], stg1[7], INV_COS_BIT),
430 half_btf(COSPI_INV[38], stg1[6], -COSPI_INV[26], stg1[7], INV_COS_BIT),
431 half_btf(COSPI_INV[34], stg1[8], COSPI_INV[30], stg1[9], INV_COS_BIT),
432 half_btf(COSPI_INV[30], stg1[8], -COSPI_INV[34], stg1[9], INV_COS_BIT),
433 half_btf(COSPI_INV[42], stg1[10], COSPI_INV[22], stg1[11], INV_COS_BIT),
434 half_btf(COSPI_INV[22], stg1[10], -COSPI_INV[42], stg1[11], INV_COS_BIT),
435 half_btf(COSPI_INV[50], stg1[12], COSPI_INV[14], stg1[13], INV_COS_BIT),
436 half_btf(COSPI_INV[14], stg1[12], -COSPI_INV[50], stg1[13], INV_COS_BIT),
437 half_btf(COSPI_INV[58], stg1[14], COSPI_INV[6], stg1[15], INV_COS_BIT),
438 half_btf(COSPI_INV[6], stg1[14], -COSPI_INV[58], stg1[15], INV_COS_BIT),
439 ];
440
441 // stage 3
442 let stg3 = [
443 clamp_value(stg2[0] + stg2[8], range),
444 clamp_value(stg2[1] + stg2[9], range),
445 clamp_value(stg2[2] + stg2[10], range),
446 clamp_value(stg2[3] + stg2[11], range),
447 clamp_value(stg2[4] + stg2[12], range),
448 clamp_value(stg2[5] + stg2[13], range),
449 clamp_value(stg2[6] + stg2[14], range),
450 clamp_value(stg2[7] + stg2[15], range),
451 clamp_value(stg2[0] - stg2[8], range),
452 clamp_value(stg2[1] - stg2[9], range),
453 clamp_value(stg2[2] - stg2[10], range),
454 clamp_value(stg2[3] - stg2[11], range),
455 clamp_value(stg2[4] - stg2[12], range),
456 clamp_value(stg2[5] - stg2[13], range),
457 clamp_value(stg2[6] - stg2[14], range),
458 clamp_value(stg2[7] - stg2[15], range),
459 ];
460
461 // stage 4
462 let stg4 = [
463 stg3[0],
464 stg3[1],
465 stg3[2],
466 stg3[3],
467 stg3[4],
468 stg3[5],
469 stg3[6],
470 stg3[7],
471 half_btf(COSPI_INV[8], stg3[8], COSPI_INV[56], stg3[9], INV_COS_BIT),
472 half_btf(COSPI_INV[56], stg3[8], -COSPI_INV[8], stg3[9], INV_COS_BIT),
473 half_btf(COSPI_INV[40], stg3[10], COSPI_INV[24], stg3[11], INV_COS_BIT),
474 half_btf(COSPI_INV[24], stg3[10], -COSPI_INV[40], stg3[11], INV_COS_BIT),
475 half_btf(-COSPI_INV[56], stg3[12], COSPI_INV[8], stg3[13], INV_COS_BIT),
476 half_btf(COSPI_INV[8], stg3[12], COSPI_INV[56], stg3[13], INV_COS_BIT),
477 half_btf(-COSPI_INV[24], stg3[14], COSPI_INV[40], stg3[15], INV_COS_BIT),
478 half_btf(COSPI_INV[40], stg3[14], COSPI_INV[24], stg3[15], INV_COS_BIT),
479 ];
480
481 // stage 5
482 let stg5 = [
483 clamp_value(stg4[0] + stg4[4], range),
484 clamp_value(stg4[1] + stg4[5], range),
485 clamp_value(stg4[2] + stg4[6], range),
486 clamp_value(stg4[3] + stg4[7], range),
487 clamp_value(stg4[0] - stg4[4], range),
488 clamp_value(stg4[1] - stg4[5], range),
489 clamp_value(stg4[2] - stg4[6], range),
490 clamp_value(stg4[3] - stg4[7], range),
491 clamp_value(stg4[8] + stg4[12], range),
492 clamp_value(stg4[9] + stg4[13], range),
493 clamp_value(stg4[10] + stg4[14], range),
494 clamp_value(stg4[11] + stg4[15], range),
495 clamp_value(stg4[8] - stg4[12], range),
496 clamp_value(stg4[9] - stg4[13], range),
497 clamp_value(stg4[10] - stg4[14], range),
498 clamp_value(stg4[11] - stg4[15], range),
499 ];
500
501 // stage 6
502 let stg6 = [
503 stg5[0],
504 stg5[1],
505 stg5[2],
506 stg5[3],
507 half_btf(COSPI_INV[16], stg5[4], COSPI_INV[48], stg5[5], INV_COS_BIT),
508 half_btf(COSPI_INV[48], stg5[4], -COSPI_INV[16], stg5[5], INV_COS_BIT),
509 half_btf(-COSPI_INV[48], stg5[6], COSPI_INV[16], stg5[7], INV_COS_BIT),
510 half_btf(COSPI_INV[16], stg5[6], COSPI_INV[48], stg5[7], INV_COS_BIT),
511 stg5[8],
512 stg5[9],
513 stg5[10],
514 stg5[11],
515 half_btf(COSPI_INV[16], stg5[12], COSPI_INV[48], stg5[13], INV_COS_BIT),
516 half_btf(COSPI_INV[48], stg5[12], -COSPI_INV[16], stg5[13], INV_COS_BIT),
517 half_btf(-COSPI_INV[48], stg5[14], COSPI_INV[16], stg5[15], INV_COS_BIT),
518 half_btf(COSPI_INV[16], stg5[14], COSPI_INV[48], stg5[15], INV_COS_BIT),
519 ];
520
521 // stage 7
522 let stg7 = [
523 clamp_value(stg6[0] + stg6[2], range),
524 clamp_value(stg6[1] + stg6[3], range),
525 clamp_value(stg6[0] - stg6[2], range),
526 clamp_value(stg6[1] - stg6[3], range),
527 clamp_value(stg6[4] + stg6[6], range),
528 clamp_value(stg6[5] + stg6[7], range),
529 clamp_value(stg6[4] - stg6[6], range),
530 clamp_value(stg6[5] - stg6[7], range),
531 clamp_value(stg6[8] + stg6[10], range),
532 clamp_value(stg6[9] + stg6[11], range),
533 clamp_value(stg6[8] - stg6[10], range),
534 clamp_value(stg6[9] - stg6[11], range),
535 clamp_value(stg6[12] + stg6[14], range),
536 clamp_value(stg6[13] + stg6[15], range),
537 clamp_value(stg6[12] - stg6[14], range),
538 clamp_value(stg6[13] - stg6[15], range),
539 ];
540
541 // stage 8
542 let stg8 = [
543 stg7[0],
544 stg7[1],
545 half_btf(COSPI_INV[32], stg7[2], COSPI_INV[32], stg7[3], INV_COS_BIT),
546 half_btf(COSPI_INV[32], stg7[2], -COSPI_INV[32], stg7[3], INV_COS_BIT),
547 stg7[4],
548 stg7[5],
549 half_btf(COSPI_INV[32], stg7[6], COSPI_INV[32], stg7[7], INV_COS_BIT),
550 half_btf(COSPI_INV[32], stg7[6], -COSPI_INV[32], stg7[7], INV_COS_BIT),
551 stg7[8],
552 stg7[9],
553 half_btf(COSPI_INV[32], stg7[10], COSPI_INV[32], stg7[11], INV_COS_BIT),
554 half_btf(COSPI_INV[32], stg7[10], -COSPI_INV[32], stg7[11], INV_COS_BIT),
555 stg7[12],
556 stg7[13],
557 half_btf(COSPI_INV[32], stg7[14], COSPI_INV[32], stg7[15], INV_COS_BIT),
558 half_btf(COSPI_INV[32], stg7[14], -COSPI_INV[32], stg7[15], INV_COS_BIT),
559 ];
560
561 // stage 9
562 output[0] = stg8[0];
563 output[1] = -stg8[8];
564 output[2] = stg8[12];
565 output[3] = -stg8[4];
566 output[4] = stg8[6];
567 output[5] = -stg8[14];
568 output[6] = stg8[10];
569 output[7] = -stg8[2];
570 output[8] = stg8[3];
571 output[9] = -stg8[11];
572 output[10] = stg8[15];
573 output[11] = -stg8[7];
574 output[12] = stg8[5];
575 output[13] = -stg8[13];
576 output[14] = stg8[9];
577 output[15] = -stg8[1];
578}
579
580fn av1_iidentity16(input: &[i32], output: &mut [i32], _range: usize) {
581 outputimpl Iterator[..16]
582 .iter_mut()
583 .zip(input[..16].iter())
584 .for_each(|(outp: &mut i32, inp: &i32)| *outp = round_shift(value:SQRT2 * 2 * *inp, bit:12));
585}
586
587fn av1_idct32(input: &[i32], output: &mut [i32], range: usize) {
588 assert!(input.len() >= 32);
589 assert!(output.len() >= 32);
590
591 // stage 1;
592 let stg1 = [
593 input[0], input[16], input[8], input[24], input[4], input[20], input[12],
594 input[28], input[2], input[18], input[10], input[26], input[6], input[22],
595 input[14], input[30], input[1], input[17], input[9], input[25], input[5],
596 input[21], input[13], input[29], input[3], input[19], input[11],
597 input[27], input[7], input[23], input[15], input[31],
598 ];
599
600 // stage 2
601 let stg2 = [
602 stg1[0],
603 stg1[1],
604 stg1[2],
605 stg1[3],
606 stg1[4],
607 stg1[5],
608 stg1[6],
609 stg1[7],
610 stg1[8],
611 stg1[9],
612 stg1[10],
613 stg1[11],
614 stg1[12],
615 stg1[13],
616 stg1[14],
617 stg1[15],
618 half_btf(COSPI_INV[62], stg1[16], -COSPI_INV[2], stg1[31], INV_COS_BIT),
619 half_btf(COSPI_INV[30], stg1[17], -COSPI_INV[34], stg1[30], INV_COS_BIT),
620 half_btf(COSPI_INV[46], stg1[18], -COSPI_INV[18], stg1[29], INV_COS_BIT),
621 half_btf(COSPI_INV[14], stg1[19], -COSPI_INV[50], stg1[28], INV_COS_BIT),
622 half_btf(COSPI_INV[54], stg1[20], -COSPI_INV[10], stg1[27], INV_COS_BIT),
623 half_btf(COSPI_INV[22], stg1[21], -COSPI_INV[42], stg1[26], INV_COS_BIT),
624 half_btf(COSPI_INV[38], stg1[22], -COSPI_INV[26], stg1[25], INV_COS_BIT),
625 half_btf(COSPI_INV[6], stg1[23], -COSPI_INV[58], stg1[24], INV_COS_BIT),
626 half_btf(COSPI_INV[58], stg1[23], COSPI_INV[6], stg1[24], INV_COS_BIT),
627 half_btf(COSPI_INV[26], stg1[22], COSPI_INV[38], stg1[25], INV_COS_BIT),
628 half_btf(COSPI_INV[42], stg1[21], COSPI_INV[22], stg1[26], INV_COS_BIT),
629 half_btf(COSPI_INV[10], stg1[20], COSPI_INV[54], stg1[27], INV_COS_BIT),
630 half_btf(COSPI_INV[50], stg1[19], COSPI_INV[14], stg1[28], INV_COS_BIT),
631 half_btf(COSPI_INV[18], stg1[18], COSPI_INV[46], stg1[29], INV_COS_BIT),
632 half_btf(COSPI_INV[34], stg1[17], COSPI_INV[30], stg1[30], INV_COS_BIT),
633 half_btf(COSPI_INV[2], stg1[16], COSPI_INV[62], stg1[31], INV_COS_BIT),
634 ];
635
636 // stage 3
637 let stg3 = [
638 stg2[0],
639 stg2[1],
640 stg2[2],
641 stg2[3],
642 stg2[4],
643 stg2[5],
644 stg2[6],
645 stg2[7],
646 half_btf(COSPI_INV[60], stg2[8], -COSPI_INV[4], stg2[15], INV_COS_BIT),
647 half_btf(COSPI_INV[28], stg2[9], -COSPI_INV[36], stg2[14], INV_COS_BIT),
648 half_btf(COSPI_INV[44], stg2[10], -COSPI_INV[20], stg2[13], INV_COS_BIT),
649 half_btf(COSPI_INV[12], stg2[11], -COSPI_INV[52], stg2[12], INV_COS_BIT),
650 half_btf(COSPI_INV[52], stg2[11], COSPI_INV[12], stg2[12], INV_COS_BIT),
651 half_btf(COSPI_INV[20], stg2[10], COSPI_INV[44], stg2[13], INV_COS_BIT),
652 half_btf(COSPI_INV[36], stg2[9], COSPI_INV[28], stg2[14], INV_COS_BIT),
653 half_btf(COSPI_INV[4], stg2[8], COSPI_INV[60], stg2[15], INV_COS_BIT),
654 clamp_value(stg2[16] + stg2[17], range),
655 clamp_value(stg2[16] - stg2[17], range),
656 clamp_value(-stg2[18] + stg2[19], range),
657 clamp_value(stg2[18] + stg2[19], range),
658 clamp_value(stg2[20] + stg2[21], range),
659 clamp_value(stg2[20] - stg2[21], range),
660 clamp_value(-stg2[22] + stg2[23], range),
661 clamp_value(stg2[22] + stg2[23], range),
662 clamp_value(stg2[24] + stg2[25], range),
663 clamp_value(stg2[24] - stg2[25], range),
664 clamp_value(-stg2[26] + stg2[27], range),
665 clamp_value(stg2[26] + stg2[27], range),
666 clamp_value(stg2[28] + stg2[29], range),
667 clamp_value(stg2[28] - stg2[29], range),
668 clamp_value(-stg2[30] + stg2[31], range),
669 clamp_value(stg2[30] + stg2[31], range),
670 ];
671
672 // stage 4
673 let stg4 = [
674 stg3[0],
675 stg3[1],
676 stg3[2],
677 stg3[3],
678 half_btf(COSPI_INV[56], stg3[4], -COSPI_INV[8], stg3[7], INV_COS_BIT),
679 half_btf(COSPI_INV[24], stg3[5], -COSPI_INV[40], stg3[6], INV_COS_BIT),
680 half_btf(COSPI_INV[40], stg3[5], COSPI_INV[24], stg3[6], INV_COS_BIT),
681 half_btf(COSPI_INV[8], stg3[4], COSPI_INV[56], stg3[7], INV_COS_BIT),
682 clamp_value(stg3[8] + stg3[9], range),
683 clamp_value(stg3[8] - stg3[9], range),
684 clamp_value(-stg3[10] + stg3[11], range),
685 clamp_value(stg3[10] + stg3[11], range),
686 clamp_value(stg3[12] + stg3[13], range),
687 clamp_value(stg3[12] - stg3[13], range),
688 clamp_value(-stg3[14] + stg3[15], range),
689 clamp_value(stg3[14] + stg3[15], range),
690 stg3[16],
691 half_btf(-COSPI_INV[8], stg3[17], COSPI_INV[56], stg3[30], INV_COS_BIT),
692 half_btf(-COSPI_INV[56], stg3[18], -COSPI_INV[8], stg3[29], INV_COS_BIT),
693 stg3[19],
694 stg3[20],
695 half_btf(-COSPI_INV[40], stg3[21], COSPI_INV[24], stg3[26], INV_COS_BIT),
696 half_btf(-COSPI_INV[24], stg3[22], -COSPI_INV[40], stg3[25], INV_COS_BIT),
697 stg3[23],
698 stg3[24],
699 half_btf(-COSPI_INV[40], stg3[22], COSPI_INV[24], stg3[25], INV_COS_BIT),
700 half_btf(COSPI_INV[24], stg3[21], COSPI_INV[40], stg3[26], INV_COS_BIT),
701 stg3[27],
702 stg3[28],
703 half_btf(-COSPI_INV[8], stg3[18], COSPI_INV[56], stg3[29], INV_COS_BIT),
704 half_btf(COSPI_INV[56], stg3[17], COSPI_INV[8], stg3[30], INV_COS_BIT),
705 stg3[31],
706 ];
707
708 // stage 5
709 let stg5 = [
710 half_btf(COSPI_INV[32], stg4[0], COSPI_INV[32], stg4[1], INV_COS_BIT),
711 half_btf(COSPI_INV[32], stg4[0], -COSPI_INV[32], stg4[1], INV_COS_BIT),
712 half_btf(COSPI_INV[48], stg4[2], -COSPI_INV[16], stg4[3], INV_COS_BIT),
713 half_btf(COSPI_INV[16], stg4[2], COSPI_INV[48], stg4[3], INV_COS_BIT),
714 clamp_value(stg4[4] + stg4[5], range),
715 clamp_value(stg4[4] - stg4[5], range),
716 clamp_value(-stg4[6] + stg4[7], range),
717 clamp_value(stg4[6] + stg4[7], range),
718 stg4[8],
719 half_btf(-COSPI_INV[16], stg4[9], COSPI_INV[48], stg4[14], INV_COS_BIT),
720 half_btf(-COSPI_INV[48], stg4[10], -COSPI_INV[16], stg4[13], INV_COS_BIT),
721 stg4[11],
722 stg4[12],
723 half_btf(-COSPI_INV[16], stg4[10], COSPI_INV[48], stg4[13], INV_COS_BIT),
724 half_btf(COSPI_INV[48], stg4[9], COSPI_INV[16], stg4[14], INV_COS_BIT),
725 stg4[15],
726 clamp_value(stg4[16] + stg4[19], range),
727 clamp_value(stg4[17] + stg4[18], range),
728 clamp_value(stg4[17] - stg4[18], range),
729 clamp_value(stg4[16] - stg4[19], range),
730 clamp_value(-stg4[20] + stg4[23], range),
731 clamp_value(-stg4[21] + stg4[22], range),
732 clamp_value(stg4[21] + stg4[22], range),
733 clamp_value(stg4[20] + stg4[23], range),
734 clamp_value(stg4[24] + stg4[27], range),
735 clamp_value(stg4[25] + stg4[26], range),
736 clamp_value(stg4[25] - stg4[26], range),
737 clamp_value(stg4[24] - stg4[27], range),
738 clamp_value(-stg4[28] + stg4[31], range),
739 clamp_value(-stg4[29] + stg4[30], range),
740 clamp_value(stg4[29] + stg4[30], range),
741 clamp_value(stg4[28] + stg4[31], range),
742 ];
743
744 // stage 6
745 let stg6 = [
746 clamp_value(stg5[0] + stg5[3], range),
747 clamp_value(stg5[1] + stg5[2], range),
748 clamp_value(stg5[1] - stg5[2], range),
749 clamp_value(stg5[0] - stg5[3], range),
750 stg5[4],
751 half_btf(-COSPI_INV[32], stg5[5], COSPI_INV[32], stg5[6], INV_COS_BIT),
752 half_btf(COSPI_INV[32], stg5[5], COSPI_INV[32], stg5[6], INV_COS_BIT),
753 stg5[7],
754 clamp_value(stg5[8] + stg5[11], range),
755 clamp_value(stg5[9] + stg5[10], range),
756 clamp_value(stg5[9] - stg5[10], range),
757 clamp_value(stg5[8] - stg5[11], range),
758 clamp_value(-stg5[12] + stg5[15], range),
759 clamp_value(-stg5[13] + stg5[14], range),
760 clamp_value(stg5[13] + stg5[14], range),
761 clamp_value(stg5[12] + stg5[15], range),
762 stg5[16],
763 stg5[17],
764 half_btf(-COSPI_INV[16], stg5[18], COSPI_INV[48], stg5[29], INV_COS_BIT),
765 half_btf(-COSPI_INV[16], stg5[19], COSPI_INV[48], stg5[28], INV_COS_BIT),
766 half_btf(-COSPI_INV[48], stg5[20], -COSPI_INV[16], stg5[27], INV_COS_BIT),
767 half_btf(-COSPI_INV[48], stg5[21], -COSPI_INV[16], stg5[26], INV_COS_BIT),
768 stg5[22],
769 stg5[23],
770 stg5[24],
771 stg5[25],
772 half_btf(-COSPI_INV[16], stg5[21], COSPI_INV[48], stg5[26], INV_COS_BIT),
773 half_btf(-COSPI_INV[16], stg5[20], COSPI_INV[48], stg5[27], INV_COS_BIT),
774 half_btf(COSPI_INV[48], stg5[19], COSPI_INV[16], stg5[28], INV_COS_BIT),
775 half_btf(COSPI_INV[48], stg5[18], COSPI_INV[16], stg5[29], INV_COS_BIT),
776 stg5[30],
777 stg5[31],
778 ];
779
780 // stage 7
781 let stg7 = [
782 clamp_value(stg6[0] + stg6[7], range),
783 clamp_value(stg6[1] + stg6[6], range),
784 clamp_value(stg6[2] + stg6[5], range),
785 clamp_value(stg6[3] + stg6[4], range),
786 clamp_value(stg6[3] - stg6[4], range),
787 clamp_value(stg6[2] - stg6[5], range),
788 clamp_value(stg6[1] - stg6[6], range),
789 clamp_value(stg6[0] - stg6[7], range),
790 stg6[8],
791 stg6[9],
792 half_btf(-COSPI_INV[32], stg6[10], COSPI_INV[32], stg6[13], INV_COS_BIT),
793 half_btf(-COSPI_INV[32], stg6[11], COSPI_INV[32], stg6[12], INV_COS_BIT),
794 half_btf(COSPI_INV[32], stg6[11], COSPI_INV[32], stg6[12], INV_COS_BIT),
795 half_btf(COSPI_INV[32], stg6[10], COSPI_INV[32], stg6[13], INV_COS_BIT),
796 stg6[14],
797 stg6[15],
798 clamp_value(stg6[16] + stg6[23], range),
799 clamp_value(stg6[17] + stg6[22], range),
800 clamp_value(stg6[18] + stg6[21], range),
801 clamp_value(stg6[19] + stg6[20], range),
802 clamp_value(stg6[19] - stg6[20], range),
803 clamp_value(stg6[18] - stg6[21], range),
804 clamp_value(stg6[17] - stg6[22], range),
805 clamp_value(stg6[16] - stg6[23], range),
806 clamp_value(-stg6[24] + stg6[31], range),
807 clamp_value(-stg6[25] + stg6[30], range),
808 clamp_value(-stg6[26] + stg6[29], range),
809 clamp_value(-stg6[27] + stg6[28], range),
810 clamp_value(stg6[27] + stg6[28], range),
811 clamp_value(stg6[26] + stg6[29], range),
812 clamp_value(stg6[25] + stg6[30], range),
813 clamp_value(stg6[24] + stg6[31], range),
814 ];
815
816 // stage 8
817 let stg8 = [
818 clamp_value(stg7[0] + stg7[15], range),
819 clamp_value(stg7[1] + stg7[14], range),
820 clamp_value(stg7[2] + stg7[13], range),
821 clamp_value(stg7[3] + stg7[12], range),
822 clamp_value(stg7[4] + stg7[11], range),
823 clamp_value(stg7[5] + stg7[10], range),
824 clamp_value(stg7[6] + stg7[9], range),
825 clamp_value(stg7[7] + stg7[8], range),
826 clamp_value(stg7[7] - stg7[8], range),
827 clamp_value(stg7[6] - stg7[9], range),
828 clamp_value(stg7[5] - stg7[10], range),
829 clamp_value(stg7[4] - stg7[11], range),
830 clamp_value(stg7[3] - stg7[12], range),
831 clamp_value(stg7[2] - stg7[13], range),
832 clamp_value(stg7[1] - stg7[14], range),
833 clamp_value(stg7[0] - stg7[15], range),
834 stg7[16],
835 stg7[17],
836 stg7[18],
837 stg7[19],
838 half_btf(-COSPI_INV[32], stg7[20], COSPI_INV[32], stg7[27], INV_COS_BIT),
839 half_btf(-COSPI_INV[32], stg7[21], COSPI_INV[32], stg7[26], INV_COS_BIT),
840 half_btf(-COSPI_INV[32], stg7[22], COSPI_INV[32], stg7[25], INV_COS_BIT),
841 half_btf(-COSPI_INV[32], stg7[23], COSPI_INV[32], stg7[24], INV_COS_BIT),
842 half_btf(COSPI_INV[32], stg7[23], COSPI_INV[32], stg7[24], INV_COS_BIT),
843 half_btf(COSPI_INV[32], stg7[22], COSPI_INV[32], stg7[25], INV_COS_BIT),
844 half_btf(COSPI_INV[32], stg7[21], COSPI_INV[32], stg7[26], INV_COS_BIT),
845 half_btf(COSPI_INV[32], stg7[20], COSPI_INV[32], stg7[27], INV_COS_BIT),
846 stg7[28],
847 stg7[29],
848 stg7[30],
849 stg7[31],
850 ];
851
852 // stage 9
853 output[0] = clamp_value(stg8[0] + stg8[31], range);
854 output[1] = clamp_value(stg8[1] + stg8[30], range);
855 output[2] = clamp_value(stg8[2] + stg8[29], range);
856 output[3] = clamp_value(stg8[3] + stg8[28], range);
857 output[4] = clamp_value(stg8[4] + stg8[27], range);
858 output[5] = clamp_value(stg8[5] + stg8[26], range);
859 output[6] = clamp_value(stg8[6] + stg8[25], range);
860 output[7] = clamp_value(stg8[7] + stg8[24], range);
861 output[8] = clamp_value(stg8[8] + stg8[23], range);
862 output[9] = clamp_value(stg8[9] + stg8[22], range);
863 output[10] = clamp_value(stg8[10] + stg8[21], range);
864 output[11] = clamp_value(stg8[11] + stg8[20], range);
865 output[12] = clamp_value(stg8[12] + stg8[19], range);
866 output[13] = clamp_value(stg8[13] + stg8[18], range);
867 output[14] = clamp_value(stg8[14] + stg8[17], range);
868 output[15] = clamp_value(stg8[15] + stg8[16], range);
869 output[16] = clamp_value(stg8[15] - stg8[16], range);
870 output[17] = clamp_value(stg8[14] - stg8[17], range);
871 output[18] = clamp_value(stg8[13] - stg8[18], range);
872 output[19] = clamp_value(stg8[12] - stg8[19], range);
873 output[20] = clamp_value(stg8[11] - stg8[20], range);
874 output[21] = clamp_value(stg8[10] - stg8[21], range);
875 output[22] = clamp_value(stg8[9] - stg8[22], range);
876 output[23] = clamp_value(stg8[8] - stg8[23], range);
877 output[24] = clamp_value(stg8[7] - stg8[24], range);
878 output[25] = clamp_value(stg8[6] - stg8[25], range);
879 output[26] = clamp_value(stg8[5] - stg8[26], range);
880 output[27] = clamp_value(stg8[4] - stg8[27], range);
881 output[28] = clamp_value(stg8[3] - stg8[28], range);
882 output[29] = clamp_value(stg8[2] - stg8[29], range);
883 output[30] = clamp_value(stg8[1] - stg8[30], range);
884 output[31] = clamp_value(stg8[0] - stg8[31], range);
885}
886
/// 32-point inverse identity transform: writes `4 * input[i]` into
/// `output[i]` for each of the first 32 items. Items past index 31 in
/// either slice are left untouched.
///
/// `_range` is unused; the parameter exists so the function has the same
/// signature as the other 1-D inverse transform kernels.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 32 items.
fn av1_iidentity32(input: &[i32], output: &mut [i32], _range: usize) {
  output[..32]
    .iter_mut()
    .zip(input[..32].iter())
    .for_each(|(outp, inp)| *outp = 4 * *inp);
}
893
/// 64-point 1-D inverse DCT kernel, written as eleven fully unrolled
/// butterfly stages.
///
/// Reads the first 64 items of `input` and writes the first 64 items of
/// `output`. Each stage builds a fresh 64-element array from the previous
/// one using three kinds of entries:
///
/// - pass-through copies,
/// - `half_btf` combinations of two mirrored positions weighted by
///   `COSPI_INV` entries (always with `INV_COS_BIT`),
/// - sums/differences of two positions passed through `clamp_value` with
///   `range`.
///
/// # Panics
///
/// - If `input` or `output` have fewer than 64 items.
fn av1_idct64(input: &[i32], output: &mut [i32], range: usize) {
  assert!(input.len() >= 64);
  assert!(output.len() >= 64);

  // stage 1: reorder the input so that position i holds input[r], where r
  // is the 6-bit bit-reversal of i (1 -> 32, 2 -> 16, 3 -> 48, ...).
  let stg1 = [
    input[0], input[32], input[16], input[48], input[8], input[40], input[24],
    input[56], input[4], input[36], input[20], input[52], input[12],
    input[44], input[28], input[60], input[2], input[34], input[18],
    input[50], input[10], input[42], input[26], input[58], input[6],
    input[38], input[22], input[54], input[14], input[46], input[30],
    input[62], input[1], input[33], input[17], input[49], input[9], input[41],
    input[25], input[57], input[5], input[37], input[21], input[53],
    input[13], input[45], input[29], input[61], input[3], input[35],
    input[19], input[51], input[11], input[43], input[27], input[59],
    input[7], input[39], input[23], input[55], input[15], input[47],
    input[31], input[63],
  ];

  // stage 2: 0..=31 pass through; 32..=63 combine the mirrored pairs
  // (32 + k, 63 - k) with half_btf and odd COSPI_INV indices.
  let stg2 = [
    stg1[0],
    stg1[1],
    stg1[2],
    stg1[3],
    stg1[4],
    stg1[5],
    stg1[6],
    stg1[7],
    stg1[8],
    stg1[9],
    stg1[10],
    stg1[11],
    stg1[12],
    stg1[13],
    stg1[14],
    stg1[15],
    stg1[16],
    stg1[17],
    stg1[18],
    stg1[19],
    stg1[20],
    stg1[21],
    stg1[22],
    stg1[23],
    stg1[24],
    stg1[25],
    stg1[26],
    stg1[27],
    stg1[28],
    stg1[29],
    stg1[30],
    stg1[31],
    half_btf(COSPI_INV[63], stg1[32], -COSPI_INV[1], stg1[63], INV_COS_BIT),
    half_btf(COSPI_INV[31], stg1[33], -COSPI_INV[33], stg1[62], INV_COS_BIT),
    half_btf(COSPI_INV[47], stg1[34], -COSPI_INV[17], stg1[61], INV_COS_BIT),
    half_btf(COSPI_INV[15], stg1[35], -COSPI_INV[49], stg1[60], INV_COS_BIT),
    half_btf(COSPI_INV[55], stg1[36], -COSPI_INV[9], stg1[59], INV_COS_BIT),
    half_btf(COSPI_INV[23], stg1[37], -COSPI_INV[41], stg1[58], INV_COS_BIT),
    half_btf(COSPI_INV[39], stg1[38], -COSPI_INV[25], stg1[57], INV_COS_BIT),
    half_btf(COSPI_INV[7], stg1[39], -COSPI_INV[57], stg1[56], INV_COS_BIT),
    half_btf(COSPI_INV[59], stg1[40], -COSPI_INV[5], stg1[55], INV_COS_BIT),
    half_btf(COSPI_INV[27], stg1[41], -COSPI_INV[37], stg1[54], INV_COS_BIT),
    half_btf(COSPI_INV[43], stg1[42], -COSPI_INV[21], stg1[53], INV_COS_BIT),
    half_btf(COSPI_INV[11], stg1[43], -COSPI_INV[53], stg1[52], INV_COS_BIT),
    half_btf(COSPI_INV[51], stg1[44], -COSPI_INV[13], stg1[51], INV_COS_BIT),
    half_btf(COSPI_INV[19], stg1[45], -COSPI_INV[45], stg1[50], INV_COS_BIT),
    half_btf(COSPI_INV[35], stg1[46], -COSPI_INV[29], stg1[49], INV_COS_BIT),
    half_btf(COSPI_INV[3], stg1[47], -COSPI_INV[61], stg1[48], INV_COS_BIT),
    half_btf(COSPI_INV[61], stg1[47], COSPI_INV[3], stg1[48], INV_COS_BIT),
    half_btf(COSPI_INV[29], stg1[46], COSPI_INV[35], stg1[49], INV_COS_BIT),
    half_btf(COSPI_INV[45], stg1[45], COSPI_INV[19], stg1[50], INV_COS_BIT),
    half_btf(COSPI_INV[13], stg1[44], COSPI_INV[51], stg1[51], INV_COS_BIT),
    half_btf(COSPI_INV[53], stg1[43], COSPI_INV[11], stg1[52], INV_COS_BIT),
    half_btf(COSPI_INV[21], stg1[42], COSPI_INV[43], stg1[53], INV_COS_BIT),
    half_btf(COSPI_INV[37], stg1[41], COSPI_INV[27], stg1[54], INV_COS_BIT),
    half_btf(COSPI_INV[5], stg1[40], COSPI_INV[59], stg1[55], INV_COS_BIT),
    half_btf(COSPI_INV[57], stg1[39], COSPI_INV[7], stg1[56], INV_COS_BIT),
    half_btf(COSPI_INV[25], stg1[38], COSPI_INV[39], stg1[57], INV_COS_BIT),
    half_btf(COSPI_INV[41], stg1[37], COSPI_INV[23], stg1[58], INV_COS_BIT),
    half_btf(COSPI_INV[9], stg1[36], COSPI_INV[55], stg1[59], INV_COS_BIT),
    half_btf(COSPI_INV[49], stg1[35], COSPI_INV[15], stg1[60], INV_COS_BIT),
    half_btf(COSPI_INV[17], stg1[34], COSPI_INV[47], stg1[61], INV_COS_BIT),
    half_btf(COSPI_INV[33], stg1[33], COSPI_INV[31], stg1[62], INV_COS_BIT),
    half_btf(COSPI_INV[1], stg1[32], COSPI_INV[63], stg1[63], INV_COS_BIT),
  ];

  // stage 3: 0..=15 pass through; 16..=31 combine the mirrored pairs
  // (16 + k, 31 - k) with half_btf; 32..=63 take clamped sums/differences
  // of adjacent pairs.
  let stg3 = [
    stg2[0],
    stg2[1],
    stg2[2],
    stg2[3],
    stg2[4],
    stg2[5],
    stg2[6],
    stg2[7],
    stg2[8],
    stg2[9],
    stg2[10],
    stg2[11],
    stg2[12],
    stg2[13],
    stg2[14],
    stg2[15],
    half_btf(COSPI_INV[62], stg2[16], -COSPI_INV[2], stg2[31], INV_COS_BIT),
    half_btf(COSPI_INV[30], stg2[17], -COSPI_INV[34], stg2[30], INV_COS_BIT),
    half_btf(COSPI_INV[46], stg2[18], -COSPI_INV[18], stg2[29], INV_COS_BIT),
    half_btf(COSPI_INV[14], stg2[19], -COSPI_INV[50], stg2[28], INV_COS_BIT),
    half_btf(COSPI_INV[54], stg2[20], -COSPI_INV[10], stg2[27], INV_COS_BIT),
    half_btf(COSPI_INV[22], stg2[21], -COSPI_INV[42], stg2[26], INV_COS_BIT),
    half_btf(COSPI_INV[38], stg2[22], -COSPI_INV[26], stg2[25], INV_COS_BIT),
    half_btf(COSPI_INV[6], stg2[23], -COSPI_INV[58], stg2[24], INV_COS_BIT),
    half_btf(COSPI_INV[58], stg2[23], COSPI_INV[6], stg2[24], INV_COS_BIT),
    half_btf(COSPI_INV[26], stg2[22], COSPI_INV[38], stg2[25], INV_COS_BIT),
    half_btf(COSPI_INV[42], stg2[21], COSPI_INV[22], stg2[26], INV_COS_BIT),
    half_btf(COSPI_INV[10], stg2[20], COSPI_INV[54], stg2[27], INV_COS_BIT),
    half_btf(COSPI_INV[50], stg2[19], COSPI_INV[14], stg2[28], INV_COS_BIT),
    half_btf(COSPI_INV[18], stg2[18], COSPI_INV[46], stg2[29], INV_COS_BIT),
    half_btf(COSPI_INV[34], stg2[17], COSPI_INV[30], stg2[30], INV_COS_BIT),
    half_btf(COSPI_INV[2], stg2[16], COSPI_INV[62], stg2[31], INV_COS_BIT),
    clamp_value(stg2[32] + stg2[33], range),
    clamp_value(stg2[32] - stg2[33], range),
    clamp_value(-stg2[34] + stg2[35], range),
    clamp_value(stg2[34] + stg2[35], range),
    clamp_value(stg2[36] + stg2[37], range),
    clamp_value(stg2[36] - stg2[37], range),
    clamp_value(-stg2[38] + stg2[39], range),
    clamp_value(stg2[38] + stg2[39], range),
    clamp_value(stg2[40] + stg2[41], range),
    clamp_value(stg2[40] - stg2[41], range),
    clamp_value(-stg2[42] + stg2[43], range),
    clamp_value(stg2[42] + stg2[43], range),
    clamp_value(stg2[44] + stg2[45], range),
    clamp_value(stg2[44] - stg2[45], range),
    clamp_value(-stg2[46] + stg2[47], range),
    clamp_value(stg2[46] + stg2[47], range),
    clamp_value(stg2[48] + stg2[49], range),
    clamp_value(stg2[48] - stg2[49], range),
    clamp_value(-stg2[50] + stg2[51], range),
    clamp_value(stg2[50] + stg2[51], range),
    clamp_value(stg2[52] + stg2[53], range),
    clamp_value(stg2[52] - stg2[53], range),
    clamp_value(-stg2[54] + stg2[55], range),
    clamp_value(stg2[54] + stg2[55], range),
    clamp_value(stg2[56] + stg2[57], range),
    clamp_value(stg2[56] - stg2[57], range),
    clamp_value(-stg2[58] + stg2[59], range),
    clamp_value(stg2[58] + stg2[59], range),
    clamp_value(stg2[60] + stg2[61], range),
    clamp_value(stg2[60] - stg2[61], range),
    clamp_value(-stg2[62] + stg2[63], range),
    clamp_value(stg2[62] + stg2[63], range),
  ];

  // stage 4: 0..=7 pass through; 8..=15 combine the mirrored pairs
  // (8 + k, 15 - k) with half_btf; 16..=31 take clamped sums/differences of
  // adjacent pairs; in 32..=63 the two middle entries of every group of
  // four are combined with their mirror (position p with 95 - p) while the
  // outer entries pass through.
  let stg4 = [
    stg3[0],
    stg3[1],
    stg3[2],
    stg3[3],
    stg3[4],
    stg3[5],
    stg3[6],
    stg3[7],
    half_btf(COSPI_INV[60], stg3[8], -COSPI_INV[4], stg3[15], INV_COS_BIT),
    half_btf(COSPI_INV[28], stg3[9], -COSPI_INV[36], stg3[14], INV_COS_BIT),
    half_btf(COSPI_INV[44], stg3[10], -COSPI_INV[20], stg3[13], INV_COS_BIT),
    half_btf(COSPI_INV[12], stg3[11], -COSPI_INV[52], stg3[12], INV_COS_BIT),
    half_btf(COSPI_INV[52], stg3[11], COSPI_INV[12], stg3[12], INV_COS_BIT),
    half_btf(COSPI_INV[20], stg3[10], COSPI_INV[44], stg3[13], INV_COS_BIT),
    half_btf(COSPI_INV[36], stg3[9], COSPI_INV[28], stg3[14], INV_COS_BIT),
    half_btf(COSPI_INV[4], stg3[8], COSPI_INV[60], stg3[15], INV_COS_BIT),
    clamp_value(stg3[16] + stg3[17], range),
    clamp_value(stg3[16] - stg3[17], range),
    clamp_value(-stg3[18] + stg3[19], range),
    clamp_value(stg3[18] + stg3[19], range),
    clamp_value(stg3[20] + stg3[21], range),
    clamp_value(stg3[20] - stg3[21], range),
    clamp_value(-stg3[22] + stg3[23], range),
    clamp_value(stg3[22] + stg3[23], range),
    clamp_value(stg3[24] + stg3[25], range),
    clamp_value(stg3[24] - stg3[25], range),
    clamp_value(-stg3[26] + stg3[27], range),
    clamp_value(stg3[26] + stg3[27], range),
    clamp_value(stg3[28] + stg3[29], range),
    clamp_value(stg3[28] - stg3[29], range),
    clamp_value(-stg3[30] + stg3[31], range),
    clamp_value(stg3[30] + stg3[31], range),
    stg3[32],
    half_btf(-COSPI_INV[4], stg3[33], COSPI_INV[60], stg3[62], INV_COS_BIT),
    half_btf(-COSPI_INV[60], stg3[34], -COSPI_INV[4], stg3[61], INV_COS_BIT),
    stg3[35],
    stg3[36],
    half_btf(-COSPI_INV[36], stg3[37], COSPI_INV[28], stg3[58], INV_COS_BIT),
    half_btf(-COSPI_INV[28], stg3[38], -COSPI_INV[36], stg3[57], INV_COS_BIT),
    stg3[39],
    stg3[40],
    half_btf(-COSPI_INV[20], stg3[41], COSPI_INV[44], stg3[54], INV_COS_BIT),
    half_btf(-COSPI_INV[44], stg3[42], -COSPI_INV[20], stg3[53], INV_COS_BIT),
    stg3[43],
    stg3[44],
    half_btf(-COSPI_INV[52], stg3[45], COSPI_INV[12], stg3[50], INV_COS_BIT),
    half_btf(-COSPI_INV[12], stg3[46], -COSPI_INV[52], stg3[49], INV_COS_BIT),
    stg3[47],
    stg3[48],
    half_btf(-COSPI_INV[52], stg3[46], COSPI_INV[12], stg3[49], INV_COS_BIT),
    half_btf(COSPI_INV[12], stg3[45], COSPI_INV[52], stg3[50], INV_COS_BIT),
    stg3[51],
    stg3[52],
    half_btf(-COSPI_INV[20], stg3[42], COSPI_INV[44], stg3[53], INV_COS_BIT),
    half_btf(COSPI_INV[44], stg3[41], COSPI_INV[20], stg3[54], INV_COS_BIT),
    stg3[55],
    stg3[56],
    half_btf(-COSPI_INV[36], stg3[38], COSPI_INV[28], stg3[57], INV_COS_BIT),
    half_btf(COSPI_INV[28], stg3[37], COSPI_INV[36], stg3[58], INV_COS_BIT),
    stg3[59],
    stg3[60],
    half_btf(-COSPI_INV[4], stg3[34], COSPI_INV[60], stg3[61], INV_COS_BIT),
    half_btf(COSPI_INV[60], stg3[33], COSPI_INV[4], stg3[62], INV_COS_BIT),
    stg3[63],
  ];

  // stage 5: 0..=3 pass through; 4..=7 combine the mirrored pairs (4, 7)
  // and (5, 6) with half_btf; 8..=15 take clamped sums/differences of
  // adjacent pairs; in 16..=31 the middle entries of every group of four
  // are combined with their mirror (p with 47 - p); 32..=63 take clamped
  // sums/differences inside each group of four.
  let stg5 = [
    stg4[0],
    stg4[1],
    stg4[2],
    stg4[3],
    half_btf(COSPI_INV[56], stg4[4], -COSPI_INV[8], stg4[7], INV_COS_BIT),
    half_btf(COSPI_INV[24], stg4[5], -COSPI_INV[40], stg4[6], INV_COS_BIT),
    half_btf(COSPI_INV[40], stg4[5], COSPI_INV[24], stg4[6], INV_COS_BIT),
    half_btf(COSPI_INV[8], stg4[4], COSPI_INV[56], stg4[7], INV_COS_BIT),
    clamp_value(stg4[8] + stg4[9], range),
    clamp_value(stg4[8] - stg4[9], range),
    clamp_value(-stg4[10] + stg4[11], range),
    clamp_value(stg4[10] + stg4[11], range),
    clamp_value(stg4[12] + stg4[13], range),
    clamp_value(stg4[12] - stg4[13], range),
    clamp_value(-stg4[14] + stg4[15], range),
    clamp_value(stg4[14] + stg4[15], range),
    stg4[16],
    half_btf(-COSPI_INV[8], stg4[17], COSPI_INV[56], stg4[30], INV_COS_BIT),
    half_btf(-COSPI_INV[56], stg4[18], -COSPI_INV[8], stg4[29], INV_COS_BIT),
    stg4[19],
    stg4[20],
    half_btf(-COSPI_INV[40], stg4[21], COSPI_INV[24], stg4[26], INV_COS_BIT),
    half_btf(-COSPI_INV[24], stg4[22], -COSPI_INV[40], stg4[25], INV_COS_BIT),
    stg4[23],
    stg4[24],
    half_btf(-COSPI_INV[40], stg4[22], COSPI_INV[24], stg4[25], INV_COS_BIT),
    half_btf(COSPI_INV[24], stg4[21], COSPI_INV[40], stg4[26], INV_COS_BIT),
    stg4[27],
    stg4[28],
    half_btf(-COSPI_INV[8], stg4[18], COSPI_INV[56], stg4[29], INV_COS_BIT),
    half_btf(COSPI_INV[56], stg4[17], COSPI_INV[8], stg4[30], INV_COS_BIT),
    stg4[31],
    clamp_value(stg4[32] + stg4[35], range),
    clamp_value(stg4[33] + stg4[34], range),
    clamp_value(stg4[33] - stg4[34], range),
    clamp_value(stg4[32] - stg4[35], range),
    clamp_value(-stg4[36] + stg4[39], range),
    clamp_value(-stg4[37] + stg4[38], range),
    clamp_value(stg4[37] + stg4[38], range),
    clamp_value(stg4[36] + stg4[39], range),
    clamp_value(stg4[40] + stg4[43], range),
    clamp_value(stg4[41] + stg4[42], range),
    clamp_value(stg4[41] - stg4[42], range),
    clamp_value(stg4[40] - stg4[43], range),
    clamp_value(-stg4[44] + stg4[47], range),
    clamp_value(-stg4[45] + stg4[46], range),
    clamp_value(stg4[45] + stg4[46], range),
    clamp_value(stg4[44] + stg4[47], range),
    clamp_value(stg4[48] + stg4[51], range),
    clamp_value(stg4[49] + stg4[50], range),
    clamp_value(stg4[49] - stg4[50], range),
    clamp_value(stg4[48] - stg4[51], range),
    clamp_value(-stg4[52] + stg4[55], range),
    clamp_value(-stg4[53] + stg4[54], range),
    clamp_value(stg4[53] + stg4[54], range),
    clamp_value(stg4[52] + stg4[55], range),
    clamp_value(stg4[56] + stg4[59], range),
    clamp_value(stg4[57] + stg4[58], range),
    clamp_value(stg4[57] - stg4[58], range),
    clamp_value(stg4[56] - stg4[59], range),
    clamp_value(-stg4[60] + stg4[63], range),
    clamp_value(-stg4[61] + stg4[62], range),
    clamp_value(stg4[61] + stg4[62], range),
    clamp_value(stg4[60] + stg4[63], range),
  ];

  // stage 6: 0..=3 combine (0, 1) and (2, 3) with half_btf; 4..=7 take
  // clamped sums/differences of adjacent pairs; 8..=15 combine 9/14 and
  // 10/13 with half_btf; 16..=31 take clamped sums/differences inside each
  // group of four; 32..=63 combine 34..=37 with 58..=61 and 42..=45 with
  // 50..=53 via half_btf, everything else passes through.
  let stg6 = [
    half_btf(COSPI_INV[32], stg5[0], COSPI_INV[32], stg5[1], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg5[0], -COSPI_INV[32], stg5[1], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg5[2], -COSPI_INV[16], stg5[3], INV_COS_BIT),
    half_btf(COSPI_INV[16], stg5[2], COSPI_INV[48], stg5[3], INV_COS_BIT),
    clamp_value(stg5[4] + stg5[5], range),
    clamp_value(stg5[4] - stg5[5], range),
    clamp_value(-stg5[6] + stg5[7], range),
    clamp_value(stg5[6] + stg5[7], range),
    stg5[8],
    half_btf(-COSPI_INV[16], stg5[9], COSPI_INV[48], stg5[14], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg5[10], -COSPI_INV[16], stg5[13], INV_COS_BIT),
    stg5[11],
    stg5[12],
    half_btf(-COSPI_INV[16], stg5[10], COSPI_INV[48], stg5[13], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg5[9], COSPI_INV[16], stg5[14], INV_COS_BIT),
    stg5[15],
    clamp_value(stg5[16] + stg5[19], range),
    clamp_value(stg5[17] + stg5[18], range),
    clamp_value(stg5[17] - stg5[18], range),
    clamp_value(stg5[16] - stg5[19], range),
    clamp_value(-stg5[20] + stg5[23], range),
    clamp_value(-stg5[21] + stg5[22], range),
    clamp_value(stg5[21] + stg5[22], range),
    clamp_value(stg5[20] + stg5[23], range),
    clamp_value(stg5[24] + stg5[27], range),
    clamp_value(stg5[25] + stg5[26], range),
    clamp_value(stg5[25] - stg5[26], range),
    clamp_value(stg5[24] - stg5[27], range),
    clamp_value(-stg5[28] + stg5[31], range),
    clamp_value(-stg5[29] + stg5[30], range),
    clamp_value(stg5[29] + stg5[30], range),
    clamp_value(stg5[28] + stg5[31], range),
    stg5[32],
    stg5[33],
    half_btf(-COSPI_INV[8], stg5[34], COSPI_INV[56], stg5[61], INV_COS_BIT),
    half_btf(-COSPI_INV[8], stg5[35], COSPI_INV[56], stg5[60], INV_COS_BIT),
    half_btf(-COSPI_INV[56], stg5[36], -COSPI_INV[8], stg5[59], INV_COS_BIT),
    half_btf(-COSPI_INV[56], stg5[37], -COSPI_INV[8], stg5[58], INV_COS_BIT),
    stg5[38],
    stg5[39],
    stg5[40],
    stg5[41],
    half_btf(-COSPI_INV[40], stg5[42], COSPI_INV[24], stg5[53], INV_COS_BIT),
    half_btf(-COSPI_INV[40], stg5[43], COSPI_INV[24], stg5[52], INV_COS_BIT),
    half_btf(-COSPI_INV[24], stg5[44], -COSPI_INV[40], stg5[51], INV_COS_BIT),
    half_btf(-COSPI_INV[24], stg5[45], -COSPI_INV[40], stg5[50], INV_COS_BIT),
    stg5[46],
    stg5[47],
    stg5[48],
    stg5[49],
    half_btf(-COSPI_INV[40], stg5[45], COSPI_INV[24], stg5[50], INV_COS_BIT),
    half_btf(-COSPI_INV[40], stg5[44], COSPI_INV[24], stg5[51], INV_COS_BIT),
    half_btf(COSPI_INV[24], stg5[43], COSPI_INV[40], stg5[52], INV_COS_BIT),
    half_btf(COSPI_INV[24], stg5[42], COSPI_INV[40], stg5[53], INV_COS_BIT),
    stg5[54],
    stg5[55],
    stg5[56],
    stg5[57],
    half_btf(-COSPI_INV[8], stg5[37], COSPI_INV[56], stg5[58], INV_COS_BIT),
    half_btf(-COSPI_INV[8], stg5[36], COSPI_INV[56], stg5[59], INV_COS_BIT),
    half_btf(COSPI_INV[56], stg5[35], COSPI_INV[8], stg5[60], INV_COS_BIT),
    half_btf(COSPI_INV[56], stg5[34], COSPI_INV[8], stg5[61], INV_COS_BIT),
    stg5[62],
    stg5[63],
  ];

  // stage 7: 0..=3 take clamped sums/differences inside the group; 5 and 6
  // are combined with half_btf (4 and 7 pass through); 8..=15 take clamped
  // sums/differences inside each group of four; 16..=31 combine 18..=21
  // with 26..=29 via half_btf; 32..=63 take clamped sums/differences inside
  // each group of eight.
  let stg7 = [
    clamp_value(stg6[0] + stg6[3], range),
    clamp_value(stg6[1] + stg6[2], range),
    clamp_value(stg6[1] - stg6[2], range),
    clamp_value(stg6[0] - stg6[3], range),
    stg6[4],
    half_btf(-COSPI_INV[32], stg6[5], COSPI_INV[32], stg6[6], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg6[5], COSPI_INV[32], stg6[6], INV_COS_BIT),
    stg6[7],
    clamp_value(stg6[8] + stg6[11], range),
    clamp_value(stg6[9] + stg6[10], range),
    clamp_value(stg6[9] - stg6[10], range),
    clamp_value(stg6[8] - stg6[11], range),
    clamp_value(-stg6[12] + stg6[15], range),
    clamp_value(-stg6[13] + stg6[14], range),
    clamp_value(stg6[13] + stg6[14], range),
    clamp_value(stg6[12] + stg6[15], range),
    stg6[16],
    stg6[17],
    half_btf(-COSPI_INV[16], stg6[18], COSPI_INV[48], stg6[29], INV_COS_BIT),
    half_btf(-COSPI_INV[16], stg6[19], COSPI_INV[48], stg6[28], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg6[20], -COSPI_INV[16], stg6[27], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg6[21], -COSPI_INV[16], stg6[26], INV_COS_BIT),
    stg6[22],
    stg6[23],
    stg6[24],
    stg6[25],
    half_btf(-COSPI_INV[16], stg6[21], COSPI_INV[48], stg6[26], INV_COS_BIT),
    half_btf(-COSPI_INV[16], stg6[20], COSPI_INV[48], stg6[27], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg6[19], COSPI_INV[16], stg6[28], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg6[18], COSPI_INV[16], stg6[29], INV_COS_BIT),
    stg6[30],
    stg6[31],
    clamp_value(stg6[32] + stg6[39], range),
    clamp_value(stg6[33] + stg6[38], range),
    clamp_value(stg6[34] + stg6[37], range),
    clamp_value(stg6[35] + stg6[36], range),
    clamp_value(stg6[35] - stg6[36], range),
    clamp_value(stg6[34] - stg6[37], range),
    clamp_value(stg6[33] - stg6[38], range),
    clamp_value(stg6[32] - stg6[39], range),
    clamp_value(-stg6[40] + stg6[47], range),
    clamp_value(-stg6[41] + stg6[46], range),
    clamp_value(-stg6[42] + stg6[45], range),
    clamp_value(-stg6[43] + stg6[44], range),
    clamp_value(stg6[43] + stg6[44], range),
    clamp_value(stg6[42] + stg6[45], range),
    clamp_value(stg6[41] + stg6[46], range),
    clamp_value(stg6[40] + stg6[47], range),
    clamp_value(stg6[48] + stg6[55], range),
    clamp_value(stg6[49] + stg6[54], range),
    clamp_value(stg6[50] + stg6[53], range),
    clamp_value(stg6[51] + stg6[52], range),
    clamp_value(stg6[51] - stg6[52], range),
    clamp_value(stg6[50] - stg6[53], range),
    clamp_value(stg6[49] - stg6[54], range),
    clamp_value(stg6[48] - stg6[55], range),
    clamp_value(-stg6[56] + stg6[63], range),
    clamp_value(-stg6[57] + stg6[62], range),
    clamp_value(-stg6[58] + stg6[61], range),
    clamp_value(-stg6[59] + stg6[60], range),
    clamp_value(stg6[59] + stg6[60], range),
    clamp_value(stg6[58] + stg6[61], range),
    clamp_value(stg6[57] + stg6[62], range),
    clamp_value(stg6[56] + stg6[63], range),
  ];

  // stage 8: 0..=7 take clamped sums/differences of the mirrored pairs
  // (k, 7 - k); 10..=13 are combined with half_btf (8, 9, 14, 15 pass
  // through); 16..=31 take clamped sums/differences inside each group of
  // eight; 32..=63 combine 36..=43 with 52..=59 via half_btf.
  let stg8 = [
    clamp_value(stg7[0] + stg7[7], range),
    clamp_value(stg7[1] + stg7[6], range),
    clamp_value(stg7[2] + stg7[5], range),
    clamp_value(stg7[3] + stg7[4], range),
    clamp_value(stg7[3] - stg7[4], range),
    clamp_value(stg7[2] - stg7[5], range),
    clamp_value(stg7[1] - stg7[6], range),
    clamp_value(stg7[0] - stg7[7], range),
    stg7[8],
    stg7[9],
    half_btf(-COSPI_INV[32], stg7[10], COSPI_INV[32], stg7[13], INV_COS_BIT),
    half_btf(-COSPI_INV[32], stg7[11], COSPI_INV[32], stg7[12], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg7[11], COSPI_INV[32], stg7[12], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg7[10], COSPI_INV[32], stg7[13], INV_COS_BIT),
    stg7[14],
    stg7[15],
    clamp_value(stg7[16] + stg7[23], range),
    clamp_value(stg7[17] + stg7[22], range),
    clamp_value(stg7[18] + stg7[21], range),
    clamp_value(stg7[19] + stg7[20], range),
    clamp_value(stg7[19] - stg7[20], range),
    clamp_value(stg7[18] - stg7[21], range),
    clamp_value(stg7[17] - stg7[22], range),
    clamp_value(stg7[16] - stg7[23], range),
    clamp_value(-stg7[24] + stg7[31], range),
    clamp_value(-stg7[25] + stg7[30], range),
    clamp_value(-stg7[26] + stg7[29], range),
    clamp_value(-stg7[27] + stg7[28], range),
    clamp_value(stg7[27] + stg7[28], range),
    clamp_value(stg7[26] + stg7[29], range),
    clamp_value(stg7[25] + stg7[30], range),
    clamp_value(stg7[24] + stg7[31], range),
    stg7[32],
    stg7[33],
    stg7[34],
    stg7[35],
    half_btf(-COSPI_INV[16], stg7[36], COSPI_INV[48], stg7[59], INV_COS_BIT),
    half_btf(-COSPI_INV[16], stg7[37], COSPI_INV[48], stg7[58], INV_COS_BIT),
    half_btf(-COSPI_INV[16], stg7[38], COSPI_INV[48], stg7[57], INV_COS_BIT),
    half_btf(-COSPI_INV[16], stg7[39], COSPI_INV[48], stg7[56], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg7[40], -COSPI_INV[16], stg7[55], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg7[41], -COSPI_INV[16], stg7[54], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg7[42], -COSPI_INV[16], stg7[53], INV_COS_BIT),
    half_btf(-COSPI_INV[48], stg7[43], -COSPI_INV[16], stg7[52], INV_COS_BIT),
    stg7[44],
    stg7[45],
    stg7[46],
    stg7[47],
    stg7[48],
    stg7[49],
    stg7[50],
    stg7[51],
    half_btf(-COSPI_INV[16], stg7[43], COSPI_INV[48], stg7[52], INV_COS_BIT),
    half_btf(-COSPI_INV[16], stg7[42], COSPI_INV[48], stg7[53], INV_COS_BIT),
    half_btf(-COSPI_INV[16], stg7[41], COSPI_INV[48], stg7[54], INV_COS_BIT),
    half_btf(-COSPI_INV[16], stg7[40], COSPI_INV[48], stg7[55], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg7[39], COSPI_INV[16], stg7[56], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg7[38], COSPI_INV[16], stg7[57], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg7[37], COSPI_INV[16], stg7[58], INV_COS_BIT),
    half_btf(COSPI_INV[48], stg7[36], COSPI_INV[16], stg7[59], INV_COS_BIT),
    stg7[60],
    stg7[61],
    stg7[62],
    stg7[63],
  ];

  // stage 9: 0..=15 take clamped sums/differences of the mirrored pairs
  // (k, 15 - k); 20..=27 are combined with half_btf (16..=19 and 28..=31
  // pass through); 32..=63 take clamped sums/differences inside each group
  // of sixteen.
  let stg9 = [
    clamp_value(stg8[0] + stg8[15], range),
    clamp_value(stg8[1] + stg8[14], range),
    clamp_value(stg8[2] + stg8[13], range),
    clamp_value(stg8[3] + stg8[12], range),
    clamp_value(stg8[4] + stg8[11], range),
    clamp_value(stg8[5] + stg8[10], range),
    clamp_value(stg8[6] + stg8[9], range),
    clamp_value(stg8[7] + stg8[8], range),
    clamp_value(stg8[7] - stg8[8], range),
    clamp_value(stg8[6] - stg8[9], range),
    clamp_value(stg8[5] - stg8[10], range),
    clamp_value(stg8[4] - stg8[11], range),
    clamp_value(stg8[3] - stg8[12], range),
    clamp_value(stg8[2] - stg8[13], range),
    clamp_value(stg8[1] - stg8[14], range),
    clamp_value(stg8[0] - stg8[15], range),
    stg8[16],
    stg8[17],
    stg8[18],
    stg8[19],
    half_btf(-COSPI_INV[32], stg8[20], COSPI_INV[32], stg8[27], INV_COS_BIT),
    half_btf(-COSPI_INV[32], stg8[21], COSPI_INV[32], stg8[26], INV_COS_BIT),
    half_btf(-COSPI_INV[32], stg8[22], COSPI_INV[32], stg8[25], INV_COS_BIT),
    half_btf(-COSPI_INV[32], stg8[23], COSPI_INV[32], stg8[24], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg8[23], COSPI_INV[32], stg8[24], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg8[22], COSPI_INV[32], stg8[25], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg8[21], COSPI_INV[32], stg8[26], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg8[20], COSPI_INV[32], stg8[27], INV_COS_BIT),
    stg8[28],
    stg8[29],
    stg8[30],
    stg8[31],
    clamp_value(stg8[32] + stg8[47], range),
    clamp_value(stg8[33] + stg8[46], range),
    clamp_value(stg8[34] + stg8[45], range),
    clamp_value(stg8[35] + stg8[44], range),
    clamp_value(stg8[36] + stg8[43], range),
    clamp_value(stg8[37] + stg8[42], range),
    clamp_value(stg8[38] + stg8[41], range),
    clamp_value(stg8[39] + stg8[40], range),
    clamp_value(stg8[39] - stg8[40], range),
    clamp_value(stg8[38] - stg8[41], range),
    clamp_value(stg8[37] - stg8[42], range),
    clamp_value(stg8[36] - stg8[43], range),
    clamp_value(stg8[35] - stg8[44], range),
    clamp_value(stg8[34] - stg8[45], range),
    clamp_value(stg8[33] - stg8[46], range),
    clamp_value(stg8[32] - stg8[47], range),
    clamp_value(-stg8[48] + stg8[63], range),
    clamp_value(-stg8[49] + stg8[62], range),
    clamp_value(-stg8[50] + stg8[61], range),
    clamp_value(-stg8[51] + stg8[60], range),
    clamp_value(-stg8[52] + stg8[59], range),
    clamp_value(-stg8[53] + stg8[58], range),
    clamp_value(-stg8[54] + stg8[57], range),
    clamp_value(-stg8[55] + stg8[56], range),
    clamp_value(stg8[55] + stg8[56], range),
    clamp_value(stg8[54] + stg8[57], range),
    clamp_value(stg8[53] + stg8[58], range),
    clamp_value(stg8[52] + stg8[59], range),
    clamp_value(stg8[51] + stg8[60], range),
    clamp_value(stg8[50] + stg8[61], range),
    clamp_value(stg8[49] + stg8[62], range),
    clamp_value(stg8[48] + stg8[63], range),
  ];

  // stage 10: 0..=31 take clamped sums/differences of the mirrored pairs
  // (k, 31 - k); 40..=55 are combined with half_btf (32..=39 and 56..=63
  // pass through).
  let stg10 = [
    clamp_value(stg9[0] + stg9[31], range),
    clamp_value(stg9[1] + stg9[30], range),
    clamp_value(stg9[2] + stg9[29], range),
    clamp_value(stg9[3] + stg9[28], range),
    clamp_value(stg9[4] + stg9[27], range),
    clamp_value(stg9[5] + stg9[26], range),
    clamp_value(stg9[6] + stg9[25], range),
    clamp_value(stg9[7] + stg9[24], range),
    clamp_value(stg9[8] + stg9[23], range),
    clamp_value(stg9[9] + stg9[22], range),
    clamp_value(stg9[10] + stg9[21], range),
    clamp_value(stg9[11] + stg9[20], range),
    clamp_value(stg9[12] + stg9[19], range),
    clamp_value(stg9[13] + stg9[18], range),
    clamp_value(stg9[14] + stg9[17], range),
    clamp_value(stg9[15] + stg9[16], range),
    clamp_value(stg9[15] - stg9[16], range),
    clamp_value(stg9[14] - stg9[17], range),
    clamp_value(stg9[13] - stg9[18], range),
    clamp_value(stg9[12] - stg9[19], range),
    clamp_value(stg9[11] - stg9[20], range),
    clamp_value(stg9[10] - stg9[21], range),
    clamp_value(stg9[9] - stg9[22], range),
    clamp_value(stg9[8] - stg9[23], range),
    clamp_value(stg9[7] - stg9[24], range),
    clamp_value(stg9[6] - stg9[25], range),
    clamp_value(stg9[5] - stg9[26], range),
    clamp_value(stg9[4] - stg9[27], range),
    clamp_value(stg9[3] - stg9[28], range),
    clamp_value(stg9[2] - stg9[29], range),
    clamp_value(stg9[1] - stg9[30], range),
    clamp_value(stg9[0] - stg9[31], range),
    stg9[32],
    stg9[33],
    stg9[34],
    stg9[35],
    stg9[36],
    stg9[37],
    stg9[38],
    stg9[39],
    half_btf(-COSPI_INV[32], stg9[40], COSPI_INV[32], stg9[55], INV_COS_BIT),
    half_btf(-COSPI_INV[32], stg9[41], COSPI_INV[32], stg9[54], INV_COS_BIT),
    half_btf(-COSPI_INV[32], stg9[42], COSPI_INV[32], stg9[53], INV_COS_BIT),
    half_btf(-COSPI_INV[32], stg9[43], COSPI_INV[32], stg9[52], INV_COS_BIT),
    half_btf(-COSPI_INV[32], stg9[44], COSPI_INV[32], stg9[51], INV_COS_BIT),
    half_btf(-COSPI_INV[32], stg9[45], COSPI_INV[32], stg9[50], INV_COS_BIT),
    half_btf(-COSPI_INV[32], stg9[46], COSPI_INV[32], stg9[49], INV_COS_BIT),
    half_btf(-COSPI_INV[32], stg9[47], COSPI_INV[32], stg9[48], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg9[47], COSPI_INV[32], stg9[48], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg9[46], COSPI_INV[32], stg9[49], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg9[45], COSPI_INV[32], stg9[50], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg9[44], COSPI_INV[32], stg9[51], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg9[43], COSPI_INV[32], stg9[52], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg9[42], COSPI_INV[32], stg9[53], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg9[41], COSPI_INV[32], stg9[54], INV_COS_BIT),
    half_btf(COSPI_INV[32], stg9[40], COSPI_INV[32], stg9[55], INV_COS_BIT),
    stg9[56],
    stg9[57],
    stg9[58],
    stg9[59],
    stg9[60],
    stg9[61],
    stg9[62],
    stg9[63],
  ];

  // stage 11: final butterfly written straight into `output`. For k in
  // 0..=31, output[k] is the clamped sum and output[63 - k] the clamped
  // difference of stg10[k] and stg10[63 - k].
  output[0] = clamp_value(stg10[0] + stg10[63], range);
  output[1] = clamp_value(stg10[1] + stg10[62], range);
  output[2] = clamp_value(stg10[2] + stg10[61], range);
  output[3] = clamp_value(stg10[3] + stg10[60], range);
  output[4] = clamp_value(stg10[4] + stg10[59], range);
  output[5] = clamp_value(stg10[5] + stg10[58], range);
  output[6] = clamp_value(stg10[6] + stg10[57], range);
  output[7] = clamp_value(stg10[7] + stg10[56], range);
  output[8] = clamp_value(stg10[8] + stg10[55], range);
  output[9] = clamp_value(stg10[9] + stg10[54], range);
  output[10] = clamp_value(stg10[10] + stg10[53], range);
  output[11] = clamp_value(stg10[11] + stg10[52], range);
  output[12] = clamp_value(stg10[12] + stg10[51], range);
  output[13] = clamp_value(stg10[13] + stg10[50], range);
  output[14] = clamp_value(stg10[14] + stg10[49], range);
  output[15] = clamp_value(stg10[15] + stg10[48], range);
  output[16] = clamp_value(stg10[16] + stg10[47], range);
  output[17] = clamp_value(stg10[17] + stg10[46], range);
  output[18] = clamp_value(stg10[18] + stg10[45], range);
  output[19] = clamp_value(stg10[19] + stg10[44], range);
  output[20] = clamp_value(stg10[20] + stg10[43], range);
  output[21] = clamp_value(stg10[21] + stg10[42], range);
  output[22] = clamp_value(stg10[22] + stg10[41], range);
  output[23] = clamp_value(stg10[23] + stg10[40], range);
  output[24] = clamp_value(stg10[24] + stg10[39], range);
  output[25] = clamp_value(stg10[25] + stg10[38], range);
  output[26] = clamp_value(stg10[26] + stg10[37], range);
  output[27] = clamp_value(stg10[27] + stg10[36], range);
  output[28] = clamp_value(stg10[28] + stg10[35], range);
  output[29] = clamp_value(stg10[29] + stg10[34], range);
  output[30] = clamp_value(stg10[30] + stg10[33], range);
  output[31] = clamp_value(stg10[31] + stg10[32], range);
  output[32] = clamp_value(stg10[31] - stg10[32], range);
  output[33] = clamp_value(stg10[30] - stg10[33], range);
  output[34] = clamp_value(stg10[29] - stg10[34], range);
  output[35] = clamp_value(stg10[28] - stg10[35], range);
  output[36] = clamp_value(stg10[27] - stg10[36], range);
  output[37] = clamp_value(stg10[26] - stg10[37], range);
  output[38] = clamp_value(stg10[25] - stg10[38], range);
  output[39] = clamp_value(stg10[24] - stg10[39], range);
  output[40] = clamp_value(stg10[23] - stg10[40], range);
  output[41] = clamp_value(stg10[22] - stg10[41], range);
  output[42] = clamp_value(stg10[21] - stg10[42], range);
  output[43] = clamp_value(stg10[20] - stg10[43], range);
  output[44] = clamp_value(stg10[19] - stg10[44], range);
  output[45] = clamp_value(stg10[18] - stg10[45], range);
  output[46] = clamp_value(stg10[17] - stg10[46], range);
  output[47] = clamp_value(stg10[16] - stg10[47], range);
  output[48] = clamp_value(stg10[15] - stg10[48], range);
  output[49] = clamp_value(stg10[14] - stg10[49], range);
  output[50] = clamp_value(stg10[13] - stg10[50], range);
  output[51] = clamp_value(stg10[12] - stg10[51], range);
  output[52] = clamp_value(stg10[11] - stg10[52], range);
  output[53] = clamp_value(stg10[10] - stg10[53], range);
  output[54] = clamp_value(stg10[9] - stg10[54], range);
  output[55] = clamp_value(stg10[8] - stg10[55], range);
  output[56] = clamp_value(stg10[7] - stg10[56], range);
  output[57] = clamp_value(stg10[6] - stg10[57], range);
  output[58] = clamp_value(stg10[5] - stg10[58], range);
  output[59] = clamp_value(stg10[4] - stg10[59], range);
  output[60] = clamp_value(stg10[3] - stg10[60], range);
  output[61] = clamp_value(stg10[2] - stg10[61], range);
  output[62] = clamp_value(stg10[1] - stg10[62], range);
  output[63] = clamp_value(stg10[0] - stg10[63], range);
}
1591
/// Signature shared by the 1D inverse transform functions stored in
/// `INV_TXFM_FNS`.
///
/// Implementations read coefficients from `input` and write the transformed
/// values to `output`. `range` is forwarded to `clamp_value` by the
/// implementations that clamp their results; some (e.g. `av1_iwht4`) ignore
/// it. NOTE(review): `range` looks like a bit width (callers pass values
/// derived from the bit depth) -- confirm against `clamp_value`.
type InvTxfmFn = fn(input: &[i32], output: &mut [i32], range: usize);
1593
1594static INV_TXFM_FNS: [[InvTxfmFn; 5]; 5] = [
1595 [av1_idct4, av1_idct8, av1_idct16, av1_idct32, av1_idct64],
1596 [
1597 av1_iadst4,
1598 av1_iadst8,
1599 av1_iadst16,
1600 |_, _, _| unimplemented!(),
1601 |_, _, _| unimplemented!(),
1602 ],
1603 [
1604 av1_iflipadst4,
1605 av1_iflipadst8,
1606 av1_iflipadst16,
1607 |_, _, _| unimplemented!(),
1608 |_, _, _| unimplemented!(),
1609 ],
1610 [
1611 av1_iidentity4,
1612 av1_iidentity8,
1613 av1_iidentity16,
1614 av1_iidentity32,
1615 |_, _, _| unimplemented!(),
1616 ],
1617 [
1618 av1_iwht4,
1619 |_, _, _| unimplemented!(),
1620 |_, _, _| unimplemented!(),
1621 |_, _, _| unimplemented!(),
1622 |_, _, _| unimplemented!(),
1623 ],
1624];
1625
1626pub(crate) mod rust {
1627 use super::*;
1628 use crate::cpu_features::CpuFeatureLevel;
1629 use crate::util::clamp;
1630
1631 use simd_helpers::cold_for_target_arch;
1632 use std::cmp;
1633
1634 #[cold_for_target_arch("x86_64", "aarch64")]
1635 pub fn inverse_transform_add<T: Pixel>(
1636 input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: u16,
1637 tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel,
1638 ) {
1639 let width: usize = tx_size.width();
1640 let height: usize = tx_size.height();
1641
1642 // Only use at most 32 columns and 32 rows of input coefficients.
1643 let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];
1644
1645 // For 64 point transforms, rely on the last 32 columns being initialized
1646 // to zero for filling out missing input coeffs.
1647 let mut buffer = vec![0i32; width * height].into_boxed_slice();
1648 let rect_type = get_rect_tx_log_ratio(width, height);
1649 let tx_types_1d = get_1d_tx_types(tx_type);
1650 let lossless = tx_type == TxType::WHT_WHT;
1651
1652 // perform inv txfm on every row
1653 let range = bd + 8;
1654 let txfm_fn = INV_TXFM_FNS[tx_types_1d.1 as usize][ILog::ilog(width) - 3];
1655 // 64 point transforms only signal 32 coeffs. We only take chunks of 32
1656 // and skip over the last 32 transforms here.
1657 for (r, buffer_slice) in (0..height.min(32)).zip(buffer.chunks_mut(width))
1658 {
1659 // For 64 point transforms, rely on the last 32 elements being
1660 // initialized to zero for filling out the missing coeffs.
1661 let mut temp_in: [i32; 64] = [0; 64];
1662 for (raw, clamped) in input[r..]
1663 .iter()
1664 .map(|a| i32::cast_from(*a))
1665 .step_by(height.min(32))
1666 .zip(temp_in.iter_mut())
1667 {
1668 let val = if rect_type.abs() == 1 {
1669 round_shift(raw * INV_SQRT2, SQRT2_BITS)
1670 } else if lossless {
1671 raw >> 2
1672 } else {
1673 raw
1674 };
1675 *clamped = clamp_value(val, range);
1676 }
1677 txfm_fn(&temp_in, buffer_slice, range);
1678 }
1679
1680 // perform inv txfm on every col
1681 let range = cmp::max(bd + 6, 16);
1682 let txfm_fn = INV_TXFM_FNS[tx_types_1d.0 as usize][ILog::ilog(height) - 3];
1683 for c in 0..width {
1684 let mut temp_in: [i32; 64] = [0; 64];
1685 let mut temp_out: [i32; 64] = [0; 64];
1686 for (raw, clamped) in
1687 buffer[c..].iter().step_by(width).zip(temp_in.iter_mut())
1688 {
1689 *clamped = clamp_value(
1690 round_shift(*raw, INV_INTERMEDIATE_SHIFTS[tx_size as usize]),
1691 range,
1692 );
1693 }
1694 txfm_fn(&temp_in, &mut temp_out, range);
1695 for (temp, out) in temp_out
1696 .iter()
1697 .zip(output.rows_iter_mut().map(|row| &mut row[c]).take(height))
1698 {
1699 let v: i32 = (*out).as_();
1700 let r = if lossless { *temp } else { round_shift(*temp, 4) };
1701 let v = clamp(v + r, 0, (1 << bd) - 1);
1702 *out = T::cast_from(v);
1703 }
1704 }
1705 }
1706
1707 /* From AV1 Spec.
1708 https://aomediacodec.github.io/av1-spec/#2d-inverse-transform-process
1709 */
1710 const INV_INTERMEDIATE_SHIFTS: [usize; TxSize::TX_SIZES_ALL] =
1711 [0, 1, 2, 2, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2];
1712}
1713