1 | // SPDX-License-Identifier: LGPL-2.1+ |
2 | /* |
3 | * Copyright 2016 Tom aan de Wiel |
4 | * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved. |
5 | * |
6 | * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper: |
7 | * |
8 | * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms, |
9 | * R.D. Brown, 1977 |
10 | */ |
11 | |
12 | #include <linux/string.h> |
13 | #include <linux/kernel.h> |
14 | #include <linux/videodev2.h> |
15 | #include "codec-fwht.h" |
16 | |
17 | #define OVERFLOW_BIT BIT(14) |
18 | |
19 | /* |
20 | * Note: bit 0 of the header must always be 0. Otherwise it cannot |
21 | * be guaranteed that the magic 8 byte sequence (see below) can |
22 | * never occur in the rlc output. |
23 | */ |
24 | #define PFRAME_BIT BIT(15) |
25 | #define DUPS_MASK 0x1ffe |
26 | |
27 | #define PBLOCK 0 |
28 | #define IBLOCK 1 |
29 | |
30 | #define ALL_ZEROS 15 |
31 | |
/*
 * Zigzag scan order: maps scan position 0..63 to the raster-order
 * index (x + 8 * y) inside an 8x8 coefficient block. Each row of the
 * initializer below is one anti-diagonal of the block.
 */
static const uint8_t zigzag[64] = {
	0,
	1,  8,
	2,  9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
49 | |
50 | /* |
51 | * noinline_for_stack to work around |
52 | * https://llvm.org/pr38809 |
53 | */ |
/*
 * Run-length encode one 8x8 coefficient block (zigzag order) into
 * 16-bit big-endian words: a header word first, then one word per
 * (zero-run, coefficient) pair. Returns the number of 16-bit words
 * written to @output.
 */
static int noinline_for_stack
rlc(const s16 *in, __be16 *output, int blocktype)
{
	s16 block[8 * 8];
	s16 *wp = block;
	int i = 0;
	int x, y;
	int ret = 0;

	/* read in block from framebuffer */
	int lastzero_run = 0;
	int to_encode;

	for (y = 0; y < 8; y++) {
		for (x = 0; x < 8; x++) {
			*wp = in[x + y * 8];
			wp++;
		}
	}

	/* keep track of amount of trailing zeros */
	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
		lastzero_run++;

	/* header word: only PFRAME_BIT is set here (for P-blocks) */
	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
	ret++;

	/*
	 * A trailing zero run longer than 14 is cut off and signalled
	 * with a single ALL_ZEROS marker word at the end instead.
	 */
	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);

	i = 0;
	while (i < to_encode) {
		int cnt = 0;
		int tmp;

		/* count leading zeros */
		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
			cnt++;
			i++;
			if (i == to_encode) {
				cnt--;
				break;
			}
		}
		/* 4 bits for run, 12 for coefficient (quantization by 4) */
		*output++ = htons((cnt | tmp << 4));
		i++;
		ret++;
	}
	/* emit the "rest of block is zero" marker if the run was cut off */
	if (lastzero_run > 14) {
		*output = htons(ALL_ZEROS | 0);
		ret++;
	}

	return ret;
}
109 | |
110 | /* |
111 | * This function will worst-case increase rlc_in by 65*2 bytes: |
112 | * one s16 value for the header and 8 * 8 coefficients of type s16. |
113 | */ |
static noinline_for_stack u16
derlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input)
{
	/* header */
	const __be16 *input = *rlc_in;
	u16 stat;
	int dec_count = 0;
	s16 block[8 * 8 + 16];
	s16 *wp = block;
	int i;

	/* bail out before reading a header that lies past the buffer */
	if (input > end_of_input)
		return OVERFLOW_BIT;
	stat = ntohs(*input++);

	/*
	 * Now de-compress, it expands one byte to up to 15 bytes
	 * (or fills the remainder of the 64 bytes with zeroes if it
	 * is the last byte to expand).
	 *
	 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
	 * allow for overflow if the incoming data was malformed.
	 */
	while (dec_count < 8 * 8) {
		s16 in;
		int length;
		int coeff;

		if (input > end_of_input)
			return OVERFLOW_BIT;
		in = ntohs(*input++);
		length = in & 0xf;	/* zero-run length, 4 bits */
		coeff = in >> 4;	/* signed coefficient, 12 bits */

		/* fill remainder with zeros */
		if (length == 15) {
			for (i = 0; i < 64 - dec_count; i++)
				*wp++ = 0;
			break;
		}

		for (i = 0; i < length; i++)
			*wp++ = 0;
		*wp++ = coeff;
		dec_count += length + 1;
	}

	wp = block;

	/* undo the zigzag scan: store coefficients in raster order */
	for (i = 0; i < 64; i++) {
		int pos = zigzag[i];
		int y = pos / 8;
		int x = pos % 8;

		dwht_out[x + y * 8] = *wp++;
	}
	*rlc_in = input;
	return stat;
}
173 | |
/*
 * Per-coefficient quantization shifts for intra (I) blocks: higher
 * frequencies (bottom-right) are quantized more coarsely.
 */
static const int quant_table[] = {
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  3,
	2, 2, 2, 2, 2, 2,  3,  6,
	2, 2, 2, 2, 2, 3,  6,  6,
	2, 2, 2, 2, 3, 6,  6,  6,
	2, 2, 2, 3, 6, 6,  6,  6,
	2, 2, 3, 6, 6, 6,  6,  8,
};
184 | |
/*
 * Per-coefficient quantization shifts for inter (P) blocks; coarser
 * than quant_table since P-block deltas have twice the value range.
 */
static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  6,
	3, 3, 3, 3, 3, 3,  6,  6,
	3, 3, 3, 3, 3, 6,  6,  9,
	3, 3, 3, 3, 6, 6,  9,  9,
	3, 3, 3, 6, 6, 9,  9, 10,
};
195 | |
196 | static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp) |
197 | { |
198 | const int *quant = quant_table; |
199 | int i, j; |
200 | |
201 | for (j = 0; j < 8; j++) { |
202 | for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) { |
203 | *coeff >>= *quant; |
204 | if (*coeff >= -qp && *coeff <= qp) |
205 | *coeff = *de_coeff = 0; |
206 | else |
207 | *de_coeff = *coeff << *quant; |
208 | } |
209 | } |
210 | } |
211 | |
212 | static void dequantize_intra(s16 *coeff) |
213 | { |
214 | const int *quant = quant_table; |
215 | int i, j; |
216 | |
217 | for (j = 0; j < 8; j++) |
218 | for (i = 0; i < 8; i++, quant++, coeff++) |
219 | *coeff <<= *quant; |
220 | } |
221 | |
222 | static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp) |
223 | { |
224 | const int *quant = quant_table_p; |
225 | int i, j; |
226 | |
227 | for (j = 0; j < 8; j++) { |
228 | for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) { |
229 | *coeff >>= *quant; |
230 | if (*coeff >= -qp && *coeff <= qp) |
231 | *coeff = *de_coeff = 0; |
232 | else |
233 | *de_coeff = *coeff << *quant; |
234 | } |
235 | } |
236 | } |
237 | |
238 | static void dequantize_inter(s16 *coeff) |
239 | { |
240 | const int *quant = quant_table_p; |
241 | int i, j; |
242 | |
243 | for (j = 0; j < 8; j++) |
244 | for (i = 0; i < 8; i++, quant++, coeff++) |
245 | *coeff <<= *quant; |
246 | } |
247 | |
/*
 * Forward 8x8 Walsh-Hadamard transform of an unsigned 8-bit pixel
 * block. @stride is the distance between rows, @input_step the
 * distance between pixels in a row (1-4, for packed formats). For
 * intra blocks 128 is subtracted from each pixel (as 256 per pair)
 * to center the values around zero. Horizontal pass first, writing
 * rows of output_block, then a vertical in-place pass.
 */
static void noinline_for_stack fwht(const u8 *block, s16 *output_block,
				    unsigned int stride,
				    unsigned int input_step, bool intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const u8 *tmp = block;
	s16 *out = output_block;
	int add = intra ? 256 : 0;
	unsigned int i;

	/* stage 1 */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		/* the switch just unrolls the pixel step within the row */
		switch (input_step) {
		case 1:
			workspace1[0]  = tmp[0] + tmp[1] - add;
			workspace1[1]  = tmp[0] - tmp[1];

			workspace1[2]  = tmp[2] + tmp[3] - add;
			workspace1[3]  = tmp[2] - tmp[3];

			workspace1[4]  = tmp[4] + tmp[5] - add;
			workspace1[5]  = tmp[4] - tmp[5];

			workspace1[6]  = tmp[6] + tmp[7] - add;
			workspace1[7]  = tmp[6] - tmp[7];
			break;
		case 2:
			workspace1[0]  = tmp[0] + tmp[2] - add;
			workspace1[1]  = tmp[0] - tmp[2];

			workspace1[2]  = tmp[4] + tmp[6] - add;
			workspace1[3]  = tmp[4] - tmp[6];

			workspace1[4]  = tmp[8] + tmp[10] - add;
			workspace1[5]  = tmp[8] - tmp[10];

			workspace1[6]  = tmp[12] + tmp[14] - add;
			workspace1[7]  = tmp[12] - tmp[14];
			break;
		case 3:
			workspace1[0]  = tmp[0] + tmp[3] - add;
			workspace1[1]  = tmp[0] - tmp[3];

			workspace1[2]  = tmp[6] + tmp[9] - add;
			workspace1[3]  = tmp[6] - tmp[9];

			workspace1[4]  = tmp[12] + tmp[15] - add;
			workspace1[5]  = tmp[12] - tmp[15];

			workspace1[6]  = tmp[18] + tmp[21] - add;
			workspace1[7]  = tmp[18] - tmp[21];
			break;
		default:
			workspace1[0]  = tmp[0] + tmp[4] - add;
			workspace1[1]  = tmp[0] - tmp[4];

			workspace1[2]  = tmp[8] + tmp[12] - add;
			workspace1[3]  = tmp[8] - tmp[12];

			workspace1[4]  = tmp[16] + tmp[20] - add;
			workspace1[5]  = tmp[16] - tmp[20];

			workspace1[6]  = tmp[24] + tmp[28] - add;
			workspace1[7]  = tmp[24] - tmp[28];
			break;
		}

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	/* vertical pass: same butterfly applied down each column */
	out = output_block;

	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1 * 8];
		workspace1[1]  = out[0] - out[1 * 8];

		workspace1[2]  = out[2 * 8] + out[3 * 8];
		workspace1[3]  = out[2 * 8] - out[3 * 8];

		workspace1[4]  = out[4 * 8] + out[5 * 8];
		workspace1[5]  = out[4 * 8] - out[5 * 8];

		workspace1[6]  = out[6 * 8] + out[7 * 8];
		workspace1[7]  = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];
		/* stage 3 */
		out[0 * 8] = workspace2[0] + workspace2[4];
		out[1 * 8] = workspace2[0] - workspace2[4];
		out[2 * 8] = workspace2[1] - workspace2[5];
		out[3 * 8] = workspace2[1] + workspace2[5];
		out[4 * 8] = workspace2[2] + workspace2[6];
		out[5 * 8] = workspace2[2] - workspace2[6];
		out[6 * 8] = workspace2[3] - workspace2[7];
		out[7 * 8] = workspace2[3] + workspace2[7];
	}
}
375 | |
376 | /* |
377 | * Not the nicest way of doing it, but P-blocks get twice the range of |
378 | * that of the I-blocks. Therefore we need a type bigger than 8 bits. |
379 | * Furthermore values can be negative... This is just a version that |
380 | * works with 16 signed data |
381 | */ |
/*
 * Same transform as fwht() but for s16 input (P-block deltas), which
 * are already centered around zero so no 'add' correction is needed.
 * @stride is in s16 units; @intra is unused here but kept for a
 * signature parallel to fwht().
 */
static void noinline_for_stack
fwht16(const s16 *block, s16 *output_block, int stride, int intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	/* horizontal pass */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		/* stage 1 */
		workspace1[0]  = tmp[0] + tmp[1];
		workspace1[1]  = tmp[0] - tmp[1];

		workspace1[2]  = tmp[2] + tmp[3];
		workspace1[3]  = tmp[2] - tmp[3];

		workspace1[4]  = tmp[4] + tmp[5];
		workspace1[5]  = tmp[4] - tmp[5];

		workspace1[6]  = tmp[6] + tmp[7];
		workspace1[7]  = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	/* vertical pass, in place on the output block */
	out = output_block;

	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0] = out[0] + out[1*8];
		workspace1[1] = out[0] - out[1*8];

		workspace1[2] = out[2*8] + out[3*8];
		workspace1[3] = out[2*8] - out[3*8];

		workspace1[4] = out[4*8] + out[5*8];
		workspace1[5] = out[4*8] - out[5*8];

		workspace1[6] = out[6*8] + out[7*8];
		workspace1[7] = out[6*8] - out[7*8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0*8] = workspace2[0] + workspace2[4];
		out[1*8] = workspace2[0] - workspace2[4];
		out[2*8] = workspace2[1] - workspace2[5];
		out[3*8] = workspace2[1] + workspace2[5];
		out[4*8] = workspace2[2] + workspace2[6];
		out[5*8] = workspace2[2] - workspace2[6];
		out[6*8] = workspace2[3] - workspace2[7];
		out[7*8] = workspace2[3] + workspace2[7];
	}
}
465 | |
466 | static noinline_for_stack void |
467 | ifwht(const s16 *block, s16 *output_block, int intra) |
468 | { |
469 | /* |
470 | * we'll need more than 8 bits for the transformed coefficients |
471 | * use native unit of cpu |
472 | */ |
473 | int workspace1[8], workspace2[8]; |
474 | int inter = intra ? 0 : 1; |
475 | const s16 *tmp = block; |
476 | s16 *out = output_block; |
477 | int i; |
478 | |
479 | for (i = 0; i < 8; i++, tmp += 8, out += 8) { |
480 | /* stage 1 */ |
481 | workspace1[0] = tmp[0] + tmp[1]; |
482 | workspace1[1] = tmp[0] - tmp[1]; |
483 | |
484 | workspace1[2] = tmp[2] + tmp[3]; |
485 | workspace1[3] = tmp[2] - tmp[3]; |
486 | |
487 | workspace1[4] = tmp[4] + tmp[5]; |
488 | workspace1[5] = tmp[4] - tmp[5]; |
489 | |
490 | workspace1[6] = tmp[6] + tmp[7]; |
491 | workspace1[7] = tmp[6] - tmp[7]; |
492 | |
493 | /* stage 2 */ |
494 | workspace2[0] = workspace1[0] + workspace1[2]; |
495 | workspace2[1] = workspace1[0] - workspace1[2]; |
496 | workspace2[2] = workspace1[1] - workspace1[3]; |
497 | workspace2[3] = workspace1[1] + workspace1[3]; |
498 | |
499 | workspace2[4] = workspace1[4] + workspace1[6]; |
500 | workspace2[5] = workspace1[4] - workspace1[6]; |
501 | workspace2[6] = workspace1[5] - workspace1[7]; |
502 | workspace2[7] = workspace1[5] + workspace1[7]; |
503 | |
504 | /* stage 3 */ |
505 | out[0] = workspace2[0] + workspace2[4]; |
506 | out[1] = workspace2[0] - workspace2[4]; |
507 | out[2] = workspace2[1] - workspace2[5]; |
508 | out[3] = workspace2[1] + workspace2[5]; |
509 | out[4] = workspace2[2] + workspace2[6]; |
510 | out[5] = workspace2[2] - workspace2[6]; |
511 | out[6] = workspace2[3] - workspace2[7]; |
512 | out[7] = workspace2[3] + workspace2[7]; |
513 | } |
514 | |
515 | out = output_block; |
516 | |
517 | for (i = 0; i < 8; i++, out++) { |
518 | /* stage 1 */ |
519 | workspace1[0] = out[0] + out[1 * 8]; |
520 | workspace1[1] = out[0] - out[1 * 8]; |
521 | |
522 | workspace1[2] = out[2 * 8] + out[3 * 8]; |
523 | workspace1[3] = out[2 * 8] - out[3 * 8]; |
524 | |
525 | workspace1[4] = out[4 * 8] + out[5 * 8]; |
526 | workspace1[5] = out[4 * 8] - out[5 * 8]; |
527 | |
528 | workspace1[6] = out[6 * 8] + out[7 * 8]; |
529 | workspace1[7] = out[6 * 8] - out[7 * 8]; |
530 | |
531 | /* stage 2 */ |
532 | workspace2[0] = workspace1[0] + workspace1[2]; |
533 | workspace2[1] = workspace1[0] - workspace1[2]; |
534 | workspace2[2] = workspace1[1] - workspace1[3]; |
535 | workspace2[3] = workspace1[1] + workspace1[3]; |
536 | |
537 | workspace2[4] = workspace1[4] + workspace1[6]; |
538 | workspace2[5] = workspace1[4] - workspace1[6]; |
539 | workspace2[6] = workspace1[5] - workspace1[7]; |
540 | workspace2[7] = workspace1[5] + workspace1[7]; |
541 | |
542 | /* stage 3 */ |
543 | if (inter) { |
544 | int d; |
545 | |
546 | out[0 * 8] = workspace2[0] + workspace2[4]; |
547 | out[1 * 8] = workspace2[0] - workspace2[4]; |
548 | out[2 * 8] = workspace2[1] - workspace2[5]; |
549 | out[3 * 8] = workspace2[1] + workspace2[5]; |
550 | out[4 * 8] = workspace2[2] + workspace2[6]; |
551 | out[5 * 8] = workspace2[2] - workspace2[6]; |
552 | out[6 * 8] = workspace2[3] - workspace2[7]; |
553 | out[7 * 8] = workspace2[3] + workspace2[7]; |
554 | |
555 | for (d = 0; d < 8; d++) |
556 | out[8 * d] >>= 6; |
557 | } else { |
558 | int d; |
559 | |
560 | out[0 * 8] = workspace2[0] + workspace2[4]; |
561 | out[1 * 8] = workspace2[0] - workspace2[4]; |
562 | out[2 * 8] = workspace2[1] - workspace2[5]; |
563 | out[3 * 8] = workspace2[1] + workspace2[5]; |
564 | out[4 * 8] = workspace2[2] + workspace2[6]; |
565 | out[5 * 8] = workspace2[2] - workspace2[6]; |
566 | out[6 * 8] = workspace2[3] - workspace2[7]; |
567 | out[7 * 8] = workspace2[3] + workspace2[7]; |
568 | |
569 | for (d = 0; d < 8; d++) { |
570 | out[8 * d] >>= 6; |
571 | out[8 * d] += 128; |
572 | } |
573 | } |
574 | } |
575 | } |
576 | |
577 | static void fill_encoder_block(const u8 *input, s16 *dst, |
578 | unsigned int stride, unsigned int input_step) |
579 | { |
580 | int i, j; |
581 | |
582 | for (i = 0; i < 8; i++) { |
583 | for (j = 0; j < 8; j++, input += input_step) |
584 | *dst++ = *input; |
585 | input += stride - 8 * input_step; |
586 | } |
587 | } |
588 | |
589 | static int var_intra(const s16 *input) |
590 | { |
591 | int32_t mean = 0; |
592 | int32_t ret = 0; |
593 | const s16 *tmp = input; |
594 | int i; |
595 | |
596 | for (i = 0; i < 8 * 8; i++, tmp++) |
597 | mean += *tmp; |
598 | mean /= 64; |
599 | tmp = input; |
600 | for (i = 0; i < 8 * 8; i++, tmp++) |
601 | ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean); |
602 | return ret; |
603 | } |
604 | |
605 | static int var_inter(const s16 *old, const s16 *new) |
606 | { |
607 | int32_t ret = 0; |
608 | int i; |
609 | |
610 | for (i = 0; i < 8 * 8; i++, old++, new++) |
611 | ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new); |
612 | return ret; |
613 | } |
614 | |
615 | static noinline_for_stack int |
616 | decide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock, |
617 | unsigned int stride, unsigned int input_step) |
618 | { |
619 | s16 tmp[64]; |
620 | s16 old[64]; |
621 | s16 *work = tmp; |
622 | unsigned int k, l; |
623 | int vari; |
624 | int vard; |
625 | |
626 | fill_encoder_block(input: cur, dst: tmp, stride, input_step); |
627 | fill_encoder_block(input: reference, dst: old, stride: 8, input_step: 1); |
628 | vari = var_intra(input: tmp); |
629 | |
630 | for (k = 0; k < 8; k++) { |
631 | for (l = 0; l < 8; l++) { |
632 | *deltablock = *work - *reference; |
633 | deltablock++; |
634 | work++; |
635 | reference++; |
636 | } |
637 | } |
638 | deltablock -= 64; |
639 | vard = var_inter(old, new: tmp); |
640 | return vari <= vard ? IBLOCK : PBLOCK; |
641 | } |
642 | |
643 | static void fill_decoder_block(u8 *dst, const s16 *input, int stride, |
644 | unsigned int dst_step) |
645 | { |
646 | int i, j; |
647 | |
648 | for (i = 0; i < 8; i++) { |
649 | for (j = 0; j < 8; j++, input++, dst += dst_step) { |
650 | if (*input < 0) |
651 | *dst = 0; |
652 | else if (*input > 255) |
653 | *dst = 255; |
654 | else |
655 | *dst = *input; |
656 | } |
657 | dst += stride - (8 * dst_step); |
658 | } |
659 | } |
660 | |
661 | static void add_deltas(s16 *deltas, const u8 *ref, int stride, |
662 | unsigned int ref_step) |
663 | { |
664 | int k, l; |
665 | |
666 | for (k = 0; k < 8; k++) { |
667 | for (l = 0; l < 8; l++) { |
668 | *deltas += *ref; |
669 | ref += ref_step; |
670 | /* |
671 | * Due to quantizing, it might possible that the |
672 | * decoded coefficients are slightly out of range |
673 | */ |
674 | if (*deltas < 0) |
675 | *deltas = 0; |
676 | else if (*deltas > 255) |
677 | *deltas = 255; |
678 | deltas++; |
679 | } |
680 | ref += stride - (8 * ref_step); |
681 | } |
682 | } |
683 | |
684 | static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max, |
685 | struct fwht_cframe *cf, u32 height, u32 width, |
686 | u32 stride, unsigned int input_step, |
687 | bool is_intra, bool next_is_intra) |
688 | { |
689 | u8 *input_start = input; |
690 | __be16 *rlco_start = *rlco; |
691 | s16 deltablock[64]; |
692 | __be16 pframe_bit = htons(PFRAME_BIT); |
693 | u32 encoding = 0; |
694 | unsigned int last_size = 0; |
695 | unsigned int i, j; |
696 | |
697 | width = round_up(width, 8); |
698 | height = round_up(height, 8); |
699 | |
700 | for (j = 0; j < height / 8; j++) { |
701 | input = input_start + j * 8 * stride; |
702 | for (i = 0; i < width / 8; i++) { |
703 | /* intra code, first frame is always intra coded. */ |
704 | int blocktype = IBLOCK; |
705 | unsigned int size; |
706 | |
707 | if (!is_intra) |
708 | blocktype = decide_blocktype(cur: input, reference: refp, |
709 | deltablock, stride, input_step); |
710 | if (blocktype == IBLOCK) { |
711 | fwht(block: input, output_block: cf->coeffs, stride, input_step, intra: 1); |
712 | quantize_intra(coeff: cf->coeffs, de_coeff: cf->de_coeffs, |
713 | qp: cf->i_frame_qp); |
714 | } else { |
715 | /* inter code */ |
716 | encoding |= FWHT_FRAME_PCODED; |
717 | fwht16(block: deltablock, output_block: cf->coeffs, stride: 8, intra: 0); |
718 | quantize_inter(coeff: cf->coeffs, de_coeff: cf->de_coeffs, |
719 | qp: cf->p_frame_qp); |
720 | } |
721 | if (!next_is_intra) { |
722 | ifwht(block: cf->de_coeffs, output_block: cf->de_fwht, intra: blocktype); |
723 | |
724 | if (blocktype == PBLOCK) |
725 | add_deltas(deltas: cf->de_fwht, ref: refp, stride: 8, ref_step: 1); |
726 | fill_decoder_block(dst: refp, input: cf->de_fwht, stride: 8, dst_step: 1); |
727 | } |
728 | |
729 | input += 8 * input_step; |
730 | refp += 8 * 8; |
731 | |
732 | size = rlc(in: cf->coeffs, output: *rlco, blocktype); |
733 | if (last_size == size && |
734 | !memcmp(p: *rlco + 1, q: *rlco - size + 1, size: 2 * size - 2)) { |
735 | __be16 *last_rlco = *rlco - size; |
736 | s16 hdr = ntohs(*last_rlco); |
737 | |
738 | if (!((*last_rlco ^ **rlco) & pframe_bit) && |
739 | (hdr & DUPS_MASK) < DUPS_MASK) |
740 | *last_rlco = htons(hdr + 2); |
741 | else |
742 | *rlco += size; |
743 | } else { |
744 | *rlco += size; |
745 | } |
746 | if (*rlco >= rlco_max) { |
747 | encoding |= FWHT_FRAME_UNENCODED; |
748 | goto exit_loop; |
749 | } |
750 | last_size = size; |
751 | } |
752 | } |
753 | |
754 | exit_loop: |
755 | if (encoding & FWHT_FRAME_UNENCODED) { |
756 | u8 *out = (u8 *)rlco_start; |
757 | u8 *p; |
758 | |
759 | input = input_start; |
760 | /* |
761 | * The compressed stream should never contain the magic |
762 | * header, so when we copy the YUV data we replace 0xff |
763 | * by 0xfe. Since YUV is limited range such values |
764 | * shouldn't appear anyway. |
765 | */ |
766 | for (j = 0; j < height; j++) { |
767 | for (i = 0, p = input; i < width; i++, p += input_step) |
768 | *out++ = (*p == 0xff) ? 0xfe : *p; |
769 | input += stride; |
770 | } |
771 | *rlco = (__be16 *)out; |
772 | encoding &= ~FWHT_FRAME_PCODED; |
773 | } |
774 | return encoding; |
775 | } |
776 | |
777 | u32 fwht_encode_frame(struct fwht_raw_frame *frm, |
778 | struct fwht_raw_frame *ref_frm, |
779 | struct fwht_cframe *cf, |
780 | bool is_intra, bool next_is_intra, |
781 | unsigned int width, unsigned int height, |
782 | unsigned int stride, unsigned int chroma_stride) |
783 | { |
784 | unsigned int size = height * width; |
785 | __be16 *rlco = cf->rlc_data; |
786 | __be16 *rlco_max; |
787 | u32 encoding; |
788 | |
789 | rlco_max = rlco + size / 2 - 256; |
790 | encoding = encode_plane(input: frm->luma, refp: ref_frm->luma, rlco: &rlco, rlco_max, cf, |
791 | height, width, stride, |
792 | input_step: frm->luma_alpha_step, is_intra, next_is_intra); |
793 | if (encoding & FWHT_FRAME_UNENCODED) |
794 | encoding |= FWHT_LUMA_UNENCODED; |
795 | encoding &= ~FWHT_FRAME_UNENCODED; |
796 | |
797 | if (frm->components_num >= 3) { |
798 | u32 chroma_h = height / frm->height_div; |
799 | u32 chroma_w = width / frm->width_div; |
800 | unsigned int chroma_size = chroma_h * chroma_w; |
801 | |
802 | rlco_max = rlco + chroma_size / 2 - 256; |
803 | encoding |= encode_plane(input: frm->cb, refp: ref_frm->cb, rlco: &rlco, rlco_max, |
804 | cf, height: chroma_h, width: chroma_w, |
805 | stride: chroma_stride, input_step: frm->chroma_step, |
806 | is_intra, next_is_intra); |
807 | if (encoding & FWHT_FRAME_UNENCODED) |
808 | encoding |= FWHT_CB_UNENCODED; |
809 | encoding &= ~FWHT_FRAME_UNENCODED; |
810 | rlco_max = rlco + chroma_size / 2 - 256; |
811 | encoding |= encode_plane(input: frm->cr, refp: ref_frm->cr, rlco: &rlco, rlco_max, |
812 | cf, height: chroma_h, width: chroma_w, |
813 | stride: chroma_stride, input_step: frm->chroma_step, |
814 | is_intra, next_is_intra); |
815 | if (encoding & FWHT_FRAME_UNENCODED) |
816 | encoding |= FWHT_CR_UNENCODED; |
817 | encoding &= ~FWHT_FRAME_UNENCODED; |
818 | } |
819 | |
820 | if (frm->components_num == 4) { |
821 | rlco_max = rlco + size / 2 - 256; |
822 | encoding |= encode_plane(input: frm->alpha, refp: ref_frm->alpha, rlco: &rlco, |
823 | rlco_max, cf, height, width, |
824 | stride, input_step: frm->luma_alpha_step, |
825 | is_intra, next_is_intra); |
826 | if (encoding & FWHT_FRAME_UNENCODED) |
827 | encoding |= FWHT_ALPHA_UNENCODED; |
828 | encoding &= ~FWHT_FRAME_UNENCODED; |
829 | } |
830 | |
831 | cf->size = (rlco - cf->rlc_data) * sizeof(*rlco); |
832 | return encoding; |
833 | } |
834 | |
835 | static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco, |
836 | u32 height, u32 width, const u8 *ref, u32 ref_stride, |
837 | unsigned int ref_step, u8 *dst, |
838 | unsigned int dst_stride, unsigned int dst_step, |
839 | bool uncompressed, const __be16 *end_of_rlco_buf) |
840 | { |
841 | unsigned int copies = 0; |
842 | s16 copy[8 * 8]; |
843 | u16 stat; |
844 | unsigned int i, j; |
845 | bool is_intra = !ref; |
846 | |
847 | width = round_up(width, 8); |
848 | height = round_up(height, 8); |
849 | |
850 | if (uncompressed) { |
851 | int i; |
852 | |
853 | if (end_of_rlco_buf + 1 < *rlco + width * height / 2) |
854 | return false; |
855 | for (i = 0; i < height; i++) { |
856 | memcpy(dst, *rlco, width); |
857 | dst += dst_stride; |
858 | *rlco += width / 2; |
859 | } |
860 | return true; |
861 | } |
862 | |
863 | /* |
864 | * When decoding each macroblock the rlco pointer will be increased |
865 | * by 65 * 2 bytes worst-case. |
866 | * To avoid overflow the buffer has to be 65/64th of the actual raw |
867 | * image size, just in case someone feeds it malicious data. |
868 | */ |
869 | for (j = 0; j < height / 8; j++) { |
870 | for (i = 0; i < width / 8; i++) { |
871 | const u8 *refp = ref + j * 8 * ref_stride + |
872 | i * 8 * ref_step; |
873 | u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step; |
874 | |
875 | if (copies) { |
876 | memcpy(cf->de_fwht, copy, sizeof(copy)); |
877 | if ((stat & PFRAME_BIT) && !is_intra) |
878 | add_deltas(deltas: cf->de_fwht, ref: refp, |
879 | stride: ref_stride, ref_step); |
880 | fill_decoder_block(dst: dstp, input: cf->de_fwht, |
881 | stride: dst_stride, dst_step); |
882 | copies--; |
883 | continue; |
884 | } |
885 | |
886 | stat = derlc(rlc_in: rlco, dwht_out: cf->coeffs, end_of_input: end_of_rlco_buf); |
887 | if (stat & OVERFLOW_BIT) |
888 | return false; |
889 | if ((stat & PFRAME_BIT) && !is_intra) |
890 | dequantize_inter(coeff: cf->coeffs); |
891 | else |
892 | dequantize_intra(coeff: cf->coeffs); |
893 | |
894 | ifwht(block: cf->coeffs, output_block: cf->de_fwht, |
895 | intra: ((stat & PFRAME_BIT) && !is_intra) ? 0 : 1); |
896 | |
897 | copies = (stat & DUPS_MASK) >> 1; |
898 | if (copies) |
899 | memcpy(copy, cf->de_fwht, sizeof(copy)); |
900 | if ((stat & PFRAME_BIT) && !is_intra) |
901 | add_deltas(deltas: cf->de_fwht, ref: refp, |
902 | stride: ref_stride, ref_step); |
903 | fill_decoder_block(dst: dstp, input: cf->de_fwht, stride: dst_stride, |
904 | dst_step); |
905 | } |
906 | } |
907 | return true; |
908 | } |
909 | |
910 | bool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags, |
911 | unsigned int components_num, unsigned int width, |
912 | unsigned int height, const struct fwht_raw_frame *ref, |
913 | unsigned int ref_stride, unsigned int ref_chroma_stride, |
914 | struct fwht_raw_frame *dst, unsigned int dst_stride, |
915 | unsigned int dst_chroma_stride) |
916 | { |
917 | const __be16 *rlco = cf->rlc_data; |
918 | const __be16 *end_of_rlco_buf = cf->rlc_data + |
919 | (cf->size / sizeof(*rlco)) - 1; |
920 | |
921 | if (!decode_plane(cf, rlco: &rlco, height, width, ref: ref->luma, ref_stride, |
922 | ref_step: ref->luma_alpha_step, dst: dst->luma, dst_stride, |
923 | dst_step: dst->luma_alpha_step, |
924 | uncompressed: hdr_flags & V4L2_FWHT_FL_LUMA_IS_UNCOMPRESSED, |
925 | end_of_rlco_buf)) |
926 | return false; |
927 | |
928 | if (components_num >= 3) { |
929 | u32 h = height; |
930 | u32 w = width; |
931 | |
932 | if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT)) |
933 | h /= 2; |
934 | if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH)) |
935 | w /= 2; |
936 | |
937 | if (!decode_plane(cf, rlco: &rlco, height: h, width: w, ref: ref->cb, ref_stride: ref_chroma_stride, |
938 | ref_step: ref->chroma_step, dst: dst->cb, dst_stride: dst_chroma_stride, |
939 | dst_step: dst->chroma_step, |
940 | uncompressed: hdr_flags & V4L2_FWHT_FL_CB_IS_UNCOMPRESSED, |
941 | end_of_rlco_buf)) |
942 | return false; |
943 | if (!decode_plane(cf, rlco: &rlco, height: h, width: w, ref: ref->cr, ref_stride: ref_chroma_stride, |
944 | ref_step: ref->chroma_step, dst: dst->cr, dst_stride: dst_chroma_stride, |
945 | dst_step: dst->chroma_step, |
946 | uncompressed: hdr_flags & V4L2_FWHT_FL_CR_IS_UNCOMPRESSED, |
947 | end_of_rlco_buf)) |
948 | return false; |
949 | } |
950 | |
951 | if (components_num == 4) |
952 | if (!decode_plane(cf, rlco: &rlco, height, width, ref: ref->alpha, ref_stride, |
953 | ref_step: ref->luma_alpha_step, dst: dst->alpha, dst_stride, |
954 | dst_step: dst->luma_alpha_step, |
955 | uncompressed: hdr_flags & V4L2_FWHT_FL_ALPHA_IS_UNCOMPRESSED, |
956 | end_of_rlco_buf)) |
957 | return false; |
958 | return true; |
959 | } |
960 | |