// Copyright 2022 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical functions for Sharp YUV.
//
// Author: Skal (pascal.massimino@gmail.com)

#include "sharpyuv/sharpyuv_dsp.h"

#if defined(WEBP_USE_SSE2)
#include <stdlib.h>
#include <emmintrin.h>

static uint16_t clip_SSE2(int v, int max) {
  return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v;
}

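// Adds the per-sample delta (ref - src) to 'dst', clamping each result to
// [0, 2^bit_depth - 1], and returns the sum of absolute deltas. The SIMD loop
// handles eight samples at a time: multiplying each delta by its sign (+/-1)
// via _mm_madd_epi16 accumulates |delta| pairwise into 32-bit lanes. A scalar
// loop handles the remaining samples.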
static uint64_t SharpYuvUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
                                     uint16_t* dst, int len, int bit_depth) {
  const int max_y = (1 << bit_depth) - 1;
  uint64_t diff = 0;
  uint32_t tmp[4];
  int i;
  const __m128i zero = _mm_setzero_si128();
  const __m128i max = _mm_set1_epi16(max_y);
  const __m128i one = _mm_set1_epi16(1);
  __m128i sum = zero;

  for (i = 0; i + 8 <= len; i += 8) {
    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
    const __m128i D = _mm_sub_epi16(A, B);       // diff_y
    const __m128i E = _mm_cmpgt_epi16(zero, D);  // sign (-1 or 0)
    const __m128i F = _mm_add_epi16(C, D);       // new_y
    const __m128i G = _mm_or_si128(E, one);      // -1 or 1
    const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
    const __m128i I = _mm_madd_epi16(D, G);      // sum(abs(...))
    _mm_storeu_si128((__m128i*)(dst + i), H);
    sum = _mm_add_epi32(sum, I);
  }
  _mm_storeu_si128((__m128i*)tmp, sum);
  diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)dst[i] + diff_y;
    dst[i] = clip_SSE2(new_y, max_y);
    diff += (uint64_t)abs(diff_y);
  }
  return diff;
}

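// Adds the per-sample delta (ref - src) to 'dst' in place, without clamping.
// Eight int16 samples are processed per iteration, plus a scalar tail loop.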
static void SharpYuvUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
                                   int16_t* dst, int len) {
  int i = 0;
  for (i = 0; i + 8 <= len; i += 8) {
    const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
    const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
    const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
    const __m128i D = _mm_sub_epi16(A, B);   // diff_uv
    const __m128i E = _mm_add_epi16(C, D);   // new_uv
    _mm_storeu_si128((__m128i*)(dst + i), E);
  }
  for (; i < len; ++i) {
    const int diff_uv = ref[i] - src[i];
    dst[i] += diff_uv;
  }
}

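// Filters rows A (current) and B (next) with the weighting
// (9*A0 + 3*A1 + 3*B0 + B1 + 8) >> 4, adds the result to 'best_y' and clamps
// to [0, 2^bit_depth - 1]. Intermediate sums are kept in 16 bits, which fits
// for the bit depths this path is dispatched for (see SharpYuvFilterRow_SSE2).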
static void SharpYuvFilterRow16_SSE2(const int16_t* A, const int16_t* B,
                                     int len, const uint16_t* best_y,
                                     uint16_t* out, int bit_depth) {
  const int max_y = (1 << bit_depth) - 1;
  int i;
  const __m128i kCst8 = _mm_set1_epi16(8);
  const __m128i max = _mm_set1_epi16(max_y);
  const __m128i zero = _mm_setzero_si128();
  for (i = 0; i + 8 <= len; i += 8) {
    const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
    const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
    const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
    const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
    const __m128i a0b1 = _mm_add_epi16(a0, b1);
    const __m128i a1b0 = _mm_add_epi16(a1, b0);
    const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0);  // A0+A1+B0+B1
    const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
    const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1);    // 2*(A0+B1)
    const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0);    // 2*(A1+B0)
    const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
    const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
    const __m128i d0 = _mm_add_epi16(c1, a0);
    const __m128i d1 = _mm_add_epi16(c0, a1);
    const __m128i e0 = _mm_srai_epi16(d0, 1);
    const __m128i e1 = _mm_srai_epi16(d1, 1);
    const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
    const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
    const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
    const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
    const __m128i h0 = _mm_add_epi16(g0, f0);
    const __m128i h1 = _mm_add_epi16(g1, f1);
    const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
    const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
    _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
  }
  for (; i < len; ++i) {
    //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
    // We reuse the common sub-expressions.
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
    out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
  }
}

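// Sign-extends the four lower int16 lanes of 'in' to int32.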
static WEBP_INLINE __m128i s16_to_s32(__m128i in) {
  return _mm_srai_epi32(_mm_unpacklo_epi16(in, in), 16);
}

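// Same filtering as SharpYuvFilterRow16_SSE2, but with intermediates widened
// to 32 bits so that higher bit depths do not overflow. Four samples are
// processed per iteration; the filtered values are packed back to 16 bits
// with saturation, added to 'best_y', then clamped.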
static void SharpYuvFilterRow32_SSE2(const int16_t* A, const int16_t* B,
                                     int len, const uint16_t* best_y,
                                     uint16_t* out, int bit_depth) {
  const int max_y = (1 << bit_depth) - 1;
  int i;
  const __m128i kCst8 = _mm_set1_epi32(8);
  const __m128i max = _mm_set1_epi16(max_y);
  const __m128i zero = _mm_setzero_si128();
  for (i = 0; i + 4 <= len; i += 4) {
    const __m128i a0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 0)));
    const __m128i a1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(A + i + 1)));
    const __m128i b0 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 0)));
    const __m128i b1 = s16_to_s32(_mm_loadl_epi64((const __m128i*)(B + i + 1)));
    const __m128i a0b1 = _mm_add_epi32(a0, b1);
    const __m128i a1b0 = _mm_add_epi32(a1, b0);
    const __m128i a0a1b0b1 = _mm_add_epi32(a0b1, a1b0);  // A0+A1+B0+B1
    const __m128i a0a1b0b1_8 = _mm_add_epi32(a0a1b0b1, kCst8);
    const __m128i a0b1_2 = _mm_add_epi32(a0b1, a0b1);  // 2*(A0+B1)
    const __m128i a1b0_2 = _mm_add_epi32(a1b0, a1b0);  // 2*(A1+B0)
    const __m128i c0 = _mm_srai_epi32(_mm_add_epi32(a0b1_2, a0a1b0b1_8), 3);
    const __m128i c1 = _mm_srai_epi32(_mm_add_epi32(a1b0_2, a0a1b0b1_8), 3);
    const __m128i d0 = _mm_add_epi32(c1, a0);
    const __m128i d1 = _mm_add_epi32(c0, a1);
    const __m128i e0 = _mm_srai_epi32(d0, 1);
    const __m128i e1 = _mm_srai_epi32(d1, 1);
    const __m128i f0 = _mm_unpacklo_epi32(e0, e1);
    const __m128i f1 = _mm_unpackhi_epi32(e0, e1);
    const __m128i g = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
    const __m128i h_16 = _mm_add_epi16(g, _mm_packs_epi32(f0, f1));
    const __m128i final = _mm_max_epi16(_mm_min_epi16(h_16, max), zero);
    _mm_storeu_si128((__m128i*)(out + 2 * i + 0), final);
  }
  for (; i < len; ++i) {
    //   (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
    // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
    // We reuse the common sub-expressions.
    const int a0b1 = A[i + 0] + B[i + 1];
    const int a1b0 = A[i + 1] + B[i + 0];
    const int a0a1b0b1 = a0b1 + a1b0 + 8;
    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    out[2 * i + 0] = clip_SSE2(best_y[2 * i + 0] + v0, max_y);
    out[2 * i + 1] = clip_SSE2(best_y[2 * i + 1] + v1, max_y);
  }
}

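// Dispatches on bit depth: the 16-bit path covers inputs up to 10 bits; above
// that, the 32-bit path is used (presumably to keep the intermediate sums from
// overflowing int16).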
static void SharpYuvFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
                                   const uint16_t* best_y, uint16_t* out,
                                   int bit_depth) {
  if (bit_depth <= 10) {
    SharpYuvFilterRow16_SSE2(A, B, len, best_y, out, bit_depth);
  } else {
    SharpYuvFilterRow32_SSE2(A, B, len, best_y, out, bit_depth);
  }
}

//------------------------------------------------------------------------------

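// Installs the SSE2 variants into the SharpYuv dispatch function pointers.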
extern void InitSharpYuvSSE2(void);

WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvSSE2(void) {
  SharpYuvUpdateY = SharpYuvUpdateY_SSE2;
  SharpYuvUpdateRGB = SharpYuvUpdateRGB_SSE2;
  SharpYuvFilterRow = SharpYuvFilterRow_SSE2;
}
#else  // !WEBP_USE_SSE2

extern void InitSharpYuvSSE2(void);

void InitSharpYuvSSE2(void) {}

#endif  // WEBP_USE_SSE2