| 1 | /* | 
| 2 | Copyright 2018 Google Inc. All Rights Reserved. | 
| 3 |  | 
| 4 | Licensed under the Apache License, Version 2.0 (the "License"); | 
| 5 | you may not use this file except in compliance with the License. | 
| 6 | You may obtain a copy of the License at | 
| 7 |  | 
| 8 |     http://www.apache.org/licenses/LICENSE-2.0 | 
| 9 |  | 
| 10 | Unless required by applicable law or agreed to in writing, software | 
| 11 | distributed under the License is distributed on an "AS IS" BASIS, | 
| 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
| 13 | See the License for the specific language governing permissions and | 
| 14 | limitations under the License. | 
| 15 | */ | 
| 16 |  | 
| 17 | // Prevent Visual Studio from complaining about std::copy_n. | 
| 18 | #if defined(_WIN32) | 
| 19 | #define _SCL_SECURE_NO_WARNINGS | 
| 20 | #endif | 
| 21 |  | 
| 22 | #include "base/simd_utils.h" | 
| 23 |  | 
| 24 | #include <algorithm> | 
| 25 | #include <limits> | 
| 26 |  | 
| 27 | #include "base/constants_and_types.h" | 
| 28 | #include "base/logging.h" | 
| 29 | #include "base/misc_math.h" | 
| 30 | #include "base/simd_macros.h" | 
| 31 |  | 
| 32 |  | 
| 33 | namespace vraudio { | 
| 34 |  | 
| 35 | namespace { | 
| 36 |  | 
| 37 | #ifdef SIMD_NEON | 
| 38 | // Deinterleaving operates on 8 int16s at a time. | 
| 39 | const size_t kSixteenBitSimdLength = SIMD_LENGTH * 2; | 
| 40 | #endif  // SIMD_NEON | 
| 41 |  | 
| 42 | // Float format of max and min values storable in an int16_t, for clamping. | 
| 43 | const float kInt16Max = static_cast<float>(0x7FFF); | 
| 44 | const float kInt16Min = static_cast<float>(-0x7FFF); | 
| 45 |  | 
| 46 | // Conversion factors between float and int16_t (both directions). | 
| 47 | const float kFloatFromInt16 = 1.0f / kInt16Max; | 
| 48 | const float kInt16FromFloat = kInt16Max; | 
| 49 |  | 
| 50 | // Expected SIMD alignment in bytes. | 
| 51 | const size_t kSimdSizeBytes = 16; | 
| 52 |  | 
| 53 | inline size_t GetNumChunks(size_t length) { return length / SIMD_LENGTH; } | 
| 54 |  | 
| 55 | inline size_t GetLeftoverSamples(size_t length) { return length % SIMD_LENGTH; } | 
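|  | // For example, with SIMD_LENGTH == 4 a buffer of 10 samples is processed as | 
|  | // two SIMD chunks plus two leftover samples handled by the scalar epilogues. | 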
| 56 |  | 
| 57 | template <typename T> | 
| 58 | inline bool IsAlignedTemplated(const T* pointer) { | 
| 59 |   return reinterpret_cast<uintptr_t>(pointer) % kSimdSizeBytes == 0; | 
| 60 | } | 
| 61 |  | 
| 62 | #ifdef SIMD_DISABLED | 
| 63 | // Calculates the approximate complex magnitude of z = real + i * imaginary. | 
| 64 | inline void ComplexMagnitude(float real, float imaginary, float* output) { | 
| 65 |   *output = real * real + imaginary * imaginary; | 
| 66 |   // |output| now holds the squared magnitude; invert its reciprocal sqrt. | 
| 67 |   *output = 1.0f / FastReciprocalSqrt(*output); | 
| 68 | } | 
| 69 | #endif  // defined(SIMD_DISABLED) | 
| 70 |  | 
| 71 | }  // namespace | 
| 72 |  | 
| 73 | bool IsAligned(const float* pointer) { | 
| 74 |   return IsAlignedTemplated<float>(pointer); | 
| 75 | } | 
| 76 |  | 
| 77 | bool IsAligned(const int16_t* pointer) { | 
| 78 |   return IsAlignedTemplated<int16_t>(pointer); | 
| 79 | } | 
| 80 |  | 
| 81 | size_t FindNextAlignedArrayIndex(size_t length, size_t type_size_bytes, | 
| 82 |                                  size_t memory_alignment_bytes) { | 
| 83 |   const size_t byte_length = type_size_bytes * length; | 
| 84 |   const size_t unaligned_bytes = byte_length % memory_alignment_bytes; | 
| 85 |   const size_t bytes_to_next_aligned = | 
| 86 |       (unaligned_bytes == 0) ? 0 : memory_alignment_bytes - unaligned_bytes; | 
| 87 |   return (byte_length + bytes_to_next_aligned) / type_size_bytes; | 
| 88 | } | 
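|  | // For example, FindNextAlignedArrayIndex(5, sizeof(float), 16) returns 8: the | 
|  | // next float index whose byte offset (32) is a multiple of 16 bytes. | 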
| 89 |  | 
| 90 | void AddPointwise(size_t length, const float* input_a, const float* input_b, | 
| 91 |                   float* output) { | 
| 92 |   DCHECK(input_a); | 
| 93 |   DCHECK(input_b); | 
| 94 |   DCHECK(output); | 
| 95 |  | 
| 96 |   const SimdVector* input_a_vector = | 
| 97 |       reinterpret_cast<const SimdVector*>(input_a); | 
| 98 |   const SimdVector* input_b_vector = | 
| 99 |       reinterpret_cast<const SimdVector*>(input_b); | 
| 100 |   SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); | 
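|  |   // Note: on the SSE path these vector pointers are only dereferenced once | 
|  |   // alignment has been checked; unaligned data goes through explicit | 
|  |   // _mm_loadu_ps/_mm_storeu_ps calls instead. | 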
| 101 | #ifdef SIMD_SSE | 
| 102 |   const size_t num_chunks = GetNumChunks(length); | 
| 103 |   const bool inputs_aligned = IsAligned(input_a) && IsAligned(input_b); | 
| 104 |   const bool output_aligned = IsAligned(output); | 
| 105 |   if (inputs_aligned && output_aligned) { | 
| 106 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 107 |       output_vector[i] = SIMD_ADD(input_a_vector[i], input_b_vector[i]); | 
| 108 |     } | 
| 109 |   } else if (inputs_aligned) { | 
| 110 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 111 |       const SimdVector output_temp = | 
| 112 |           SIMD_ADD(input_a_vector[i], input_b_vector[i]); | 
| 113 |       _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp); | 
| 114 |     } | 
| 115 |   } else if (output_aligned) { | 
| 116 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 117 |       const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]); | 
| 118 |       const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]); | 
| 119 |       output_vector[i] = SIMD_ADD(input_a_temp, input_b_temp); | 
| 120 |     } | 
| 121 |   } else { | 
| 122 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 123 |       const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]); | 
| 124 |       const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]); | 
| 125 |       const SimdVector output_temp = SIMD_ADD(input_a_temp, input_b_temp); | 
| 126 |       _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp); | 
| 127 |     } | 
| 128 |   } | 
| 129 | #else | 
| 130 |   for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 131 |     output_vector[i] = SIMD_ADD(input_a_vector[i], input_b_vector[i]); | 
| 132 |   } | 
| 133 | #endif  // SIMD_SSE | 
| 134 |  | 
| 135 |   // Add samples at the end that were missed by the SIMD chunking. | 
| 136 |   const size_t leftover_samples = GetLeftoverSamples(length); | 
| 137 |   DCHECK_GE(length, leftover_samples); | 
| 138 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 139 |     output[i] = input_a[i] + input_b[i]; | 
| 140 |   } | 
| 141 | } | 
| 142 |  | 
| 143 | void SubtractPointwise(size_t length, const float* input_a, | 
| 144 |                        const float* input_b, float* output) { | 
| 145 |   DCHECK(input_a); | 
| 146 |   DCHECK(input_b); | 
| 147 |   DCHECK(output); | 
| 148 |  | 
| 149 |   const SimdVector* input_a_vector = | 
| 150 |       reinterpret_cast<const SimdVector*>(input_a); | 
| 151 |   const SimdVector* input_b_vector = | 
| 152 |       reinterpret_cast<const SimdVector*>(input_b); | 
| 153 |   SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); | 
| 154 |  | 
| 155 | #ifdef SIMD_SSE | 
| 156 |   const size_t num_chunks = GetNumChunks(length); | 
| 157 |   const bool inputs_aligned = IsAligned(input_a) && IsAligned(input_b); | 
| 158 |   const bool output_aligned = IsAligned(output); | 
| 159 |   if (inputs_aligned && output_aligned) { | 
| 160 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 161 |       output_vector[i] = SIMD_SUB(input_b_vector[i], input_a_vector[i]); | 
| 162 |     } | 
| 163 |   } else if (inputs_aligned) { | 
| 164 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 165 |       const SimdVector output_temp = | 
| 166 |           SIMD_SUB(input_b_vector[i], input_a_vector[i]); | 
| 167 |       _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp); | 
| 168 |     } | 
| 169 |   } else if (output_aligned) { | 
| 170 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 171 |       const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]); | 
| 172 |       const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]); | 
| 173 |       output_vector[i] = SIMD_SUB(input_b_temp, input_a_temp); | 
| 174 |     } | 
| 175 |   } else { | 
| 176 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 177 |       const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]); | 
| 178 |       const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]); | 
| 179 |       const SimdVector output_temp = SIMD_SUB(input_b_temp, input_a_temp); | 
| 180 |       _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp); | 
| 181 |     } | 
| 182 |   } | 
| 183 | #else | 
| 184 |   for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 185 |     output_vector[i] = SIMD_SUB(input_b_vector[i], input_a_vector[i]); | 
| 186 |   } | 
| 187 | #endif  // SIMD_SSE | 
| 188 |  | 
| 189 |   // Subtract samples at the end that were missed by the SIMD chunking. | 
| 190 |   const size_t leftover_samples = GetLeftoverSamples(length); | 
| 191 |   DCHECK_GE(length, leftover_samples); | 
| 192 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 193 |     output[i] = input_b[i] - input_a[i]; | 
| 194 |   } | 
| 195 | } | 
| 196 |  | 
| 197 | void MultiplyPointwise(size_t length, const float* input_a, | 
| 198 |                        const float* input_b, float* output) { | 
| 199 |   DCHECK(input_a); | 
| 200 |   DCHECK(input_b); | 
| 201 |   DCHECK(output); | 
| 202 |  | 
| 203 |   const SimdVector* input_a_vector = | 
| 204 |       reinterpret_cast<const SimdVector*>(input_a); | 
| 205 |   const SimdVector* input_b_vector = | 
| 206 |       reinterpret_cast<const SimdVector*>(input_b); | 
| 207 |   SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); | 
| 208 |  | 
| 209 | #ifdef SIMD_SSE | 
| 210 |   const size_t num_chunks = GetNumChunks(length); | 
| 211 |   const bool inputs_aligned = IsAligned(input_a) && IsAligned(input_b); | 
| 212 |   const bool output_aligned = IsAligned(output); | 
| 213 |   if (inputs_aligned && output_aligned) { | 
| 214 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 215 |       output_vector[i] = SIMD_MULTIPLY(input_a_vector[i], input_b_vector[i]); | 
| 216 |     } | 
| 217 |   } else if (inputs_aligned) { | 
| 218 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 219 |       const SimdVector output_temp = | 
| 220 |           SIMD_MULTIPLY(input_a_vector[i], input_b_vector[i]); | 
| 221 |       _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp); | 
| 222 |     } | 
| 223 |   } else if (output_aligned) { | 
| 224 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 225 |       const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]); | 
| 226 |       const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]); | 
| 227 |       output_vector[i] = SIMD_MULTIPLY(input_a_temp, input_b_temp); | 
| 228 |     } | 
| 229 |   } else { | 
| 230 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 231 |       const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]); | 
| 232 |       const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]); | 
| 233 |       const SimdVector output_temp = SIMD_MULTIPLY(input_a_temp, input_b_temp); | 
| 234 |       _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp); | 
| 235 |     } | 
| 236 |   } | 
| 237 | #else | 
| 238 |   for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 239 |     output_vector[i] = SIMD_MULTIPLY(input_a_vector[i], input_b_vector[i]); | 
| 240 |   } | 
| 241 | #endif  // SIMD_SSE | 
| 242 |  | 
| 243 |   // Multiply samples at the end that were missed by the SIMD chunking. | 
| 244 |   const size_t leftover_samples = GetLeftoverSamples(length); | 
| 245 |   DCHECK_GE(length, leftover_samples); | 
| 246 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 247 |     output[i] = input_a[i] * input_b[i]; | 
| 248 |   } | 
| 249 | } | 
| 250 |  | 
| 251 | void MultiplyAndAccumulatePointwise(size_t length, const float* input_a, | 
| 252 |                                     const float* input_b, float* accumulator) { | 
| 253 |   DCHECK(input_a); | 
| 254 |   DCHECK(input_b); | 
| 255 |   DCHECK(accumulator); | 
| 256 |  | 
| 257 |   const SimdVector* input_a_vector = | 
| 258 |       reinterpret_cast<const SimdVector*>(input_a); | 
| 259 |   const SimdVector* input_b_vector = | 
| 260 |       reinterpret_cast<const SimdVector*>(input_b); | 
| 261 |   SimdVector* accumulator_vector = reinterpret_cast<SimdVector*>(accumulator); | 
| 262 |  | 
| 263 | #ifdef SIMD_SSE | 
| 264 |   const size_t num_chunks = GetNumChunks(length); | 
| 265 |   const bool inputs_aligned = IsAligned(input_a) && IsAligned(input_b); | 
| 266 |   const bool accumulator_aligned = IsAligned(accumulator); | 
| 267 |   if (inputs_aligned && accumulator_aligned) { | 
| 268 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 269 |       accumulator_vector[i] = SIMD_MULTIPLY_ADD( | 
| 270 |           input_a_vector[i], input_b_vector[i], accumulator_vector[i]); | 
| 271 |     } | 
| 272 |   } else if (inputs_aligned) { | 
| 273 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 274 |       SimdVector accumulator_temp = _mm_loadu_ps(&accumulator[i * SIMD_LENGTH]); | 
| 275 |       accumulator_temp = SIMD_MULTIPLY_ADD(input_a_vector[i], input_b_vector[i], | 
| 276 |                                            accumulator_temp); | 
| 277 |       _mm_storeu_ps(&accumulator[i * SIMD_LENGTH], accumulator_temp); | 
| 278 |     } | 
| 279 |   } else if (accumulator_aligned) { | 
| 280 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 281 |       const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]); | 
| 282 |       const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]); | 
| 283 |       accumulator_vector[i] = | 
| 284 |           SIMD_MULTIPLY_ADD(input_a_temp, input_b_temp, accumulator_vector[i]); | 
| 285 |     } | 
| 286 |   } else { | 
| 287 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 288 |       const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]); | 
| 289 |       const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]); | 
| 290 |       SimdVector accumulator_temp = _mm_loadu_ps(&accumulator[i * SIMD_LENGTH]); | 
| 291 |       accumulator_temp = | 
| 292 |           SIMD_MULTIPLY_ADD(input_a_temp, input_b_temp, accumulator_temp); | 
| 293 |       _mm_storeu_ps(&accumulator[i * SIMD_LENGTH], accumulator_temp); | 
| 294 |     } | 
| 295 |   } | 
| 296 | #else | 
| 297 |   for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 298 |     accumulator_vector[i] = SIMD_MULTIPLY_ADD( | 
| 299 |         input_a_vector[i], input_b_vector[i], accumulator_vector[i]); | 
| 300 |   } | 
| 301 | #endif  // SIMD_SSE | 
| 302 |  | 
| 303 |   // Apply gain and accumulate to samples at the end that were missed by the | 
| 304 |   // SIMD chunking. | 
| 305 |   const size_t leftover_samples = GetLeftoverSamples(length); | 
| 306 |   DCHECK_GE(length, leftover_samples); | 
| 307 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 308 |     accumulator[i] += input_a[i] * input_b[i]; | 
| 309 |   } | 
| 310 | } | 
| 311 |  | 
| 312 | void ScalarMultiply(size_t length, float gain, const float* input, | 
| 313 |                     float* output) { | 
| 314 |   DCHECK(input); | 
| 315 |   DCHECK(output); | 
| 316 |  | 
| 317 |   const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); | 
| 318 |   SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); | 
| 319 |  | 
| 320 |   const SimdVector gain_vector = SIMD_LOAD_ONE_FLOAT(gain); | 
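|  |   // |gain_vector| holds |gain| broadcast across every lane, so each multiply | 
|  |   // below scales SIMD_LENGTH samples at once. | 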
| 321 | #ifdef SIMD_SSE | 
| 322 |   const size_t num_chunks = GetNumChunks(length); | 
| 323 |   const bool input_aligned = IsAligned(input); | 
| 324 |   const bool output_aligned = IsAligned(output); | 
| 325 |   if (input_aligned && output_aligned) { | 
| 326 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 327 |       output_vector[i] = SIMD_MULTIPLY(gain_vector, input_vector[i]); | 
| 328 |     } | 
| 329 |   } else if (input_aligned) { | 
| 330 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 331 |       const SimdVector output_temp = | 
| 332 |           SIMD_MULTIPLY(gain_vector, input_vector[i]); | 
| 333 |       _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp); | 
| 334 |     } | 
| 335 |   } else if (output_aligned) { | 
| 336 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 337 |       const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]); | 
| 338 |       output_vector[i] = SIMD_MULTIPLY(gain_vector, input_temp); | 
| 339 |     } | 
| 340 |   } else { | 
| 341 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 342 |       const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]); | 
| 343 |       const SimdVector output_temp = SIMD_MULTIPLY(gain_vector, input_temp); | 
| 344 |       _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp); | 
| 345 |     } | 
| 346 |   } | 
| 347 | #else | 
| 348 |   for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 349 |     output_vector[i] = SIMD_MULTIPLY(gain_vector, input_vector[i]); | 
| 350 |   } | 
| 351 | #endif  // SIMD_SSE | 
| 352 |  | 
| 353 |   // Apply gain to samples at the end that were missed by the SIMD chunking. | 
| 354 |   const size_t leftover_samples = GetLeftoverSamples(length); | 
| 355 |   DCHECK_GE(length, leftover_samples); | 
| 356 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 357 |     output[i] = input[i] * gain; | 
| 358 |   } | 
| 359 | } | 
| 360 |  | 
| 361 | void ScalarMultiplyAndAccumulate(size_t length, float gain, const float* input, | 
| 362 |                                  float* accumulator) { | 
| 363 |   DCHECK(input); | 
| 364 |   DCHECK(accumulator); | 
| 365 |  | 
| 366 |   const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); | 
| 367 |   SimdVector* accumulator_vector = reinterpret_cast<SimdVector*>(accumulator); | 
| 368 |  | 
| 369 |   const SimdVector gain_vector = SIMD_LOAD_ONE_FLOAT(gain); | 
| 370 | #ifdef SIMD_SSE | 
| 371 |   const size_t num_chunks = GetNumChunks(length); | 
| 372 |   const bool input_aligned = IsAligned(input); | 
| 373 |   const bool accumulator_aligned = IsAligned(accumulator); | 
| 374 |   if (input_aligned && accumulator_aligned) { | 
| 375 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 376 |       accumulator_vector[i] = SIMD_MULTIPLY_ADD(gain_vector, input_vector[i], | 
| 377 |                                                 accumulator_vector[i]); | 
| 378 |     } | 
| 379 |   } else if (input_aligned) { | 
| 380 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 381 |       SimdVector accumulator_temp = _mm_loadu_ps(&accumulator[i * SIMD_LENGTH]); | 
| 382 |       accumulator_temp = | 
| 383 |           SIMD_MULTIPLY_ADD(gain_vector, input_vector[i], accumulator_temp); | 
| 384 |       _mm_storeu_ps(&accumulator[i * SIMD_LENGTH], accumulator_temp); | 
| 385 |     } | 
| 386 |   } else if (accumulator_aligned) { | 
| 387 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 388 |       const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]); | 
| 389 |       accumulator_vector[i] = | 
| 390 |           SIMD_MULTIPLY_ADD(gain_vector, input_temp, accumulator_vector[i]); | 
| 391 |     } | 
| 392 |   } else { | 
| 393 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 394 |       const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]); | 
| 395 |       SimdVector accumulator_temp = _mm_loadu_ps(&accumulator[i * SIMD_LENGTH]); | 
| 396 |       accumulator_temp = | 
| 397 |           SIMD_MULTIPLY_ADD(gain_vector, input_temp, accumulator_temp); | 
| 398 |       _mm_storeu_ps(&accumulator[i * SIMD_LENGTH], accumulator_temp); | 
| 399 |     } | 
| 400 |   } | 
| 401 | #else | 
| 402 |   for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 403 |     accumulator_vector[i] = | 
| 404 |         SIMD_MULTIPLY_ADD(gain_vector, input_vector[i], accumulator_vector[i]); | 
| 405 |   } | 
| 406 | #endif  // SIMD_SSE | 
| 407 |  | 
| 408 |   // Apply gain and accumulate to samples at the end that were missed by the | 
| 409 |   // SIMD chunking. | 
| 410 |   const size_t leftover_samples = GetLeftoverSamples(length); | 
| 411 |   DCHECK_GE(length, leftover_samples); | 
| 412 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 413 |     accumulator[i] += input[i] * gain; | 
| 414 |   } | 
| 415 | } | 
| 416 |  | 
| 417 | void ReciprocalSqrt(size_t length, const float* input, float* output) { | 
| 418 |   DCHECK(input); | 
| 419 |   DCHECK(output); | 
| 420 |  | 
| 421 | #if !defined(SIMD_DISABLED) | 
| 422 |   const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); | 
| 423 |   SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); | 
| 424 | #endif  // !defined(SIMD_DISABLED) | 
| 425 |  | 
| 426 | #ifdef SIMD_SSE | 
| 427 |   const size_t num_chunks = GetNumChunks(length); | 
| 428 |   const bool input_aligned = IsAligned(input); | 
| 429 |   const bool output_aligned = IsAligned(output); | 
| 430 |   if (input_aligned && output_aligned) { | 
| 431 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 432 |       output_vector[i] = SIMD_RECIPROCAL_SQRT(input_vector[i]); | 
| 433 |     } | 
| 434 |   } else if (input_aligned) { | 
| 435 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 436 |       const SimdVector output_temp = SIMD_RECIPROCAL_SQRT(input_vector[i]); | 
| 437 |       _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp); | 
| 438 |     } | 
| 439 |   } else if (output_aligned) { | 
| 440 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 441 |       const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]); | 
| 442 |       output_vector[i] = SIMD_RECIPROCAL_SQRT(input_temp); | 
| 443 |     } | 
| 444 |   } else { | 
| 445 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 446 |       const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]); | 
| 447 |       const SimdVector output_temp = SIMD_RECIPROCAL_SQRT(input_temp); | 
| 448 |       _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp); | 
| 449 |     } | 
| 450 |   } | 
| 451 | #elif defined SIMD_NEON | 
| 452 |   for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 453 |     output_vector[i] = SIMD_RECIPROCAL_SQRT(input_vector[i]); | 
| 454 |   } | 
| 455 | #endif  // SIMD_SSE | 
| 456 |  | 
| 457 |   // Apply to samples at the end that were missed by the SIMD chunking. | 
| 458 |   const size_t leftover_samples = GetLeftoverSamples(length); | 
| 459 |   DCHECK_GE(length, leftover_samples); | 
| 460 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 461 |     output[i] = FastReciprocalSqrt(input[i]); | 
| 462 |   } | 
| 463 | } | 
| 464 |  | 
| 465 | void Sqrt(size_t length, const float* input, float* output) { | 
| 466 |   DCHECK(input); | 
| 467 |   DCHECK(output); | 
| 468 |  | 
| 469 | #if !defined(SIMD_DISABLED) | 
| 470 |   const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); | 
| 471 |   SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); | 
| 472 | #endif  // !defined(SIMD_DISABLED) | 
| 473 |  | 
| 474 | #ifdef SIMD_SSE | 
| 475 |   const size_t num_chunks = GetNumChunks(length); | 
| 476 |   const bool input_aligned = IsAligned(input); | 
| 477 |   const bool output_aligned = IsAligned(output); | 
| 478 |   if (input_aligned && output_aligned) { | 
| 479 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 480 |       output_vector[i] = SIMD_SQRT(input_vector[i]); | 
| 481 |     } | 
| 482 |   } else if (input_aligned) { | 
| 483 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 484 |       const SimdVector output_temp = SIMD_SQRT(input_vector[i]); | 
| 485 |       _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp); | 
| 486 |     } | 
| 487 |   } else if (output_aligned) { | 
| 488 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 489 |       const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]); | 
| 490 |       output_vector[i] = SIMD_SQRT(input_temp); | 
| 491 |     } | 
| 492 |   } else { | 
| 493 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 494 |       const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]); | 
| 495 |       const SimdVector output_temp = SIMD_SQRT(input_temp); | 
| 496 |       _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp); | 
| 497 |     } | 
| 498 |   } | 
| 499 | #elif defined SIMD_NEON | 
| 500 |   for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 501 |     // This should be faster than using a sqrt method: https://goo.gl/XRKwFp | 
| 502 |     output_vector[i] = SIMD_SQRT(input_vector[i]); | 
| 503 |   } | 
| 504 | #endif  // SIMD_SSE | 
| 505 |  | 
| 506 |   // Apply to samples at the end that were missed by the SIMD chunking. | 
| 507 |   const size_t leftover_samples = GetLeftoverSamples(length); | 
| 508 |   DCHECK_GE(length, leftover_samples); | 
| 509 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 510 |     output[i] = 1.0f / FastReciprocalSqrt(input[i]); | 
| 511 |   } | 
| 512 | } | 
| 513 |  | 
| 514 | void ApproxComplexMagnitude(size_t length, const float* input, float* output) { | 
| 515 |   DCHECK(input); | 
| 516 |   DCHECK(output); | 
| 517 |  | 
| 518 | #if !defined(SIMD_DISABLED) | 
| 519 |   const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); | 
| 520 |   SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); | 
| 521 |   const size_t num_chunks = GetNumChunks(length); | 
| 522 |   const bool input_aligned = IsAligned(input); | 
| 523 |   const bool output_aligned = IsAligned(output); | 
| 524 | #endif  // !defined(SIMD_DISABLED) | 
| 525 |  | 
| 526 | #ifdef SIMD_SSE | 
| 527 |   if (input_aligned && output_aligned) { | 
| 528 |     for (size_t out_index = 0; out_index < num_chunks; ++out_index) { | 
| 529 |       const size_t first_index = out_index * 2; | 
| 530 |       const size_t second_index = first_index + 1; | 
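|  |       // Each input vector holds two interleaved complex values | 
|  |       // (re0, im0, re1, im1). After squaring, the shuffles below gather the | 
|  |       // squared real parts (_MM_SHUFFLE(2, 0, 2, 0)) and squared imaginary | 
|  |       // parts (_MM_SHUFFLE(3, 1, 3, 1)), so one add and one sqrt yield four | 
|  |       // magnitudes. | 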
| 531 |       const SimdVector squared_1 = | 
| 532 |           SIMD_MULTIPLY(input_vector[first_index], input_vector[first_index]); | 
| 533 |       const SimdVector squared_2 = | 
| 534 |           SIMD_MULTIPLY(input_vector[second_index], input_vector[second_index]); | 
| 535 |       const SimdVector unshuffled_1 = | 
| 536 |           _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(2, 0, 2, 0)); | 
| 537 |       const SimdVector unshuffled_2 = | 
| 538 |           _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(3, 1, 3, 1)); | 
| 539 |       output_vector[out_index] = SIMD_ADD(unshuffled_1, unshuffled_2); | 
| 540 |       output_vector[out_index] = SIMD_SQRT(output_vector[out_index]); | 
| 541 |     } | 
| 542 |   } else if (input_aligned) { | 
| 543 |     for (size_t out_index = 0; out_index < num_chunks; ++out_index) { | 
| 544 |       const size_t first_index = out_index * 2; | 
| 545 |       const size_t second_index = first_index + 1; | 
| 546 |       const SimdVector squared_1 = | 
| 547 |           SIMD_MULTIPLY(input_vector[first_index], input_vector[first_index]); | 
| 548 |       const SimdVector squared_2 = | 
| 549 |           SIMD_MULTIPLY(input_vector[second_index], input_vector[second_index]); | 
| 550 |       const SimdVector unshuffled_1 = | 
| 551 |           _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(2, 0, 2, 0)); | 
| 552 |       const SimdVector unshuffled_2 = | 
| 553 |           _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(3, 1, 3, 1)); | 
| 554 |       SimdVector output_temp = SIMD_ADD(unshuffled_1, unshuffled_2); | 
| 555 |       output_temp = SIMD_SQRT(output_temp); | 
| 556 |       _mm_storeu_ps(&output[out_index * SIMD_LENGTH], output_temp); | 
| 557 |     } | 
| 558 |   } else if (output_aligned) { | 
| 559 |     for (size_t out_index = 0; out_index < num_chunks; ++out_index) { | 
| 560 |       const size_t first_index = out_index * 2; | 
| 561 |       const size_t second_index = first_index + 1; | 
| 562 |       const SimdVector first_temp = | 
| 563 |           _mm_loadu_ps(&input[first_index * SIMD_LENGTH]); | 
| 564 |       const SimdVector second_temp = | 
| 565 |           _mm_loadu_ps(&input[second_index * SIMD_LENGTH]); | 
| 566 |       const SimdVector squared_1 = SIMD_MULTIPLY(first_temp, first_temp); | 
| 567 |       const SimdVector squared_2 = SIMD_MULTIPLY(second_temp, second_temp); | 
| 568 |       const SimdVector unshuffled_1 = | 
| 569 |           _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(2, 0, 2, 0)); | 
| 570 |       const SimdVector unshuffled_2 = | 
| 571 |           _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(3, 1, 3, 1)); | 
| 572 |       output_vector[out_index] = SIMD_ADD(unshuffled_1, unshuffled_2); | 
| 573 |       output_vector[out_index] = SIMD_SQRT(output_vector[out_index]); | 
| 574 |     } | 
| 575 |   } else { | 
| 576 |     for (size_t out_index = 0; out_index < num_chunks; ++out_index) { | 
| 577 |       const size_t first_index = out_index * 2; | 
| 578 |       const size_t second_index = first_index + 1; | 
| 579 |       const SimdVector first_temp = | 
| 580 |           _mm_loadu_ps(&input[first_index * SIMD_LENGTH]); | 
| 581 |       const SimdVector second_temp = | 
| 582 |           _mm_loadu_ps(&input[second_index * SIMD_LENGTH]); | 
| 583 |       const SimdVector squared_1 = SIMD_MULTIPLY(first_temp, first_temp); | 
| 584 |       const SimdVector squared_2 = SIMD_MULTIPLY(second_temp, second_temp); | 
| 585 |       const SimdVector unshuffled_1 = | 
| 586 |           _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(2, 0, 2, 0)); | 
| 587 |       const SimdVector unshuffled_2 = | 
| 588 |           _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(3, 1, 3, 1)); | 
| 589 |       SimdVector output_temp = SIMD_ADD(unshuffled_1, unshuffled_2); | 
| 590 |       output_temp = SIMD_SQRT(output_temp); | 
| 591 |       _mm_storeu_ps(&output[out_index * SIMD_LENGTH], output_temp); | 
| 592 |     } | 
| 593 |   } | 
| 594 | #elif defined SIMD_NEON | 
| 595 |   if (input_aligned && output_aligned) { | 
| 596 |     for (size_t out_index = 0; out_index < num_chunks; ++out_index) { | 
| 597 |       const size_t first_index = out_index * 2; | 
| 598 |       const size_t second_index = first_index + 1; | 
| 599 |       const SimdVector squared_1 = | 
| 600 |           SIMD_MULTIPLY(input_vector[first_index], input_vector[first_index]); | 
| 601 |       const SimdVector squared_2 = | 
| 602 |           SIMD_MULTIPLY(input_vector[second_index], input_vector[second_index]); | 
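|  |       // vuzpq_f32 de-interleaves the squared pairs: val[0] collects the | 
|  |       // squared real parts, val[1] the squared imaginary parts. | 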
| 603 |       const float32x4x2_t unshuffled = vuzpq_f32(squared_1, squared_2); | 
| 604 |       output_vector[out_index] = SIMD_ADD(unshuffled.val[0], unshuffled.val[1]); | 
| 605 |       output_vector[out_index] = SIMD_SQRT(output_vector[out_index]); | 
| 606 |     } | 
| 607 |   } else if (input_aligned) { | 
| 608 |     for (size_t out_index = 0; out_index < num_chunks; ++out_index) { | 
| 609 |       const size_t first_index = out_index * 2; | 
| 610 |       const size_t second_index = first_index + 1; | 
| 611 |       const SimdVector squared_1 = | 
| 612 |           SIMD_MULTIPLY(input_vector[first_index], input_vector[first_index]); | 
| 613 |       const SimdVector squared_2 = | 
| 614 |           SIMD_MULTIPLY(input_vector[second_index], input_vector[second_index]); | 
| 615 |       const float32x4x2_t unshuffled = vuzpq_f32(squared_1, squared_2); | 
| 616 |       SimdVector output_temp = SIMD_ADD(unshuffled.val[0], unshuffled.val[1]); | 
| 617 |       output_temp = SIMD_SQRT(output_temp); | 
| 618 |       vst1q_f32(&output[out_index * SIMD_LENGTH], output_temp); | 
| 619 |     } | 
| 620 |   } else if (output_aligned) { | 
| 621 |     for (size_t out_index = 0; out_index < num_chunks; ++out_index) { | 
| 622 |       const size_t first_index = out_index * 2; | 
| 623 |       const size_t second_index = first_index + 1; | 
| 624 |       const SimdVector first_temp = | 
| 625 |           vld1q_f32(&input[first_index * SIMD_LENGTH]); | 
| 626 |       const SimdVector second_temp = | 
| 627 |           vld1q_f32(&input[second_index * SIMD_LENGTH]); | 
| 628 |       const SimdVector squared_1 = SIMD_MULTIPLY(first_temp, first_temp); | 
| 629 |       const SimdVector squared_2 = SIMD_MULTIPLY(second_temp, second_temp); | 
| 630 |       const float32x4x2_t unshuffled = vuzpq_f32(squared_1, squared_2); | 
| 631 |       output_vector[out_index] = SIMD_ADD(unshuffled.val[0], unshuffled.val[1]); | 
| 632 |       output_vector[out_index] = SIMD_SQRT(output_vector[out_index]); | 
| 633 |     } | 
| 634 |   } else { | 
| 635 |     for (size_t out_index = 0; out_index < num_chunks; ++out_index) { | 
| 636 |       const size_t first_index = out_index * 2; | 
| 637 |       const size_t second_index = first_index + 1; | 
| 638 |       const SimdVector first_temp = | 
| 639 |           vld1q_f32(&input[first_index * SIMD_LENGTH]); | 
| 640 |       const SimdVector second_temp = | 
| 641 |           vld1q_f32(&input[second_index * SIMD_LENGTH]); | 
| 642 |       const SimdVector squared_1 = SIMD_MULTIPLY(first_temp, first_temp); | 
| 643 |       const SimdVector squared_2 = SIMD_MULTIPLY(second_temp, second_temp); | 
| 644 |       const float32x4x2_t unshuffled = vuzpq_f32(squared_1, squared_2); | 
| 645 |       SimdVector output_temp = SIMD_ADD(unshuffled.val[0], unshuffled.val[1]); | 
| 646 |       output_temp = SIMD_SQRT(output_temp); | 
| 647 |       vst1q_f32(&output[out_index * SIMD_LENGTH], output_temp); | 
| 648 |     } | 
| 649 |   } | 
| 650 | #endif  // SIMD_SSE | 
| 651 |  | 
| 652 |   // Apply to samples at the end that were missed by the SIMD chunking. | 
| 653 |   const size_t leftover_samples = GetLeftoverSamples(length); | 
| 654 |   DCHECK_GE(length, leftover_samples); | 
| 655 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 656 |     const size_t real_index = i * 2; | 
| 657 |     const size_t imag_index = real_index + 1; | 
| 658 |     const float squared_sum = (input[real_index] * input[real_index]) + | 
| 659 |                               (input[imag_index] * input[imag_index]); | 
| 660 |     output[i] = 1.0f / FastReciprocalSqrt(squared_sum); | 
| 661 |   } | 
| 662 | } | 
| 663 |  | 
| 664 | void ComplexInterleavedFormatFromMagnitudeAndSinCosPhase( | 
| 665 |     size_t length, const float* magnitude, const float* cos_phase, | 
| 666 |     const float* sin_phase, float* complex_interleaved_format_output) { | 
| 667 |   size_t leftover_samples = 0; | 
| 668 | #ifdef SIMD_NEON | 
| 669 |   if (IsAligned(complex_interleaved_format_output) && IsAligned(cos_phase) && | 
| 670 |       IsAligned(sin_phase) && IsAligned(magnitude)) { | 
| 671 |     const SimdVector* cos_vec = reinterpret_cast<const SimdVector*>(cos_phase); | 
| 672 |     const SimdVector* sin_vec = reinterpret_cast<const SimdVector*>(sin_phase); | 
| 673 |     const SimdVector* magnitude_vec = | 
| 674 |         reinterpret_cast<const SimdVector*>(magnitude); | 
| 675 |  | 
| 676 |     const size_t num_chunks = GetNumChunks(length); | 
| 677 |     float32x4x2_t interleaved_pair; | 
| 678 |  | 
| 679 |     SimdVector* interleaved_vec = | 
| 680 |         reinterpret_cast<SimdVector*>(complex_interleaved_format_output); | 
| 681 |     for (size_t i = 0, j = 0; j < num_chunks; ++i, j += 2) { | 
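|  |       // vzipq_f32 interleaves the cosine and sine vectors into (cos, sin) | 
|  |       // pairs, matching the (real, imaginary) layout of the output. | 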
| 682 |       interleaved_pair = vzipq_f32(cos_vec[i], sin_vec[i]); | 
| 683 |       interleaved_vec[j] = | 
| 684 |           SIMD_MULTIPLY(interleaved_pair.val[0], magnitude_vec[i]); | 
| 685 |       interleaved_vec[j + 1] = | 
| 686 |           SIMD_MULTIPLY(interleaved_pair.val[1], magnitude_vec[i]); | 
| 687 |     } | 
| 688 |  | 
| 689 |     leftover_samples = GetLeftoverSamples(length); | 
| 690 |   } | 
| 691 | #endif  // SIMD_NEON | 
| 692 |   DCHECK_EQ(leftover_samples % 2U, 0U); | 
| 693 |   for (size_t i = leftover_samples, j = leftover_samples / 2; i < length; | 
| 694 |        i += 2, ++j) { | 
| 695 |     const size_t imaginary_offset = i + 1; | 
| 696 |     complex_interleaved_format_output[i] = magnitude[j] * cos_phase[j]; | 
| 697 |     complex_interleaved_format_output[imaginary_offset] = | 
| 698 |         magnitude[j] * sin_phase[j]; | 
| 699 |   } | 
| 700 | } | 
| 701 |  | 
| 702 | void StereoFromMonoSimd(size_t length, const float* mono, float* left, | 
| 703 |                         float* right) { | 
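|  |   // Scaling by kInverseSqrtTwo keeps the stereo pair at the same overall | 
|  |   // power as the mono input. | 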
| 704 |   ScalarMultiply(length, kInverseSqrtTwo, mono, left); | 
| 705 |   std::copy_n(left, length, right); | 
| 706 | } | 
| 707 |  | 
| 708 | void MonoFromStereoSimd(size_t length, const float* left, const float* right, | 
| 709 |                         float* mono) { | 
| 710 |   DCHECK(left); | 
| 711 |   DCHECK(right); | 
| 712 |   DCHECK(mono); | 
| 713 |  | 
| 714 |   const SimdVector* left_vector = reinterpret_cast<const SimdVector*>(left); | 
| 715 |   const SimdVector* right_vector = reinterpret_cast<const SimdVector*>(right); | 
| 716 |   SimdVector* mono_vector = reinterpret_cast<SimdVector*>(mono); | 
| 717 |  | 
| 718 |   const SimdVector inv_root_two_vec = SIMD_LOAD_ONE_FLOAT(kInverseSqrtTwo); | 
| 719 | #ifdef SIMD_SSE | 
| 720 |   const size_t num_chunks = GetNumChunks(length); | 
| 721 |   const bool inputs_aligned = IsAligned(left) && IsAligned(right); | 
| 722 |   const bool mono_aligned = IsAligned(mono); | 
| 723 |   if (inputs_aligned && mono_aligned) { | 
| 724 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 725 |       mono_vector[i] = SIMD_MULTIPLY(inv_root_two_vec, | 
| 726 |                                      SIMD_ADD(left_vector[i], right_vector[i])); | 
| 727 |     } | 
| 728 |   } else if (inputs_aligned) { | 
| 729 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 730 |       const SimdVector mono_temp = SIMD_MULTIPLY( | 
| 731 |           inv_root_two_vec, SIMD_ADD(left_vector[i], right_vector[i])); | 
| 732 |       _mm_storeu_ps(&mono[i * SIMD_LENGTH], mono_temp); | 
| 733 |     } | 
| 734 |   } else if (mono_aligned) { | 
| 735 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 736 |       const SimdVector left_temp = _mm_loadu_ps(&left[i * SIMD_LENGTH]); | 
| 737 |       const SimdVector right_temp = _mm_loadu_ps(&right[i * SIMD_LENGTH]); | 
| 738 |       mono_vector[i] = | 
| 739 |           SIMD_MULTIPLY(inv_root_two_vec, SIMD_ADD(left_temp, right_temp)); | 
| 740 |     } | 
| 741 |   } else { | 
| 742 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 743 |       const SimdVector left_temp = _mm_loadu_ps(&left[i * SIMD_LENGTH]); | 
| 744 |       const SimdVector right_temp = _mm_loadu_ps(&right[i * SIMD_LENGTH]); | 
| 745 |       const SimdVector mono_temp = | 
| 746 |           SIMD_MULTIPLY(inv_root_two_vec, SIMD_ADD(left_temp, right_temp)); | 
| 747 |       _mm_storeu_ps(&mono[i * SIMD_LENGTH], mono_temp); | 
| 748 |     } | 
| 749 |   } | 
| 750 | #else | 
| 751 |   for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 752 |     mono_vector[i] = SIMD_MULTIPLY(inv_root_two_vec, | 
| 753 |                                    SIMD_ADD(left_vector[i], right_vector[i])); | 
| 754 |   } | 
| 755 | #endif  // SIMD_SSE | 
| 756 |   const size_t leftover_samples = GetLeftoverSamples(length); | 
| 757 |   // Downmix samples at the end that were missed by the SIMD chunking. | 
| 758 |   DCHECK_GE(length, leftover_samples); | 
| 759 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 760 |     mono[i] = kInverseSqrtTwo * (left[i] + right[i]); | 
| 761 |   } | 
| 762 | } | 
| 763 |  | 
| 764 | #ifdef SIMD_NEON | 
| 765 |  | 
| 766 | void Int16FromFloat(size_t length, const float* input, int16_t* output) { | 
| 767 |   DCHECK(input); | 
| 768 |   DCHECK(output); | 
| 769 |  | 
| 771 |   const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); | 
| 772 |   int16x4_t* output_vector = reinterpret_cast<int16x4_t*>(output); | 
| 773 |  | 
| 774 |   // A temporary 32 bit integer vector is needed as we only have intrinsics to | 
| 775 |   // convert from 32 bit floats to 32 bit ints. Then truncate to 16 bit ints. | 
| 776 |   int32x4_t temporary_wide_vector; | 
| 777 |   SimdVector temporary_float_vector; | 
| 778 |  | 
| 779 |   const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kInt16FromFloat); | 
| 780 |  | 
| 781 |   for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 782 |     temporary_float_vector = SIMD_MULTIPLY(scaling_vector, input_vector[i]); | 
| 783 |     temporary_wide_vector = vcvtq_s32_f32(temporary_float_vector); | 
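|  |     // vqmovn_s32 narrows to int16 with signed saturation, so out-of-range | 
|  |     // samples are clamped without an explicit min/max. | 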
| 784 |     output_vector[i] = vqmovn_s32(temporary_wide_vector); | 
| 785 |   } | 
| 786 |  | 
| 787 |   // The remainder. | 
| 788 |   const size_t leftover_samples = GetLeftoverSamples(length); | 
| 789 |   DCHECK_GE(length, leftover_samples); | 
| 790 |   float temp_float; | 
| 791 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 792 |     temp_float = input[i] * kInt16FromFloat; | 
| 793 |     temp_float = std::min(kInt16Max, std::max(kInt16Min, temp_float)); | 
| 794 |     output[i] = static_cast<int16_t>(temp_float); | 
| 795 |   } | 
| 796 | } | 
| 797 |  | 
| 798 | void FloatFromInt16(size_t length, const int16_t* input, float* output) { | 
| 799 |   DCHECK(input); | 
| 800 |   DCHECK(output); | 
| 801 |  | 
| 802 |   size_t leftover_samples = length; | 
| 803 |   const bool input_aligned = IsAligned(input); | 
| 804 |   const bool output_aligned = IsAligned(output); | 
| 805 |   if (input_aligned || output_aligned) { | 
| 806 |     const int16x4_t* input_vector = reinterpret_cast<const int16x4_t*>(input); | 
| 807 |     SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); | 
| 808 |  | 
| 809 |     int16x4_t temporary_narrow_vector; | 
| 810 |     SimdVector temporary_float_vector; | 
| 811 |     int32x4_t temporary_wide_vector; | 
| 812 |     const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kFloatFromInt16); | 
| 813 |  | 
| 814 |     if (input_aligned && output_aligned) { | 
| 815 |       for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 816 |         temporary_wide_vector = vmovl_s16(input_vector[i]); | 
| 817 |         output_vector[i] = vcvtq_f32_s32(temporary_wide_vector); | 
| 818 |         output_vector[i] = SIMD_MULTIPLY(scaling_vector, output_vector[i]); | 
| 819 |       } | 
| 820 |     } else if (input_aligned) { | 
| 821 |       for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 822 |         temporary_wide_vector = vmovl_s16(input_vector[i]); | 
| 823 |         temporary_float_vector = vcvtq_f32_s32(temporary_wide_vector); | 
| 824 |         temporary_float_vector = | 
| 825 |             SIMD_MULTIPLY(scaling_vector, temporary_float_vector); | 
| 826 |         vst1q_f32(&output[i * SIMD_LENGTH], temporary_float_vector); | 
| 827 |       } | 
| 828 |     } else { | 
| 829 |       for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 830 |         temporary_narrow_vector = vld1_s16(&input[i * SIMD_LENGTH]); | 
| 831 |         temporary_wide_vector = vmovl_s16(temporary_narrow_vector); | 
| 832 |         output_vector[i] = vcvtq_f32_s32(temporary_wide_vector); | 
| 833 |         output_vector[i] = SIMD_MULTIPLY(scaling_vector, output_vector[i]); | 
| 834 |       } | 
| 835 |     } | 
| 836 |     leftover_samples = GetLeftoverSamples(length); | 
| 837 |   } | 
| 838 |  | 
| 839 |   // The remainder. | 
| 840 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 841 |     output[i] = static_cast<float>(input[i]) * kFloatFromInt16; | 
| 842 |   } | 
| 843 | } | 
| 844 |  | 
| 845 | #elif (defined SIMD_SSE && !defined(_MSC_VER)) | 
| 846 |  | 
| 847 | void Int16FromFloat(size_t length, const float* input, int16_t* output) { | 
| 848 |   DCHECK(input); | 
| 849 |   DCHECK(output); | 
| 850 |  | 
| 851 |   size_t leftover_samples = length; | 
| 852 |   const bool input_aligned = IsAligned(input); | 
| 853 |   const bool output_aligned = IsAligned(output); | 
| 854 |   if (output_aligned) { | 
| 855 |     const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); | 
| 856 |     __m64* output_vector = reinterpret_cast<__m64*>(output); | 
| 857 |  | 
| 858 |     const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kInt16FromFloat); | 
| 859 |     const SimdVector min_vector = SIMD_LOAD_ONE_FLOAT(kInt16Min); | 
| 860 |     const SimdVector max_vector = SIMD_LOAD_ONE_FLOAT(kInt16Max); | 
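|  |     // Clamping to [kInt16Min, kInt16Max] before conversion keeps the | 
|  |     // float-to-int16 result within the representable range. | 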
| 861 |  | 
| 862 |     SimdVector temporary_float_vector; | 
| 863 |  | 
| 864 |     if (input_aligned) { | 
| 865 |       for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 866 |         temporary_float_vector = SIMD_MULTIPLY(scaling_vector, input_vector[i]); | 
| 867 |         temporary_float_vector = _mm_max_ps(temporary_float_vector, min_vector); | 
| 868 |         temporary_float_vector = _mm_min_ps(temporary_float_vector, max_vector); | 
| 869 |         output_vector[i] = _mm_cvtps_pi16(temporary_float_vector); | 
| 870 |       } | 
| 871 |     } else { | 
| 872 |       for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 873 |         temporary_float_vector = _mm_loadu_ps(&input[i * SIMD_LENGTH]); | 
| 874 |         temporary_float_vector = | 
| 875 |             SIMD_MULTIPLY(scaling_vector, temporary_float_vector); | 
| 876 |         temporary_float_vector = _mm_max_ps(temporary_float_vector, min_vector); | 
| 877 |         temporary_float_vector = _mm_min_ps(temporary_float_vector, max_vector); | 
| 878 |         output_vector[i] = _mm_cvtps_pi16(temporary_float_vector); | 
| 879 |       } | 
| 880 |     } | 
| 881 |     // There is no easy way to store the 16-bit ints directly, so we don't | 
| 882 |     // have an |input_aligned|-only case. | 
| 883 |     leftover_samples = GetLeftoverSamples(length); | 
| 884 |   } | 
| 885 |  | 
| 886 |   // The remainder. | 
| 887 |   float temp_float; | 
| 888 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 889 |     temp_float = input[i] * kInt16FromFloat; | 
| 890 |     temp_float = std::min(kInt16Max, std::max(kInt16Min, temp_float)); | 
| 891 |     output[i] = static_cast<int16_t>(temp_float); | 
| 892 |   } | 
| 893 | } | 
| 894 |  | 
| 895 | void FloatFromInt16(size_t length, const int16_t* input, float* output) { | 
| 896 |   DCHECK(input); | 
| 897 |   DCHECK(output); | 
| 898 |  | 
| 899 |   size_t leftover_samples = length; | 
| 900 |   const bool input_aligned = IsAligned(input); | 
| 901 |   const bool output_aligned = IsAligned(output); | 
| 902 |   if (input_aligned) { | 
| 903 |     SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); | 
| 904 |     const __m64* input_vector = reinterpret_cast<const __m64*>(input); | 
| 905 |  | 
| 906 |     const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kFloatFromInt16); | 
| 907 |  | 
| 908 |     if (output_aligned) { | 
| 909 |       for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 910 |         output_vector[i] = _mm_cvtpi16_ps(input_vector[i]); | 
| 911 |         output_vector[i] = SIMD_MULTIPLY(scaling_vector, output_vector[i]); | 
| 912 |       } | 
| 913 |     } else { | 
| 914 |       SimdVector temporary_float_vector; | 
| 915 |       for (size_t i = 0; i < GetNumChunks(length); ++i) { | 
| 916 |         temporary_float_vector = _mm_cvtpi16_ps(input_vector[i]); | 
| 917 |         temporary_float_vector = | 
| 918 |             SIMD_MULTIPLY(scaling_vector, temporary_float_vector); | 
| 919 |         _mm_storeu_ps(&output[i * SIMD_LENGTH], temporary_float_vector); | 
| 920 |       } | 
| 921 |     } | 
| 922 |     // There is no easy way to load the 16-bit ints directly, so we don't | 
| 923 |     // have an |output_aligned|-only case. | 
| 924 |     leftover_samples = GetLeftoverSamples(length); | 
| 925 |   } | 
| 926 |  | 
| 927 |   // The remainder. | 
| 928 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 929 |     output[i] = static_cast<float>(input[i]) * kFloatFromInt16; | 
| 930 |   } | 
| 931 | } | 
| 932 |  | 
| 933 | #else  // SIMD disabled or Windows build. | 
| 934 |  | 
| 935 | void Int16FromFloat(size_t length, const float* input, int16_t* output) { | 
| 936 |   DCHECK(input); | 
| 937 |   DCHECK(output); | 
| 938 |  | 
| 939 |   float temp_float; | 
| 940 |   for (size_t i = 0; i < length; ++i) { | 
| 941 |     temp_float = input[i] * kInt16FromFloat; | 
| 942 |     temp_float = std::min(kInt16Max, std::max(kInt16Min, temp_float)); | 
| 943 |     output[i] = static_cast<int16_t>(temp_float); | 
| 944 |   } | 
| 945 | } | 
| 946 |  | 
| 947 | void FloatFromInt16(size_t length, const int16_t* input, float* output) { | 
| 948 |   DCHECK(input); | 
| 949 |   DCHECK(output); | 
| 950 |  | 
| 951 |   for (size_t i = 0; i < length; ++i) { | 
| 952 |     output[i] = static_cast<float>(input[i]) * kFloatFromInt16; | 
| 953 |   } | 
| 954 | } | 
| 955 |  | 
| 956 | #endif  // SIMD_NEON | 
| 957 |  | 
| 958 | void InterleaveStereo(size_t length, const int16_t* channel_0, | 
| 959 |                       const int16_t* channel_1, int16_t* interleaved_buffer) { | 
| 960 |   DCHECK(interleaved_buffer); | 
| 961 |   DCHECK(channel_0); | 
| 962 |   DCHECK(channel_1); | 
| 963 |  | 
| 964 |   size_t leftover_samples = length; | 
| 965 | #ifdef SIMD_NEON | 
| 966 |   if (IsAligned(interleaved_buffer) && IsAligned(channel_0) && | 
| 967 |       IsAligned(channel_1)) { | 
| 968 |     const int16x8_t* channel_0_vec = | 
| 969 |         reinterpret_cast<const int16x8_t*>(channel_0); | 
| 970 |     const int16x8_t* channel_1_vec = | 
| 971 |         reinterpret_cast<const int16x8_t*>(channel_1); | 
| 972 |  | 
| 973 |     const size_t num_chunks = length / kSixteenBitSimdLength; | 
| 974 |     int16x8x2_t interleaved_pair; | 
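|  |     // vzipq_s16 interleaves eight samples from each channel into two vectors | 
|  |     // laid out as (c0[0], c1[0], c0[1], c1[1], ...). | 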
| 975 |  | 
| 976 |     int16x8_t* interleaved_vec = | 
| 977 |         reinterpret_cast<int16x8_t*>(interleaved_buffer); | 
| 978 |     for (size_t i = 0, j = 0; i < num_chunks; ++i, j += 2) { | 
| 979 |       interleaved_pair = vzipq_s16(channel_0_vec[i], channel_1_vec[i]); | 
| 980 |       interleaved_vec[j] = interleaved_pair.val[0]; | 
| 981 |       interleaved_vec[j + 1] = interleaved_pair.val[1]; | 
| 982 |     } | 
| 983 |  | 
| 984 |     leftover_samples = length % kSixteenBitSimdLength; | 
| 985 |   } | 
| 986 | #endif  // SIMD_NEON | 
| 987 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 988 |     const size_t interleaved_index = kNumStereoChannels * i; | 
| 989 |     interleaved_buffer[interleaved_index] = channel_0[i]; | 
| 990 |     interleaved_buffer[interleaved_index + 1] = channel_1[i]; | 
| 991 |   } | 
| 992 | } | 
| 993 |  | 
| 994 | void InterleaveStereo(size_t length, const float* channel_0, | 
| 995 |                       const float* channel_1, float* interleaved_buffer) { | 
| 996 |   DCHECK(interleaved_buffer); | 
| 997 |   DCHECK(channel_0); | 
| 998 |   DCHECK(channel_1); | 
| 999 |  | 
| 1000 |   size_t leftover_samples = length; | 
| 1001 | #ifdef SIMD_NEON | 
| 1002 |   if (IsAligned(interleaved_buffer) && IsAligned(channel_0) && | 
| 1003 |       IsAligned(channel_1)) { | 
| 1004 |     const SimdVector* channel_0_vec = | 
| 1005 |         reinterpret_cast<const SimdVector*>(channel_0); | 
| 1006 |     const SimdVector* channel_1_vec = | 
| 1007 |         reinterpret_cast<const SimdVector*>(channel_1); | 
| 1008 |  | 
| 1009 |     const size_t num_chunks = GetNumChunks(length); | 
| 1010 |     float32x4x2_t interleaved_pair; | 
| 1011 |  | 
| 1012 |     SimdVector* interleaved_vec = | 
| 1013 |         reinterpret_cast<SimdVector*>(interleaved_buffer); | 
| 1014 |     for (size_t i = 0, j = 0; i < num_chunks; ++i, j += 2) { | 
| 1015 |       interleaved_pair = vzipq_f32(channel_0_vec[i], channel_1_vec[i]); | 
| 1016 |       interleaved_vec[j] = interleaved_pair.val[0]; | 
| 1017 |       interleaved_vec[j + 1] = interleaved_pair.val[1]; | 
| 1018 |     } | 
| 1019 |  | 
| 1020 |     leftover_samples = GetLeftoverSamples(length); | 
| 1021 |   } | 
| 1022 | #endif  // SIMD_NEON | 
| 1023 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 1024 |     const size_t interleaved_index = kNumStereoChannels * i; | 
| 1025 |     interleaved_buffer[interleaved_index] = channel_0[i]; | 
| 1026 |     interleaved_buffer[interleaved_index + 1] = channel_1[i]; | 
| 1027 |   } | 
| 1028 | } | 
| 1029 |  | 
| 1030 | void InterleaveStereo(size_t length, const float* channel_0, | 
| 1031 |                       const float* channel_1, int16_t* interleaved_buffer) { | 
| 1032 |   DCHECK(interleaved_buffer); | 
| 1033 |   DCHECK(channel_0); | 
| 1034 |   DCHECK(channel_1); | 
| 1035 |  | 
| 1036 |   size_t leftover_samples = length; | 
| 1037 | #ifdef SIMD_NEON | 
| 1038 |   if (IsAligned(interleaved_buffer) && IsAligned(channel_0) && | 
| 1039 |       IsAligned(channel_1)) { | 
| 1040 |     const SimdVector* channel_0_vec = | 
| 1041 |         reinterpret_cast<const SimdVector*>(channel_0); | 
| 1042 |     const SimdVector* channel_1_vec = | 
| 1043 |         reinterpret_cast<const SimdVector*>(channel_1); | 
| 1044 |  | 
| 1045 |     const size_t num_chunks = GetNumChunks(length); | 
| 1046 |     float32x4x2_t interleaved_pair; | 
| 1047 |     int32x4_t temporary_wide_vector; | 
| 1048 |  | 
| 1049 |     const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kInt16FromFloat); | 
| 1050 |     const SimdVector min_vector = SIMD_LOAD_ONE_FLOAT(kInt16Min); | 
| 1051 |     const SimdVector max_vector = SIMD_LOAD_ONE_FLOAT(kInt16Max); | 
| 1052 |  | 
| 1053 |     int16x4_t* interleaved_vec = | 
| 1054 |         reinterpret_cast<int16x4_t*>(interleaved_buffer); | 
| 1055 |     for (size_t i = 0; i < num_chunks; ++i) { | 
| 1056 |       const size_t interleaved_index = kNumStereoChannels * i; | 
| 1057 |       interleaved_pair = vzipq_f32(channel_0_vec[i], channel_1_vec[i]); | 
| 1058 |       interleaved_pair.val[0] = | 
| 1059 |           SIMD_MULTIPLY(scaling_vector, interleaved_pair.val[0]); | 
| 1060 |       interleaved_pair.val[0] = vmaxq_f32(interleaved_pair.val[0], min_vector); | 
| 1061 |       interleaved_pair.val[0] = vminq_f32(interleaved_pair.val[0], max_vector); | 
| 1062 |       temporary_wide_vector = vcvtq_s32_f32(interleaved_pair.val[0]); | 
| 1063 |       interleaved_vec[interleaved_index] = vqmovn_s32(temporary_wide_vector); | 
| 1064 |       interleaved_pair.val[1] = | 
| 1065 |           SIMD_MULTIPLY(scaling_vector, interleaved_pair.val[1]); | 
| 1066 |       interleaved_pair.val[1] = vmaxq_f32(interleaved_pair.val[1], min_vector); | 
| 1067 |       interleaved_pair.val[1] = vminq_f32(interleaved_pair.val[1], max_vector); | 
| 1068 |       temporary_wide_vector = vcvtq_s32_f32(interleaved_pair.val[1]); | 
| 1069 |       interleaved_vec[interleaved_index + 1] = | 
| 1070 |           vqmovn_s32(temporary_wide_vector); | 
| 1071 |     } | 
| 1072 |  | 
| 1073 |     leftover_samples = GetLeftoverSamples(length); | 
| 1074 |   } | 
| 1075 | #endif  // SIMD_NEON | 
| 1076 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 1077 |     const size_t interleaved_index = kNumStereoChannels * i; | 
| 1078 |     interleaved_buffer[interleaved_index] = static_cast<int16_t>(std::max( | 
| 1079 |         kInt16Min, std::min(kInt16Max, kInt16FromFloat * channel_0[i]))); | 
| 1080 |     interleaved_buffer[interleaved_index + 1] = static_cast<int16_t>(std::max( | 
| 1081 |         kInt16Min, std::min(kInt16Max, kInt16FromFloat * channel_1[i]))); | 
| 1082 |   } | 
| 1083 | } | 
| 1084 |  | 
| 1085 | void DeinterleaveStereo(size_t length, const int16_t* interleaved_buffer, | 
| 1086 |                         int16_t* channel_0, int16_t* channel_1) { | 
| 1087 |   DCHECK(interleaved_buffer); | 
| 1088 |   DCHECK(channel_0); | 
| 1089 |   DCHECK(channel_1); | 
| 1090 |  | 
| 1091 |   size_t leftover_samples = length; | 
| 1092 | #ifdef SIMD_NEON | 
| 1093 |   if (IsAligned(interleaved_buffer) && IsAligned(channel_0) && | 
| 1094 |       IsAligned(channel_1)) { | 
| 1095 |     const size_t num_chunks = length / kSixteenBitSimdLength; | 
| 1096 |     leftover_samples = length % kSixteenBitSimdLength; | 
| 1097 |     int16x8_t* channel_0_vec = reinterpret_cast<int16x8_t*>(channel_0); | 
| 1098 |     int16x8_t* channel_1_vec = reinterpret_cast<int16x8_t*>(channel_1); | 
| 1099 |     int16x8x2_t deinterleaved_pair; | 
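|  |     // vuzpq_s16 splits the interleaved stereo vectors back into even-indexed | 
|  |     // (channel 0) and odd-indexed (channel 1) samples. | 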
| 1100 |     const int16x8_t* interleaved_vec = | 
| 1101 |         reinterpret_cast<const int16x8_t*>(interleaved_buffer); | 
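    // vuzpq_s16 unzips two interleaved vectors (16 samples, i.e. 8 stereo
    // frames) into one vector of |channel_0| samples and one of |channel_1|
    // samples.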
| 1102 |     for (size_t chunk = 0; chunk < num_chunks; ++chunk) { | 
| 1103 |       const size_t interleaved_index = chunk * kNumStereoChannels; | 
| 1104 |       deinterleaved_pair = vuzpq_s16(interleaved_vec[interleaved_index], | 
| 1105 |                                      interleaved_vec[interleaved_index + 1]); | 
| 1106 |       channel_0_vec[chunk] = deinterleaved_pair.val[0]; | 
| 1107 |       channel_1_vec[chunk] = deinterleaved_pair.val[1]; | 
| 1108 |     } | 
| 1109 |   } | 
| 1110 | #endif  // SIMD_NEON | 
| 1111 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 1112 |     const size_t interleaved_index = kNumStereoChannels * i; | 
| 1113 |     channel_0[i] = interleaved_buffer[interleaved_index]; | 
| 1114 |     channel_1[i] = interleaved_buffer[interleaved_index + 1]; | 
| 1115 |   } | 
| 1116 | } | 
| 1117 |  | 
| 1118 | void DeinterleaveStereo(size_t length, const float* interleaved_buffer, | 
| 1119 |                         float* channel_0, float* channel_1) { | 
| 1120 |   DCHECK(interleaved_buffer); | 
| 1121 |   DCHECK(channel_0); | 
| 1122 |   DCHECK(channel_1); | 
| 1123 |  | 
| 1124 |   size_t leftover_samples = length; | 
| 1125 | #ifdef SIMD_NEON | 
| 1126 |   if (IsAligned(interleaved_buffer) && IsAligned(channel_0) && | 
| 1127 |       IsAligned(channel_1)) { | 
| 1128 |     const size_t num_chunks = GetNumChunks(length); | 
| 1129 |     leftover_samples = GetLeftoverSamples(length); | 
| 1130 |     SimdVector* channel_0_vec = reinterpret_cast<SimdVector*>(channel_0); | 
| 1131 |     SimdVector* channel_1_vec = reinterpret_cast<SimdVector*>(channel_1); | 
| 1132 |     float32x4x2_t deinterleaved_pair; | 
| 1133 |  | 
| 1134 |     const SimdVector* interleaved_vec = | 
| 1135 |         reinterpret_cast<const SimdVector*>(interleaved_buffer); | 
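    // vuzpq_f32 unzips two interleaved vectors (8 samples, i.e. 4 stereo
    // frames) into one vector per channel.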
| 1136 |     for (size_t chunk = 0; chunk < num_chunks; ++chunk) { | 
| 1137 |       const size_t interleaved_index = chunk * kNumStereoChannels; | 
| 1138 |       deinterleaved_pair = vuzpq_f32(interleaved_vec[interleaved_index], | 
| 1139 |                                      interleaved_vec[interleaved_index + 1]); | 
| 1140 |       channel_0_vec[chunk] = deinterleaved_pair.val[0]; | 
| 1141 |       channel_1_vec[chunk] = deinterleaved_pair.val[1]; | 
| 1142 |     } | 
| 1143 |   } | 
| 1144 | #endif  // SIMD_NEON | 
| 1145 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 1146 |     const size_t interleaved_index = kNumStereoChannels * i; | 
| 1147 |     channel_0[i] = interleaved_buffer[interleaved_index]; | 
| 1148 |     channel_1[i] = interleaved_buffer[interleaved_index + 1]; | 
| 1149 |   } | 
| 1150 | } | 
| 1151 |  | 
| 1152 | void DeinterleaveStereo(size_t length, const int16_t* interleaved_buffer, | 
| 1153 |                         float* channel_0, float* channel_1) { | 
| 1154 |   DCHECK(interleaved_buffer); | 
| 1155 |   DCHECK(channel_0); | 
| 1156 |   DCHECK(channel_1); | 
| 1157 |  | 
| 1158 |   size_t leftover_samples = length; | 
| 1159 | #ifdef SIMD_NEON | 
| 1160 |   if (IsAligned(interleaved_buffer) && IsAligned(channel_0) && | 
| 1161 |       IsAligned(channel_1)) { | 
| 1162 |     const size_t num_chunks = GetNumChunks(length); | 
| 1163 |     leftover_samples = GetLeftoverSamples(length); | 
| 1164 |     SimdVector* channel_0_vec = reinterpret_cast<SimdVector*>(channel_0); | 
| 1165 |     SimdVector* channel_1_vec = reinterpret_cast<SimdVector*>(channel_1); | 
| 1166 |     int16x4x2_t deinterleaved_pair; | 
| 1167 |     int32x4_t temporary_wide; | 
| 1168 |     const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kFloatFromInt16); | 
| 1169 |  | 
| 1170 |     const int16x4_t* interleaved_vec = | 
| 1171 |         reinterpret_cast<const int16x4_t*>(interleaved_buffer); | 
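    // Each pass of the loop below unzips 8 interleaved int16_t samples into
    // one half vector per channel, widens them to int32x4, converts to float,
    // and scales by |kFloatFromInt16| to bring the samples into unit range.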
| 1172 |     for (size_t chunk = 0; chunk < num_chunks; ++chunk) { | 
| 1173 |       const size_t interleaved_index = chunk * kNumStereoChannels; | 
| 1174 |       deinterleaved_pair = vuzp_s16(interleaved_vec[interleaved_index], | 
| 1175 |                                     interleaved_vec[interleaved_index + 1]); | 
| 1176 |       temporary_wide = vmovl_s16(deinterleaved_pair.val[0]); | 
| 1177 |       channel_0_vec[chunk] = vcvtq_f32_s32(temporary_wide); | 
| 1178 |       channel_0_vec[chunk] = | 
| 1179 |           SIMD_MULTIPLY(scaling_vector, channel_0_vec[chunk]); | 
| 1180 |       temporary_wide = vmovl_s16(deinterleaved_pair.val[1]); | 
| 1181 |       channel_1_vec[chunk] = vcvtq_f32_s32(temporary_wide); | 
| 1182 |       channel_1_vec[chunk] = | 
| 1183 |           SIMD_MULTIPLY(scaling_vector, channel_1_vec[chunk]); | 
| 1184 |     } | 
| 1185 |   } | 
| 1186 | #endif  // SIMD_NEON | 
| 1187 |   for (size_t i = length - leftover_samples; i < length; ++i) { | 
| 1188 |     const size_t interleaved_index = kNumStereoChannels * i; | 
| 1189 |     channel_0[i] = static_cast<float>(interleaved_buffer[interleaved_index]) * | 
| 1190 |                    kFloatFromInt16; | 
| 1191 |     channel_1[i] = | 
| 1192 |         static_cast<float>(interleaved_buffer[interleaved_index + 1]) * | 
| 1193 |         kFloatFromInt16; | 
| 1194 |   } | 
| 1195 | } | 
| 1196 |  | 
| 1197 | void InterleaveQuad(size_t length, const int16_t* channel_0, | 
| 1198 |                     const int16_t* channel_1, const int16_t* channel_2, | 
| 1199 |                     const int16_t* channel_3, int16_t* workspace, | 
| 1200 |                     int16_t* interleaved_buffer) { | 
| 1201 | #ifdef SIMD_NEON | 
| 1202 |   DCHECK(IsAligned(workspace)); | 
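  // Interleave in two passes: pairing channels (0, 2) and (1, 3) into the two
  // halves of |workspace| means the final stereo pass over those halves emits
  // samples in (0, 1, 2, 3) order.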
| 1203 |   const size_t double_length = length * 2; | 
  int16_t* workspace_half_point =
      workspace + FindNextAlignedArrayIndex(double_length, sizeof(int16_t),
                                            kMemoryAlignmentBytes);
  DCHECK(IsAligned(workspace_half_point));
| 1207 |   InterleaveStereo(length, channel_0, channel_2, workspace); | 
| 1208 |   InterleaveStereo(length, channel_1, channel_3, workspace_half_point); | 
| 1209 |   InterleaveStereo(double_length, workspace, workspace_half_point, | 
| 1210 |                    interleaved_buffer); | 
| 1211 | #else | 
| 1212 |   for (size_t i = 0; i < length; ++i) { | 
| 1213 |     const size_t interleaved_index = kNumFirstOrderAmbisonicChannels * i; | 
| 1214 |     interleaved_buffer[interleaved_index] = channel_0[i]; | 
| 1215 |     interleaved_buffer[interleaved_index + 1] = channel_1[i]; | 
| 1216 |     interleaved_buffer[interleaved_index + 2] = channel_2[i]; | 
| 1217 |     interleaved_buffer[interleaved_index + 3] = channel_3[i]; | 
| 1218 |   } | 
| 1219 | #endif  // SIMD_NEON | 
| 1220 | } | 
| 1221 |  | 
| 1222 | void InterleaveQuad(size_t length, const float* channel_0, | 
| 1223 |                     const float* channel_1, const float* channel_2, | 
| 1224 |                     const float* channel_3, float* workspace, | 
| 1225 |                     float* interleaved_buffer) { | 
| 1226 | #ifdef SIMD_NEON | 
| 1227 |   DCHECK(IsAligned(workspace)); | 
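  // Same two-pass strategy as the int16_t overload above.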
| 1228 |   const size_t double_length = length * 2; | 
| 1229 |   float* workspace_half_point = | 
| 1230 |       workspace + FindNextAlignedArrayIndex(double_length, sizeof(float), | 
| 1231 |                                             kMemoryAlignmentBytes); | 
| 1232 |   DCHECK(IsAligned(workspace_half_point)); | 
| 1233 |   InterleaveStereo(length, channel_0, channel_2, workspace); | 
| 1234 |   InterleaveStereo(length, channel_1, channel_3, workspace_half_point); | 
| 1235 |   InterleaveStereo(double_length, workspace, workspace_half_point, | 
| 1236 |                    interleaved_buffer); | 
| 1237 | #else | 
| 1238 |   for (size_t i = 0; i < length; ++i) { | 
| 1239 |     const size_t interleaved_index = kNumFirstOrderAmbisonicChannels * i; | 
| 1240 |     interleaved_buffer[interleaved_index] = channel_0[i]; | 
| 1241 |     interleaved_buffer[interleaved_index + 1] = channel_1[i]; | 
| 1242 |     interleaved_buffer[interleaved_index + 2] = channel_2[i]; | 
| 1243 |     interleaved_buffer[interleaved_index + 3] = channel_3[i]; | 
| 1244 |   } | 
| 1245 | #endif  // SIMD_NEON | 
| 1246 | } | 
| 1247 |  | 
| 1248 | void DeinterleaveQuad(size_t length, const int16_t* interleaved_buffer, | 
| 1249 |                       int16_t* workspace, int16_t* channel_0, | 
| 1250 |                       int16_t* channel_1, int16_t* channel_2, | 
| 1251 |                       int16_t* channel_3) { | 
| 1252 | #ifdef SIMD_NEON | 
| 1253 |   DCHECK(IsAligned(workspace)); | 
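  // Reverse of InterleaveQuad: one stereo pass splits the quad-interleaved
  // buffer into the (0, 2) and (1, 3) channel pairs, then each half is
  // deinterleaved again into single channels.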
| 1254 |   const size_t double_length = length * 2; | 
| 1255 |   int16_t* workspace_half_point = | 
| 1256 |       workspace + FindNextAlignedArrayIndex(double_length, sizeof(int16_t), | 
| 1257 |                                             kMemoryAlignmentBytes); | 
| 1258 |   DCHECK(IsAligned(workspace_half_point)); | 
| 1259 |   DeinterleaveStereo(double_length, interleaved_buffer, workspace, | 
| 1260 |                      workspace_half_point); | 
| 1261 |   DeinterleaveStereo(length, workspace, channel_0, channel_2); | 
| 1262 |   DeinterleaveStereo(length, workspace_half_point, channel_1, channel_3); | 
| 1263 | #else | 
| 1264 |   for (size_t i = 0; i < length; ++i) { | 
| 1265 |     const size_t interleaved_index = kNumFirstOrderAmbisonicChannels * i; | 
| 1266 |     channel_0[i] = interleaved_buffer[interleaved_index]; | 
| 1267 |     channel_1[i] = interleaved_buffer[interleaved_index + 1]; | 
| 1268 |     channel_2[i] = interleaved_buffer[interleaved_index + 2]; | 
| 1269 |     channel_3[i] = interleaved_buffer[interleaved_index + 3]; | 
| 1270 |   } | 
| 1271 | #endif  // SIMD_NEON | 
| 1272 | } | 
| 1273 |  | 
| 1274 | void DeinterleaveQuad(size_t length, const float* interleaved_buffer, | 
| 1275 |                       float* workspace, float* channel_0, float* channel_1, | 
| 1276 |                       float* channel_2, float* channel_3) { | 
| 1277 | #ifdef SIMD_NEON | 
| 1278 |   DCHECK(IsAligned(workspace)); | 
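  // Same two-pass split as the int16_t overload above.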
| 1279 |   const size_t double_length = length * 2; | 
| 1280 |   float* workspace_half_point = | 
| 1281 |       workspace + FindNextAlignedArrayIndex(double_length, sizeof(float), | 
| 1282 |                                             kMemoryAlignmentBytes); | 
| 1283 |   DCHECK(IsAligned(workspace_half_point)); | 
| 1284 |   DeinterleaveStereo(double_length, interleaved_buffer, workspace, | 
| 1285 |                      workspace_half_point); | 
| 1286 |   DeinterleaveStereo(length, workspace, channel_0, channel_2); | 
| 1287 |   DeinterleaveStereo(length, workspace_half_point, channel_1, channel_3); | 
| 1288 | #else | 
| 1289 |   for (size_t i = 0; i < length; ++i) { | 
| 1290 |     const size_t interleaved_index = kNumFirstOrderAmbisonicChannels * i; | 
| 1291 |     channel_0[i] = interleaved_buffer[interleaved_index]; | 
| 1292 |     channel_1[i] = interleaved_buffer[interleaved_index + 1]; | 
| 1293 |     channel_2[i] = interleaved_buffer[interleaved_index + 2]; | 
| 1294 |     channel_3[i] = interleaved_buffer[interleaved_index + 3]; | 
| 1295 |   } | 
| 1296 | #endif  // SIMD_NEON | 
| 1297 | } | 
| 1298 |  | 
| 1299 | }  // namespace vraudio | 
| 1300 |  |