1 | /* |
2 | Copyright 2018 Google Inc. All Rights Reserved. |
3 | |
4 | Licensed under the Apache License, Version 2.0 (the "License"); |
5 | you may not use this file except in compliance with the License. |
6 | You may obtain a copy of the License at |
7 | |
8 | http://www.apache.org/licenses/LICENSE-2.0 |
9 | |
10 | Unless required by applicable law or agreed to in writing, software |
11 | distributed under the License is distributed on an "AS-IS" BASIS, |
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | See the License for the specific language governing permissions and |
14 | limitations under the License. |
15 | */ |
16 | |
17 | // Prevent Visual Studio from complaining about std::copy_n. |
18 | #if defined(_WIN32) |
19 | #define _SCL_SECURE_NO_WARNINGS |
20 | #endif |
21 | |
22 | #include "base/simd_utils.h" |
23 | |
24 | #include <algorithm> |
25 | #include <limits> |
26 | |
27 | #include "base/constants_and_types.h" |
28 | #include "base/logging.h" |
29 | #include "base/misc_math.h" |
30 | #include "base/simd_macros.h" |
31 | |
32 | |
33 | namespace vraudio { |
34 | |
35 | namespace { |
36 | |
37 | #ifdef SIMD_NEON |
38 | // Deinterleaving operates on 8 int16s at a time. |
39 | const size_t kSixteenBitSimdLength = SIMD_LENGTH * 2; |
40 | #endif // SIMD_NEON |
41 | |
42 | // Float format of max and min values storable in an int16_t, for clamping. |
43 | const float kInt16Max = static_cast<float>(0x7FFF); |
44 | const float kInt16Min = static_cast<float>(-0x7FFF); |
45 | |
46 | // Conversion factors between float and int16_t (both directions). |
47 | const float kFloatFromInt16 = 1.0f / kInt16Max; |
48 | const float kInt16FromFloat = kInt16Max; |
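// For example, a full-scale float sample of +/-1.0f maps to +/-32767, and an
// int16_t sample is scaled by 1/32767 on the way back to float.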
49 | |
50 | // Expected SIMD alignment in bytes. |
51 | const size_t kSimdSizeBytes = 16; |
52 | |
53 | inline size_t GetNumChunks(size_t length) { return length / SIMD_LENGTH; } |
54 | |
55 | inline size_t GetLeftoverSamples(size_t length) { return length % SIMD_LENGTH; } |
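// For example, with SIMD_LENGTH == 4, a 10-sample buffer is processed as two
// four-sample chunks, with the remaining two samples handled by scalar code.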
56 | |
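// Returns true if |pointer| is aligned to kSimdSizeBytes and can therefore be
// reinterpreted as a SimdVector* for aligned SIMD loads and stores.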
57 | template <typename T> |
58 | inline bool IsAlignedTemplated(const T* pointer) { |
59 | return reinterpret_cast<uintptr_t>(pointer) % kSimdSizeBytes == 0; |
60 | } |
61 | |
62 | #ifdef SIMD_DISABLED |
// Calculates the approximate complex magnitude of z = real + i * imaginary.
64 | inline void ComplexMagnitude(float real, float imaginary, float* output) { |
65 | *output = real * real + imaginary * imaginary; |
66 | // The value of |output| is not being recalculated, simply modified. |
67 | *output = 1.0f / FastReciprocalSqrt(*output); |
68 | } |
69 | #endif // defined(SIMD_DISABLED) |
70 | |
71 | } // namespace |
72 | |
73 | bool IsAligned(const float* pointer) { |
74 | return IsAlignedTemplated<float>(pointer); |
75 | } |
76 | |
77 | bool IsAligned(const int16_t* pointer) { |
78 | return IsAlignedTemplated<int16_t>(pointer); |
79 | } |
80 | |
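// For example, FindNextAlignedArrayIndex(10, sizeof(float), 16) returns 12:
// ten floats occupy 40 bytes, and the next 16-byte boundary is at 48 bytes,
// i.e. at element index 12. The quad (de)interleaving routines below use this
// to locate the second, aligned half of their |workspace| buffers.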
81 | size_t FindNextAlignedArrayIndex(size_t length, size_t type_size_bytes, |
82 | size_t memory_alignment_bytes) { |
83 | const size_t byte_length = type_size_bytes * length; |
84 | const size_t unaligned_bytes = byte_length % memory_alignment_bytes; |
85 | const size_t bytes_to_next_aligned = |
86 | (unaligned_bytes == 0) ? 0 : memory_alignment_bytes - unaligned_bytes; |
87 | return (byte_length + bytes_to_next_aligned) / type_size_bytes; |
88 | } |
89 | |
90 | void AddPointwise(size_t length, const float* input_a, const float* input_b, |
91 | float* output) { |
92 | DCHECK(input_a); |
93 | DCHECK(input_b); |
94 | DCHECK(output); |
95 | |
96 | const SimdVector* input_a_vector = |
97 | reinterpret_cast<const SimdVector*>(input_a); |
98 | const SimdVector* input_b_vector = |
99 | reinterpret_cast<const SimdVector*>(input_b); |
100 | SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); |
101 | #ifdef SIMD_SSE |
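  // On SSE, use aligned vector accesses where the buffer alignment allows it
  // and fall back to unaligned _mm_loadu_ps / _mm_storeu_ps otherwise; samples
  // beyond the last full SIMD chunk are handled by the scalar loop below. The
  // same pattern is used by the other pointwise operations in this file.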
102 | const size_t num_chunks = GetNumChunks(length); |
  const bool inputs_aligned = IsAligned(input_a) && IsAligned(input_b);
  const bool output_aligned = IsAligned(output);
105 | if (inputs_aligned && output_aligned) { |
106 | for (size_t i = 0; i < num_chunks; ++i) { |
107 | output_vector[i] = SIMD_ADD(input_a_vector[i], input_b_vector[i]); |
108 | } |
109 | } else if (inputs_aligned) { |
110 | for (size_t i = 0; i < num_chunks; ++i) { |
111 | const SimdVector output_temp = |
112 | SIMD_ADD(input_a_vector[i], input_b_vector[i]); |
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
114 | } |
115 | } else if (output_aligned) { |
116 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
119 | output_vector[i] = SIMD_ADD(input_a_temp, input_b_temp); |
120 | } |
121 | } else { |
122 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
      const SimdVector output_temp = SIMD_ADD(input_a_temp, input_b_temp);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
127 | } |
128 | } |
129 | #else |
130 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
131 | output_vector[i] = SIMD_ADD(input_a_vector[i], input_b_vector[i]); |
132 | } |
133 | #endif // SIMD_SSE |
134 | |
135 | // Add samples at the end that were missed by the SIMD chunking. |
136 | const size_t leftover_samples = GetLeftoverSamples(length); |
137 | DCHECK_GE(length, leftover_samples); |
138 | for (size_t i = length - leftover_samples; i < length; ++i) { |
139 | output[i] = input_a[i] + input_b[i]; |
140 | } |
141 | } |
142 | |
143 | void SubtractPointwise(size_t length, const float* input_a, |
144 | const float* input_b, float* output) { |
145 | DCHECK(input_a); |
146 | DCHECK(input_b); |
147 | DCHECK(output); |
148 | |
149 | const SimdVector* input_a_vector = |
150 | reinterpret_cast<const SimdVector*>(input_a); |
151 | const SimdVector* input_b_vector = |
152 | reinterpret_cast<const SimdVector*>(input_b); |
153 | SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); |
154 | |
155 | #ifdef SIMD_SSE |
156 | const size_t num_chunks = GetNumChunks(length); |
  const bool inputs_aligned = IsAligned(input_a) && IsAligned(input_b);
  const bool output_aligned = IsAligned(output);
159 | if (inputs_aligned && output_aligned) { |
160 | for (size_t i = 0; i < num_chunks; ++i) { |
161 | output_vector[i] = SIMD_SUB(input_b_vector[i], input_a_vector[i]); |
162 | } |
163 | } else if (inputs_aligned) { |
164 | for (size_t i = 0; i < num_chunks; ++i) { |
165 | const SimdVector output_temp = |
166 | SIMD_SUB(input_b_vector[i], input_a_vector[i]); |
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
168 | } |
169 | } else if (output_aligned) { |
170 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
173 | output_vector[i] = SIMD_SUB(input_b_temp, input_a_temp); |
174 | } |
175 | } else { |
176 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
      const SimdVector output_temp = SIMD_SUB(input_b_temp, input_a_temp);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
181 | } |
182 | } |
183 | #else |
184 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
185 | output_vector[i] = SIMD_SUB(input_b_vector[i], input_a_vector[i]); |
186 | } |
187 | #endif // SIMD_SSE |
188 | |
189 | // Subtract samples at the end that were missed by the SIMD chunking. |
190 | const size_t leftover_samples = GetLeftoverSamples(length); |
191 | DCHECK_GE(length, leftover_samples); |
192 | for (size_t i = length - leftover_samples; i < length; ++i) { |
193 | output[i] = input_b[i] - input_a[i]; |
194 | } |
195 | } |
196 | |
197 | void MultiplyPointwise(size_t length, const float* input_a, |
198 | const float* input_b, float* output) { |
199 | DCHECK(input_a); |
200 | DCHECK(input_b); |
201 | DCHECK(output); |
202 | |
203 | const SimdVector* input_a_vector = |
204 | reinterpret_cast<const SimdVector*>(input_a); |
205 | const SimdVector* input_b_vector = |
206 | reinterpret_cast<const SimdVector*>(input_b); |
207 | SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); |
208 | |
209 | #ifdef SIMD_SSE |
210 | const size_t num_chunks = GetNumChunks(length); |
  const bool inputs_aligned = IsAligned(input_a) && IsAligned(input_b);
  const bool output_aligned = IsAligned(output);
213 | if (inputs_aligned && output_aligned) { |
214 | for (size_t i = 0; i < num_chunks; ++i) { |
215 | output_vector[i] = SIMD_MULTIPLY(input_a_vector[i], input_b_vector[i]); |
216 | } |
217 | } else if (inputs_aligned) { |
218 | for (size_t i = 0; i < num_chunks; ++i) { |
219 | const SimdVector output_temp = |
220 | SIMD_MULTIPLY(input_a_vector[i], input_b_vector[i]); |
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
222 | } |
223 | } else if (output_aligned) { |
224 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
227 | output_vector[i] = SIMD_MULTIPLY(input_a_temp, input_b_temp); |
228 | } |
229 | } else { |
230 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
      const SimdVector output_temp = SIMD_MULTIPLY(input_a_temp, input_b_temp);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
235 | } |
236 | } |
237 | #else |
238 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
239 | output_vector[i] = SIMD_MULTIPLY(input_a_vector[i], input_b_vector[i]); |
240 | } |
241 | #endif // SIMD_SSE |
242 | |
243 | // Multiply samples at the end that were missed by the SIMD chunking. |
244 | const size_t leftover_samples = GetLeftoverSamples(length); |
245 | DCHECK_GE(length, leftover_samples); |
246 | for (size_t i = length - leftover_samples; i < length; ++i) { |
247 | output[i] = input_a[i] * input_b[i]; |
248 | } |
249 | } |
250 | |
251 | void MultiplyAndAccumulatePointwise(size_t length, const float* input_a, |
252 | const float* input_b, float* accumulator) { |
253 | DCHECK(input_a); |
254 | DCHECK(input_b); |
255 | DCHECK(accumulator); |
256 | |
257 | const SimdVector* input_a_vector = |
258 | reinterpret_cast<const SimdVector*>(input_a); |
259 | const SimdVector* input_b_vector = |
260 | reinterpret_cast<const SimdVector*>(input_b); |
261 | SimdVector* accumulator_vector = reinterpret_cast<SimdVector*>(accumulator); |
262 | |
263 | #ifdef SIMD_SSE |
264 | const size_t num_chunks = GetNumChunks(length); |
  const bool inputs_aligned = IsAligned(input_a) && IsAligned(input_b);
  const bool accumulator_aligned = IsAligned(accumulator);
267 | if (inputs_aligned && accumulator_aligned) { |
268 | for (size_t i = 0; i < num_chunks; ++i) { |
269 | accumulator_vector[i] = SIMD_MULTIPLY_ADD( |
270 | input_a_vector[i], input_b_vector[i], accumulator_vector[i]); |
271 | } |
272 | } else if (inputs_aligned) { |
273 | for (size_t i = 0; i < num_chunks; ++i) { |
      SimdVector accumulator_temp = _mm_loadu_ps(&accumulator[i * SIMD_LENGTH]);
275 | accumulator_temp = SIMD_MULTIPLY_ADD(input_a_vector[i], input_b_vector[i], |
276 | accumulator_temp); |
      _mm_storeu_ps(&accumulator[i * SIMD_LENGTH], accumulator_temp);
278 | } |
279 | } else if (accumulator_aligned) { |
280 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
283 | accumulator_vector[i] = |
284 | SIMD_MULTIPLY_ADD(input_a_temp, input_b_temp, accumulator_vector[i]); |
285 | } |
286 | } else { |
287 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
      SimdVector accumulator_temp = _mm_loadu_ps(&accumulator[i * SIMD_LENGTH]);
291 | accumulator_temp = |
292 | SIMD_MULTIPLY_ADD(input_a_temp, input_b_temp, accumulator_temp); |
      _mm_storeu_ps(&accumulator[i * SIMD_LENGTH], accumulator_temp);
294 | } |
295 | } |
296 | #else |
297 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
298 | accumulator_vector[i] = SIMD_MULTIPLY_ADD( |
299 | input_a_vector[i], input_b_vector[i], accumulator_vector[i]); |
300 | } |
301 | #endif // SIMD_SSE |
302 | |
303 | // Apply gain and accumulate to samples at the end that were missed by the |
304 | // SIMD chunking. |
305 | const size_t leftover_samples = GetLeftoverSamples(length); |
306 | DCHECK_GE(length, leftover_samples); |
307 | for (size_t i = length - leftover_samples; i < length; ++i) { |
308 | accumulator[i] += input_a[i] * input_b[i]; |
309 | } |
310 | } |
311 | |
312 | void ScalarMultiply(size_t length, float gain, const float* input, |
313 | float* output) { |
314 | DCHECK(input); |
315 | DCHECK(output); |
316 | |
317 | const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); |
318 | SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); |
319 | |
320 | const SimdVector gain_vector = SIMD_LOAD_ONE_FLOAT(gain); |
321 | #ifdef SIMD_SSE |
322 | const size_t num_chunks = GetNumChunks(length); |
  const bool input_aligned = IsAligned(input);
  const bool output_aligned = IsAligned(output);
325 | if (input_aligned && output_aligned) { |
326 | for (size_t i = 0; i < num_chunks; ++i) { |
327 | output_vector[i] = SIMD_MULTIPLY(gain_vector, input_vector[i]); |
328 | } |
329 | } else if (input_aligned) { |
330 | for (size_t i = 0; i < num_chunks; ++i) { |
331 | const SimdVector output_temp = |
332 | SIMD_MULTIPLY(gain_vector, input_vector[i]); |
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
334 | } |
335 | } else if (output_aligned) { |
336 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
338 | output_vector[i] = SIMD_MULTIPLY(gain_vector, input_temp); |
339 | } |
340 | } else { |
341 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
      const SimdVector output_temp = SIMD_MULTIPLY(gain_vector, input_temp);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
345 | } |
346 | } |
347 | #else |
348 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
349 | output_vector[i] = SIMD_MULTIPLY(gain_vector, input_vector[i]); |
350 | } |
351 | #endif // SIMD_SSE |
352 | |
353 | // Apply gain to samples at the end that were missed by the SIMD chunking. |
354 | const size_t leftover_samples = GetLeftoverSamples(length); |
355 | DCHECK_GE(length, leftover_samples); |
356 | for (size_t i = length - leftover_samples; i < length; ++i) { |
357 | output[i] = input[i] * gain; |
358 | } |
359 | } |
360 | |
361 | void ScalarMultiplyAndAccumulate(size_t length, float gain, const float* input, |
362 | float* accumulator) { |
363 | DCHECK(input); |
364 | DCHECK(accumulator); |
365 | |
366 | const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); |
367 | SimdVector* accumulator_vector = reinterpret_cast<SimdVector*>(accumulator); |
368 | |
369 | const SimdVector gain_vector = SIMD_LOAD_ONE_FLOAT(gain); |
370 | #ifdef SIMD_SSE |
371 | const size_t num_chunks = GetNumChunks(length); |
  const bool input_aligned = IsAligned(input);
  const bool accumulator_aligned = IsAligned(accumulator);
374 | if (input_aligned && accumulator_aligned) { |
375 | for (size_t i = 0; i < num_chunks; ++i) { |
376 | accumulator_vector[i] = SIMD_MULTIPLY_ADD(gain_vector, input_vector[i], |
377 | accumulator_vector[i]); |
378 | } |
379 | } else if (input_aligned) { |
380 | for (size_t i = 0; i < num_chunks; ++i) { |
      SimdVector accumulator_temp = _mm_loadu_ps(&accumulator[i * SIMD_LENGTH]);
382 | accumulator_temp = |
383 | SIMD_MULTIPLY_ADD(gain_vector, input_vector[i], accumulator_temp); |
      _mm_storeu_ps(&accumulator[i * SIMD_LENGTH], accumulator_temp);
385 | } |
386 | } else if (accumulator_aligned) { |
387 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
389 | accumulator_vector[i] = |
390 | SIMD_MULTIPLY_ADD(gain_vector, input_temp, accumulator_vector[i]); |
391 | } |
392 | } else { |
393 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
      SimdVector accumulator_temp = _mm_loadu_ps(&accumulator[i * SIMD_LENGTH]);
396 | accumulator_temp = |
397 | SIMD_MULTIPLY_ADD(gain_vector, input_temp, accumulator_temp); |
      _mm_storeu_ps(&accumulator[i * SIMD_LENGTH], accumulator_temp);
399 | } |
400 | } |
401 | #else |
402 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
403 | accumulator_vector[i] = |
404 | SIMD_MULTIPLY_ADD(gain_vector, input_vector[i], accumulator_vector[i]); |
405 | } |
406 | #endif // SIMD_SSE |
407 | |
408 | // Apply gain and accumulate to samples at the end that were missed by the |
409 | // SIMD chunking. |
410 | const size_t leftover_samples = GetLeftoverSamples(length); |
411 | DCHECK_GE(length, leftover_samples); |
412 | for (size_t i = length - leftover_samples; i < length; ++i) { |
413 | accumulator[i] += input[i] * gain; |
414 | } |
415 | } |
416 | |
417 | void ReciprocalSqrt(size_t length, const float* input, float* output) { |
418 | DCHECK(input); |
419 | DCHECK(output); |
420 | |
421 | #if !defined(SIMD_DISABLED) |
422 | const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); |
423 | SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); |
424 | #endif // !defined(SIMD_DISABLED) |
425 | |
426 | #ifdef SIMD_SSE |
427 | const size_t num_chunks = GetNumChunks(length); |
  const bool input_aligned = IsAligned(input);
  const bool output_aligned = IsAligned(output);
430 | if (input_aligned && output_aligned) { |
431 | for (size_t i = 0; i < num_chunks; ++i) { |
432 | output_vector[i] = SIMD_RECIPROCAL_SQRT(input_vector[i]); |
433 | } |
434 | } else if (input_aligned) { |
435 | for (size_t i = 0; i < num_chunks; ++i) { |
436 | const SimdVector output_temp = SIMD_RECIPROCAL_SQRT(input_vector[i]); |
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
438 | } |
439 | } else if (output_aligned) { |
440 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
442 | output_vector[i] = SIMD_RECIPROCAL_SQRT(input_temp); |
443 | } |
444 | } else { |
445 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
      const SimdVector output_temp = SIMD_RECIPROCAL_SQRT(input_temp);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
449 | } |
450 | } |
451 | #elif defined SIMD_NEON |
452 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
453 | output_vector[i] = SIMD_RECIPROCAL_SQRT(input_vector[i]); |
454 | } |
455 | #endif // SIMD_SSE |
456 | |
457 | // Apply to samples at the end that were missed by the SIMD chunking. |
458 | const size_t leftover_samples = GetLeftoverSamples(length); |
459 | DCHECK_GE(length, leftover_samples); |
460 | for (size_t i = length - leftover_samples; i < length; ++i) { |
    output[i] = FastReciprocalSqrt(input[i]);
462 | } |
463 | } |
464 | |
465 | void Sqrt(size_t length, const float* input, float* output) { |
466 | DCHECK(input); |
467 | DCHECK(output); |
468 | |
469 | #if !defined(SIMD_DISABLED) |
470 | const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); |
471 | SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); |
472 | #endif // !defined(SIMD_DISABLED) |
473 | |
474 | #ifdef SIMD_SSE |
475 | const size_t num_chunks = GetNumChunks(length); |
  const bool input_aligned = IsAligned(input);
  const bool output_aligned = IsAligned(output);
478 | if (input_aligned && output_aligned) { |
479 | for (size_t i = 0; i < num_chunks; ++i) { |
480 | output_vector[i] = SIMD_SQRT(input_vector[i]); |
481 | } |
482 | } else if (input_aligned) { |
483 | for (size_t i = 0; i < num_chunks; ++i) { |
484 | const SimdVector output_temp = SIMD_SQRT(input_vector[i]); |
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
486 | } |
487 | } else if (output_aligned) { |
488 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
490 | output_vector[i] = SIMD_SQRT(input_temp); |
491 | } |
492 | } else { |
493 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
      const SimdVector output_temp = SIMD_SQRT(input_temp);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
497 | } |
498 | } |
499 | #elif defined SIMD_NEON |
500 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
    // This should be faster than using a sqrt method: https://goo.gl/XRKwFp
502 | output_vector[i] = SIMD_SQRT(input_vector[i]); |
503 | } |
504 | #endif // SIMD_SSE |
505 | |
506 | // Apply to samples at the end that were missed by the SIMD chunking. |
507 | const size_t leftover_samples = GetLeftoverSamples(length); |
508 | DCHECK_GE(length, leftover_samples); |
509 | for (size_t i = length - leftover_samples; i < length; ++i) { |
    output[i] = 1.0f / FastReciprocalSqrt(input[i]);
511 | } |
512 | } |
513 | |
514 | void ApproxComplexMagnitude(size_t length, const float* input, float* output) { |
515 | DCHECK(input); |
516 | DCHECK(output); |
517 | |
518 | #if !defined(SIMD_DISABLED) |
519 | const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); |
520 | SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); |
521 | const size_t num_chunks = GetNumChunks(length); |
  const bool input_aligned = IsAligned(input);
  const bool output_aligned = IsAligned(output);
524 | #endif // !defined(SIMD_DISABLED) |
525 | |
526 | #ifdef SIMD_SSE |
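  // Each pair of input vectors holds four interleaved complex values
  // (re, im, re, im, ...). After squaring, _MM_SHUFFLE(2, 0, 2, 0) gathers the
  // even lanes (squared real parts) and _MM_SHUFFLE(3, 1, 3, 1) the odd lanes
  // (squared imaginary parts), so a single add and square root produce four
  // magnitudes at a time.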
527 | if (input_aligned && output_aligned) { |
528 | for (size_t out_index = 0; out_index < num_chunks; ++out_index) { |
529 | const size_t first_index = out_index * 2; |
530 | const size_t second_index = first_index + 1; |
531 | const SimdVector squared_1 = |
532 | SIMD_MULTIPLY(input_vector[first_index], input_vector[first_index]); |
533 | const SimdVector squared_2 = |
534 | SIMD_MULTIPLY(input_vector[second_index], input_vector[second_index]); |
535 | const SimdVector unshuffled_1 = |
536 | _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(2, 0, 2, 0)); |
537 | const SimdVector unshuffled_2 = |
538 | _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(3, 1, 3, 1)); |
539 | output_vector[out_index] = SIMD_ADD(unshuffled_1, unshuffled_2); |
540 | output_vector[out_index] = SIMD_SQRT(output_vector[out_index]); |
541 | } |
542 | } else if (input_aligned) { |
543 | for (size_t out_index = 0; out_index < num_chunks; ++out_index) { |
544 | const size_t first_index = out_index * 2; |
545 | const size_t second_index = first_index + 1; |
546 | const SimdVector squared_1 = |
547 | SIMD_MULTIPLY(input_vector[first_index], input_vector[first_index]); |
548 | const SimdVector squared_2 = |
549 | SIMD_MULTIPLY(input_vector[second_index], input_vector[second_index]); |
550 | const SimdVector unshuffled_1 = |
551 | _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(2, 0, 2, 0)); |
552 | const SimdVector unshuffled_2 = |
553 | _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(3, 1, 3, 1)); |
      SimdVector output_temp = SIMD_ADD(unshuffled_1, unshuffled_2);
      output_temp = SIMD_SQRT(output_temp);
      _mm_storeu_ps(&output[out_index * SIMD_LENGTH], output_temp);
557 | } |
558 | } else if (output_aligned) { |
559 | for (size_t out_index = 0; out_index < num_chunks; ++out_index) { |
560 | const size_t first_index = out_index * 2; |
561 | const size_t second_index = first_index + 1; |
      const SimdVector first_temp =
          _mm_loadu_ps(&input[first_index * SIMD_LENGTH]);
      const SimdVector second_temp =
          _mm_loadu_ps(&input[second_index * SIMD_LENGTH]);
566 | const SimdVector squared_1 = SIMD_MULTIPLY(first_temp, first_temp); |
567 | const SimdVector squared_2 = SIMD_MULTIPLY(second_temp, second_temp); |
568 | const SimdVector unshuffled_1 = |
569 | _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(2, 0, 2, 0)); |
570 | const SimdVector unshuffled_2 = |
571 | _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(3, 1, 3, 1)); |
572 | output_vector[out_index] = SIMD_ADD(unshuffled_1, unshuffled_2); |
573 | output_vector[out_index] = SIMD_SQRT(output_vector[out_index]); |
574 | } |
575 | } else { |
576 | for (size_t out_index = 0; out_index < num_chunks; ++out_index) { |
577 | const size_t first_index = out_index * 2; |
578 | const size_t second_index = first_index + 1; |
      const SimdVector first_temp =
          _mm_loadu_ps(&input[first_index * SIMD_LENGTH]);
      const SimdVector second_temp =
          _mm_loadu_ps(&input[second_index * SIMD_LENGTH]);
583 | const SimdVector squared_1 = SIMD_MULTIPLY(first_temp, first_temp); |
584 | const SimdVector squared_2 = SIMD_MULTIPLY(second_temp, second_temp); |
585 | const SimdVector unshuffled_1 = |
586 | _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(2, 0, 2, 0)); |
587 | const SimdVector unshuffled_2 = |
588 | _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(3, 1, 3, 1)); |
589 | SimdVector output_temp = SIMD_ADD(unshuffled_1, unshuffled_2); |
590 | output_temp = SIMD_SQRT(output_temp); |
      _mm_storeu_ps(&output[out_index * SIMD_LENGTH], output_temp);
592 | } |
593 | } |
594 | #elif defined SIMD_NEON |
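  // On NEON, vuzpq_f32 de-interleaves the squared values into even (real) and
  // odd (imaginary) lanes before the add and square root.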
595 | if (input_aligned && output_aligned) { |
596 | for (size_t out_index = 0; out_index < num_chunks; ++out_index) { |
597 | const size_t first_index = out_index * 2; |
598 | const size_t second_index = first_index + 1; |
599 | const SimdVector squared_1 = |
600 | SIMD_MULTIPLY(input_vector[first_index], input_vector[first_index]); |
601 | const SimdVector squared_2 = |
602 | SIMD_MULTIPLY(input_vector[second_index], input_vector[second_index]); |
603 | const float32x4x2_t unshuffled = vuzpq_f32(squared_1, squared_2); |
604 | output_vector[out_index] = SIMD_ADD(unshuffled.val[0], unshuffled.val[1]); |
605 | output_vector[out_index] = SIMD_SQRT(output_vector[out_index]); |
606 | } |
607 | } else if (input_aligned) { |
608 | for (size_t out_index = 0; out_index < num_chunks; ++out_index) { |
609 | const size_t first_index = out_index * 2; |
610 | const size_t second_index = first_index + 1; |
611 | const SimdVector squared_1 = |
612 | SIMD_MULTIPLY(input_vector[first_index], input_vector[first_index]); |
613 | const SimdVector squared_2 = |
614 | SIMD_MULTIPLY(input_vector[second_index], input_vector[second_index]); |
615 | const float32x4x2_t unshuffled = vuzpq_f32(squared_1, squared_2); |
616 | SimdVector output_temp = SIMD_ADD(unshuffled.val[0], unshuffled.val[1]); |
617 | output_temp = SIMD_SQRT(output_temp); |
618 | vst1q_f32(&output[out_index * SIMD_LENGTH], output_temp); |
619 | } |
620 | } else if (output_aligned) { |
621 | for (size_t out_index = 0; out_index < num_chunks; ++out_index) { |
622 | const size_t first_index = out_index * 2; |
623 | const size_t second_index = first_index + 1; |
624 | const SimdVector first_temp = |
625 | vld1q_f32(&input[first_index * SIMD_LENGTH]); |
626 | const SimdVector second_temp = |
627 | vld1q_f32(&input[second_index * SIMD_LENGTH]); |
628 | const SimdVector squared_1 = SIMD_MULTIPLY(first_temp, first_temp); |
629 | const SimdVector squared_2 = SIMD_MULTIPLY(second_temp, second_temp); |
630 | const float32x4x2_t unshuffled = vuzpq_f32(squared_1, squared_2); |
631 | output_vector[out_index] = SIMD_ADD(unshuffled.val[0], unshuffled.val[1]); |
632 | output_vector[out_index] = SIMD_SQRT(output_vector[out_index]); |
633 | } |
634 | } else { |
635 | for (size_t out_index = 0; out_index < num_chunks; ++out_index) { |
636 | const size_t first_index = out_index * 2; |
637 | const size_t second_index = first_index + 1; |
638 | const SimdVector first_temp = |
639 | vld1q_f32(&input[first_index * SIMD_LENGTH]); |
640 | const SimdVector second_temp = |
641 | vld1q_f32(&input[second_index * SIMD_LENGTH]); |
642 | const SimdVector squared_1 = SIMD_MULTIPLY(first_temp, first_temp); |
643 | const SimdVector squared_2 = SIMD_MULTIPLY(second_temp, second_temp); |
644 | const float32x4x2_t unshuffled = vuzpq_f32(squared_1, squared_2); |
645 | SimdVector output_temp = SIMD_ADD(unshuffled.val[0], unshuffled.val[1]); |
646 | output_temp = SIMD_SQRT(output_temp); |
647 | vst1q_f32(&output[out_index * SIMD_LENGTH], output_temp); |
648 | } |
649 | } |
650 | #endif // SIMD_SSE |
651 | |
652 | // Apply to samples at the end that were missed by the SIMD chunking. |
653 | const size_t leftover_samples = GetLeftoverSamples(length); |
654 | DCHECK_GE(length, leftover_samples); |
655 | for (size_t i = length - leftover_samples; i < length; ++i) { |
656 | const size_t real_index = i * 2; |
657 | const size_t imag_index = real_index + 1; |
658 | const float squared_sum = (input[real_index] * input[real_index]) + |
659 | (input[imag_index] * input[imag_index]); |
    output[i] = 1.0f / FastReciprocalSqrt(squared_sum);
661 | } |
662 | } |
663 | |
664 | void ComplexInterleavedFormatFromMagnitudeAndSinCosPhase( |
665 | size_t length, const float* magnitude, const float* cos_phase, |
666 | const float* sin_phase, float* complex_interleaved_format_output) { |
667 | size_t leftover_samples = 0; |
668 | #ifdef SIMD_NEON |
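  // vzipq_f32 interleaves a vector of cosines with a vector of sines into
  // (cos, sin, cos, sin) pairs, i.e. (real, imaginary) components, which are
  // then scaled by the corresponding magnitudes.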
669 | if (IsAligned(complex_interleaved_format_output) && IsAligned(cos_phase) && |
670 | IsAligned(sin_phase) && IsAligned(magnitude)) { |
671 | const SimdVector* cos_vec = reinterpret_cast<const SimdVector*>(cos_phase); |
672 | const SimdVector* sin_vec = reinterpret_cast<const SimdVector*>(sin_phase); |
673 | const SimdVector* magnitude_vec = |
674 | reinterpret_cast<const SimdVector*>(magnitude); |
675 | |
676 | const size_t num_chunks = GetNumChunks(length); |
677 | float32x4x2_t interleaved_pair; |
678 | |
679 | SimdVector* interleaved_vec = |
680 | reinterpret_cast<SimdVector*>(complex_interleaved_format_output); |
681 | for (size_t i = 0, j = 0; j < num_chunks; ++i, j += 2) { |
682 | interleaved_pair = vzipq_f32(cos_vec[i], sin_vec[i]); |
683 | interleaved_vec[j] = |
684 | SIMD_MULTIPLY(interleaved_pair.val[0], magnitude_vec[i]); |
685 | interleaved_vec[j + 1] = |
686 | SIMD_MULTIPLY(interleaved_pair.val[1], magnitude_vec[i]); |
687 | } |
688 | |
689 | leftover_samples = GetLeftoverSamples(length); |
690 | } |
691 | #endif // SIMD_NEON |
692 | DCHECK_EQ(leftover_samples % 2U, 0U); |
693 | for (size_t i = leftover_samples, j = leftover_samples / 2; i < length; |
694 | i += 2, ++j) { |
695 | const size_t imaginary_offset = i + 1; |
696 | complex_interleaved_format_output[i] = magnitude[j] * cos_phase[j]; |
697 | complex_interleaved_format_output[imaginary_offset] = |
698 | magnitude[j] * sin_phase[j]; |
699 | } |
700 | } |
701 | |
702 | void StereoFromMonoSimd(size_t length, const float* mono, float* left, |
703 | float* right) { |
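  // Scale the mono signal by 1/sqrt(2) into both channels so that
  // MonoFromStereoSimd() recovers the original mono signal.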
  ScalarMultiply(length, kInverseSqrtTwo, mono, left);
  std::copy_n(left, length, right);
706 | } |
707 | |
708 | void MonoFromStereoSimd(size_t length, const float* left, const float* right, |
709 | float* mono) { |
710 | DCHECK(left); |
711 | DCHECK(right); |
712 | DCHECK(mono); |
713 | |
714 | const SimdVector* left_vector = reinterpret_cast<const SimdVector*>(left); |
715 | const SimdVector* right_vector = reinterpret_cast<const SimdVector*>(right); |
716 | SimdVector* mono_vector = reinterpret_cast<SimdVector*>(mono); |
717 | |
718 | const SimdVector inv_root_two_vec = SIMD_LOAD_ONE_FLOAT(kInverseSqrtTwo); |
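  // Downmix as (left + right) / sqrt(2), matching the gain used in
  // StereoFromMonoSimd().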
719 | #ifdef SIMD_SSE |
720 | const size_t num_chunks = GetNumChunks(length); |
  const bool inputs_aligned = IsAligned(left) && IsAligned(right);
  const bool mono_aligned = IsAligned(mono);
723 | if (inputs_aligned && mono_aligned) { |
724 | for (size_t i = 0; i < num_chunks; ++i) { |
725 | mono_vector[i] = SIMD_MULTIPLY(inv_root_two_vec, |
726 | SIMD_ADD(left_vector[i], right_vector[i])); |
727 | } |
728 | } else if (inputs_aligned) { |
729 | for (size_t i = 0; i < num_chunks; ++i) { |
730 | const SimdVector mono_temp = SIMD_MULTIPLY( |
731 | inv_root_two_vec, SIMD_ADD(left_vector[i], right_vector[i])); |
      _mm_storeu_ps(&mono[i * SIMD_LENGTH], mono_temp);
733 | } |
734 | } else if (mono_aligned) { |
735 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector left_temp = _mm_loadu_ps(&left[i * SIMD_LENGTH]);
      const SimdVector right_temp = _mm_loadu_ps(&right[i * SIMD_LENGTH]);
738 | mono_vector[i] = |
739 | SIMD_MULTIPLY(inv_root_two_vec, SIMD_ADD(left_temp, right_temp)); |
740 | } |
741 | } else { |
742 | for (size_t i = 0; i < num_chunks; ++i) { |
      const SimdVector left_temp = _mm_loadu_ps(&left[i * SIMD_LENGTH]);
      const SimdVector right_temp = _mm_loadu_ps(&right[i * SIMD_LENGTH]);
      const SimdVector mono_temp =
          SIMD_MULTIPLY(inv_root_two_vec, SIMD_ADD(left_temp, right_temp));
      _mm_storeu_ps(&mono[i * SIMD_LENGTH], mono_temp);
748 | } |
749 | } |
750 | #else |
751 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
752 | mono_vector[i] = SIMD_MULTIPLY(inv_root_two_vec, |
753 | SIMD_ADD(left_vector[i], right_vector[i])); |
754 | } |
755 | #endif // SIMD_SSE |
756 | const size_t leftover_samples = GetLeftoverSamples(length); |
757 | // Downmix samples at the end that were missed by the SIMD chunking. |
758 | DCHECK_GE(length, leftover_samples); |
759 | for (size_t i = length - leftover_samples; i < length; ++i) { |
760 | mono[i] = kInverseSqrtTwo * (left[i] + right[i]); |
761 | } |
762 | } |
763 | |
764 | #ifdef SIMD_NEON |
765 | |
766 | void Int16FromFloat(size_t length, const float* input, int16_t* output) { |
767 | DCHECK(input); |
768 | DCHECK(output); |
769 | |
771 | const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); |
772 | int16x4_t* output_vector = reinterpret_cast<int16x4_t*>(output); |
773 | |
  // A temporary 32 bit integer vector is needed as we only have intrinsics to
  // convert from 32 bit floats to 32 bit ints. vqmovn_s32 then narrows to 16
  // bit ints with saturation.
776 | int32x4_t temporary_wide_vector; |
777 | SimdVector temporary_float_vector; |
778 | |
779 | const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kInt16FromFloat); |
780 | |
781 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
782 | temporary_float_vector = SIMD_MULTIPLY(scaling_vector, input_vector[i]); |
783 | temporary_wide_vector = vcvtq_s32_f32(temporary_float_vector); |
784 | output_vector[i] = vqmovn_s32(temporary_wide_vector); |
785 | } |
786 | |
787 | // The remainder. |
788 | const size_t leftover_samples = GetLeftoverSamples(length); |
789 | DCHECK_GE(length, leftover_samples); |
790 | float temp_float; |
791 | for (size_t i = length - leftover_samples; i < length; ++i) { |
792 | temp_float = input[i] * kInt16FromFloat; |
793 | temp_float = std::min(kInt16Max, std::max(kInt16Min, temp_float)); |
794 | output[i] = static_cast<int16_t>(temp_float); |
795 | } |
796 | } |
797 | |
798 | void FloatFromInt16(size_t length, const int16_t* input, float* output) { |
799 | DCHECK(input); |
800 | DCHECK(output); |
801 | |
802 | size_t leftover_samples = length; |
803 | const bool input_aligned = IsAligned(input); |
804 | const bool output_aligned = IsAligned(output); |
805 | if (input_aligned || output_aligned) { |
806 | const int16x4_t* input_vector = reinterpret_cast<const int16x4_t*>(input); |
807 | SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); |
808 | |
809 | int16x4_t temporary_narrow_vector; |
810 | SimdVector temporary_float_vector; |
811 | int32x4_t temporary_wide_vector; |
812 | const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kFloatFromInt16); |
813 | |
814 | if (input_aligned && output_aligned) { |
815 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
816 | temporary_wide_vector = vmovl_s16(input_vector[i]); |
817 | output_vector[i] = vcvtq_f32_s32(temporary_wide_vector); |
818 | output_vector[i] = SIMD_MULTIPLY(scaling_vector, output_vector[i]); |
819 | } |
820 | } else if (input_aligned) { |
821 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
822 | temporary_wide_vector = vmovl_s16(input_vector[i]); |
823 | temporary_float_vector = vcvtq_f32_s32(temporary_wide_vector); |
824 | temporary_float_vector = |
825 | SIMD_MULTIPLY(scaling_vector, temporary_float_vector); |
826 | vst1q_f32(&output[i * SIMD_LENGTH], temporary_float_vector); |
827 | } |
828 | } else { |
829 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
830 | temporary_narrow_vector = vld1_s16(&input[i * SIMD_LENGTH]); |
831 | temporary_wide_vector = vmovl_s16(temporary_narrow_vector); |
832 | output_vector[i] = vcvtq_f32_s32(temporary_wide_vector); |
833 | output_vector[i] = SIMD_MULTIPLY(scaling_vector, output_vector[i]); |
834 | } |
835 | } |
836 | leftover_samples = GetLeftoverSamples(length); |
837 | } |
838 | |
839 | // The remainder. |
840 | for (size_t i = length - leftover_samples; i < length; ++i) { |
841 | output[i] = static_cast<float>(input[i]) * kFloatFromInt16; |
842 | } |
843 | } |
844 | |
845 | #elif (defined SIMD_SSE && !defined(_MSC_VER)) |
846 | |
847 | void Int16FromFloat(size_t length, const float* input, int16_t* output) { |
848 | DCHECK(input); |
849 | DCHECK(output); |
850 | |
851 | size_t leftover_samples = length; |
  const bool input_aligned = IsAligned(input);
  const bool output_aligned = IsAligned(output);
854 | if (output_aligned) { |
855 | const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input); |
856 | __m64* output_vector = reinterpret_cast<__m64*>(output); |
857 | |
858 | const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kInt16FromFloat); |
859 | const SimdVector min_vector = SIMD_LOAD_ONE_FLOAT(kInt16Min); |
860 | const SimdVector max_vector = SIMD_LOAD_ONE_FLOAT(kInt16Max); |
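    // Clamp to the int16_t range before conversion; _mm_cvtps_pi16 then
    // converts four floats to four packed 16-bit integers in an MMX register.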
861 | |
862 | SimdVector temporary_float_vector; |
863 | |
864 | if (input_aligned) { |
865 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
866 | temporary_float_vector = SIMD_MULTIPLY(scaling_vector, input_vector[i]); |
        temporary_float_vector = _mm_max_ps(temporary_float_vector, min_vector);
        temporary_float_vector = _mm_min_ps(temporary_float_vector, max_vector);
        output_vector[i] = _mm_cvtps_pi16(temporary_float_vector);
870 | } |
871 | } else { |
872 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
        temporary_float_vector = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
        temporary_float_vector =
            SIMD_MULTIPLY(scaling_vector, temporary_float_vector);
        temporary_float_vector = _mm_max_ps(temporary_float_vector, min_vector);
        temporary_float_vector = _mm_min_ps(temporary_float_vector, max_vector);
        output_vector[i] = _mm_cvtps_pi16(temporary_float_vector);
879 | } |
880 | } |
    // There is no easy way to simply store the 16 bit ints, so we don't have
    // an |input_aligned| only case.
883 | leftover_samples = GetLeftoverSamples(length); |
884 | } |
885 | |
886 | // The remainder. |
887 | float temp_float; |
  for (size_t i = length - leftover_samples; i < length; ++i) {
889 | temp_float = input[i] * kInt16FromFloat; |
    temp_float = std::min(kInt16Max, std::max(kInt16Min, temp_float));
891 | output[i] = static_cast<int16_t>(temp_float); |
892 | } |
893 | } |
894 | |
895 | void FloatFromInt16(size_t length, const int16_t* input, float* output) { |
896 | DCHECK(input); |
897 | DCHECK(output); |
898 | |
899 | size_t leftover_samples = length; |
  const bool input_aligned = IsAligned(input);
  const bool output_aligned = IsAligned(output);
902 | if (input_aligned) { |
903 | SimdVector* output_vector = reinterpret_cast<SimdVector*>(output); |
904 | const __m64* input_vector = reinterpret_cast<const __m64*>(input); |
905 | |
906 | const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kFloatFromInt16); |
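    // _mm_cvtpi16_ps widens four 16-bit samples to float; scaling by
    // 1/kInt16Max then brings them back to approximately the [-1.0f, 1.0f]
    // range.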
907 | |
908 | if (output_aligned) { |
909 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
        output_vector[i] = _mm_cvtpi16_ps(input_vector[i]);
911 | output_vector[i] = SIMD_MULTIPLY(scaling_vector, output_vector[i]); |
912 | } |
913 | } else { |
914 | SimdVector temporary_float_vector; |
915 | for (size_t i = 0; i < GetNumChunks(length); ++i) { |
        temporary_float_vector = _mm_cvtpi16_ps(input_vector[i]);
917 | temporary_float_vector = |
918 | SIMD_MULTIPLY(scaling_vector, temporary_float_vector); |
        _mm_storeu_ps(&output[i * SIMD_LENGTH], temporary_float_vector);
920 | } |
921 | } |
    // There is no easy way to simply load the 16 bit ints, so we don't have
    // an |output_aligned| only case.
924 | leftover_samples = GetLeftoverSamples(length); |
925 | } |
926 | |
927 | // The remainder. |
928 | for (size_t i = length - leftover_samples; i < length; ++i) { |
929 | output[i] = static_cast<float>(input[i]) * kFloatFromInt16; |
930 | } |
931 | } |
932 | |
933 | #else // SIMD disabled or Windows build. |
934 | |
935 | void Int16FromFloat(size_t length, const float* input, int16_t* output) { |
936 | DCHECK(input); |
937 | DCHECK(output); |
938 | |
939 | float temp_float; |
940 | for (size_t i = 0; i < length; ++i) { |
941 | temp_float = input[i] * kInt16FromFloat; |
942 | temp_float = std::min(kInt16Max, std::max(kInt16Min, temp_float)); |
943 | output[i] = static_cast<int16_t>(temp_float); |
944 | } |
945 | } |
946 | |
947 | void FloatFromInt16(size_t length, const int16_t* input, float* output) { |
948 | DCHECK(input); |
949 | DCHECK(output); |
950 | |
951 | for (size_t i = 0; i < length; ++i) { |
952 | output[i] = static_cast<float>(input[i]) * kFloatFromInt16; |
953 | } |
954 | } |
955 | |
956 | #endif // SIMD_NEON |
957 | |
958 | void InterleaveStereo(size_t length, const int16_t* channel_0, |
959 | const int16_t* channel_1, int16_t* interleaved_buffer) { |
960 | DCHECK(interleaved_buffer); |
961 | DCHECK(channel_0); |
962 | DCHECK(channel_1); |
963 | |
964 | size_t leftover_samples = length; |
965 | #ifdef SIMD_NEON |
966 | if (IsAligned(interleaved_buffer) && IsAligned(channel_0) && |
967 | IsAligned(channel_1)) { |
968 | const int16x8_t* channel_0_vec = |
969 | reinterpret_cast<const int16x8_t*>(channel_0); |
970 | const int16x8_t* channel_1_vec = |
971 | reinterpret_cast<const int16x8_t*>(channel_1); |
972 | |
973 | const size_t num_chunks = length / kSixteenBitSimdLength; |
974 | int16x8x2_t interleaved_pair; |
975 | |
976 | int16x8_t* interleaved_vec = |
977 | reinterpret_cast<int16x8_t*>(interleaved_buffer); |
978 | for (size_t i = 0, j = 0; i < num_chunks; ++i, j += 2) { |
979 | interleaved_pair = vzipq_s16(channel_0_vec[i], channel_1_vec[i]); |
980 | interleaved_vec[j] = interleaved_pair.val[0]; |
981 | interleaved_vec[j + 1] = interleaved_pair.val[1]; |
982 | } |
983 | |
984 | leftover_samples = length % kSixteenBitSimdLength; |
985 | } |
986 | #endif // SIMD_NEON |
987 | for (size_t i = length - leftover_samples; i < length; ++i) { |
988 | const size_t interleaved_index = kNumStereoChannels * i; |
989 | interleaved_buffer[interleaved_index] = channel_0[i]; |
990 | interleaved_buffer[interleaved_index + 1] = channel_1[i]; |
991 | } |
992 | } |
993 | |
994 | void InterleaveStereo(size_t length, const float* channel_0, |
995 | const float* channel_1, float* interleaved_buffer) { |
996 | DCHECK(interleaved_buffer); |
997 | DCHECK(channel_0); |
998 | DCHECK(channel_1); |
999 | |
1000 | size_t leftover_samples = length; |
1001 | #ifdef SIMD_NEON |
1002 | if (IsAligned(interleaved_buffer) && IsAligned(channel_0) && |
1003 | IsAligned(channel_1)) { |
1004 | const SimdVector* channel_0_vec = |
1005 | reinterpret_cast<const SimdVector*>(channel_0); |
1006 | const SimdVector* channel_1_vec = |
1007 | reinterpret_cast<const SimdVector*>(channel_1); |
1008 | |
1009 | const size_t num_chunks = GetNumChunks(length); |
1010 | float32x4x2_t interleaved_pair; |
1011 | |
1012 | SimdVector* interleaved_vec = |
1013 | reinterpret_cast<SimdVector*>(interleaved_buffer); |
1014 | for (size_t i = 0, j = 0; i < num_chunks; ++i, j += 2) { |
1015 | interleaved_pair = vzipq_f32(channel_0_vec[i], channel_1_vec[i]); |
1016 | interleaved_vec[j] = interleaved_pair.val[0]; |
1017 | interleaved_vec[j + 1] = interleaved_pair.val[1]; |
1018 | } |
1019 | |
1020 | leftover_samples = GetLeftoverSamples(length); |
1021 | } |
1022 | #endif // SIMD_NEON |
1023 | for (size_t i = length - leftover_samples; i < length; ++i) { |
1024 | const size_t interleaved_index = kNumStereoChannels * i; |
1025 | interleaved_buffer[interleaved_index] = channel_0[i]; |
1026 | interleaved_buffer[interleaved_index + 1] = channel_1[i]; |
1027 | } |
1028 | } |
1029 | |
1030 | void InterleaveStereo(size_t length, const float* channel_0, |
1031 | const float* channel_1, int16_t* interleaved_buffer) { |
1032 | DCHECK(interleaved_buffer); |
1033 | DCHECK(channel_0); |
1034 | DCHECK(channel_1); |
1035 | |
1036 | size_t leftover_samples = length; |
1037 | #ifdef SIMD_NEON |
1038 | if (IsAligned(interleaved_buffer) && IsAligned(channel_0) && |
1039 | IsAligned(channel_1)) { |
1040 | const SimdVector* channel_0_vec = |
1041 | reinterpret_cast<const SimdVector*>(channel_0); |
1042 | const SimdVector* channel_1_vec = |
1043 | reinterpret_cast<const SimdVector*>(channel_1); |
1044 | |
1045 | const size_t num_chunks = GetNumChunks(length); |
1046 | float32x4x2_t interleaved_pair; |
1047 | int32x4_t temporary_wide_vector; |
1048 | |
1049 | const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kInt16FromFloat); |
1050 | const SimdVector min_vector = SIMD_LOAD_ONE_FLOAT(kInt16Min); |
1051 | const SimdVector max_vector = SIMD_LOAD_ONE_FLOAT(kInt16Max); |
1052 | |
1053 | int16x4_t* interleaved_vec = |
1054 | reinterpret_cast<int16x4_t*>(interleaved_buffer); |
1055 | for (size_t i = 0; i < num_chunks; ++i) { |
1056 | const size_t interleaved_index = kNumStereoChannels * i; |
1057 | interleaved_pair = vzipq_f32(channel_0_vec[i], channel_1_vec[i]); |
1058 | interleaved_pair.val[0] = |
1059 | SIMD_MULTIPLY(scaling_vector, interleaved_pair.val[0]); |
1060 | interleaved_pair.val[0] = vmaxq_f32(interleaved_pair.val[0], min_vector); |
1061 | interleaved_pair.val[0] = vminq_f32(interleaved_pair.val[0], max_vector); |
1062 | temporary_wide_vector = vcvtq_s32_f32(interleaved_pair.val[0]); |
1063 | interleaved_vec[interleaved_index] = vqmovn_s32(temporary_wide_vector); |
1064 | interleaved_pair.val[1] = |
1065 | SIMD_MULTIPLY(scaling_vector, interleaved_pair.val[1]); |
1066 | interleaved_pair.val[1] = vmaxq_f32(interleaved_pair.val[1], min_vector); |
1067 | interleaved_pair.val[1] = vminq_f32(interleaved_pair.val[1], max_vector); |
1068 | temporary_wide_vector = vcvtq_s32_f32(interleaved_pair.val[1]); |
1069 | interleaved_vec[interleaved_index + 1] = |
1070 | vqmovn_s32(temporary_wide_vector); |
1071 | } |
1072 | |
1073 | leftover_samples = GetLeftoverSamples(length); |
1074 | } |
1075 | #endif // SIMD_NEON |
1076 | for (size_t i = length - leftover_samples; i < length; ++i) { |
1077 | const size_t interleaved_index = kNumStereoChannels * i; |
    interleaved_buffer[interleaved_index] = static_cast<int16_t>(std::max(
        kInt16Min, std::min(kInt16Max, kInt16FromFloat * channel_0[i])));
    interleaved_buffer[interleaved_index + 1] = static_cast<int16_t>(std::max(
        kInt16Min, std::min(kInt16Max, kInt16FromFloat * channel_1[i])));
1082 | } |
1083 | } |
1084 | |
1085 | void DeinterleaveStereo(size_t length, const int16_t* interleaved_buffer, |
1086 | int16_t* channel_0, int16_t* channel_1) { |
1087 | DCHECK(interleaved_buffer); |
1088 | DCHECK(channel_0); |
1089 | DCHECK(channel_1); |
1090 | |
1091 | size_t leftover_samples = length; |
1092 | #ifdef SIMD_NEON |
1093 | if (IsAligned(interleaved_buffer) && IsAligned(channel_0) && |
1094 | IsAligned(channel_1)) { |
1095 | const size_t num_chunks = length / kSixteenBitSimdLength; |
1096 | leftover_samples = length % kSixteenBitSimdLength; |
1097 | int16x8_t* channel_0_vec = reinterpret_cast<int16x8_t*>(channel_0); |
1098 | int16x8_t* channel_1_vec = reinterpret_cast<int16x8_t*>(channel_1); |
1099 | int16x8x2_t deinterleaved_pair; |
1100 | const int16x8_t* interleaved_vec = |
1101 | reinterpret_cast<const int16x8_t*>(interleaved_buffer); |
1102 | for (size_t chunk = 0; chunk < num_chunks; ++chunk) { |
1103 | const size_t interleaved_index = chunk * kNumStereoChannels; |
1104 | deinterleaved_pair = vuzpq_s16(interleaved_vec[interleaved_index], |
1105 | interleaved_vec[interleaved_index + 1]); |
1106 | channel_0_vec[chunk] = deinterleaved_pair.val[0]; |
1107 | channel_1_vec[chunk] = deinterleaved_pair.val[1]; |
1108 | } |
1109 | } |
1110 | #endif // SIMD_NEON |
1111 | for (size_t i = length - leftover_samples; i < length; ++i) { |
1112 | const size_t interleaved_index = kNumStereoChannels * i; |
1113 | channel_0[i] = interleaved_buffer[interleaved_index]; |
1114 | channel_1[i] = interleaved_buffer[interleaved_index + 1]; |
1115 | } |
1116 | } |
1117 | |
1118 | void DeinterleaveStereo(size_t length, const float* interleaved_buffer, |
1119 | float* channel_0, float* channel_1) { |
1120 | DCHECK(interleaved_buffer); |
1121 | DCHECK(channel_0); |
1122 | DCHECK(channel_1); |
1123 | |
1124 | size_t leftover_samples = length; |
1125 | #ifdef SIMD_NEON |
1126 | if (IsAligned(interleaved_buffer) && IsAligned(channel_0) && |
1127 | IsAligned(channel_1)) { |
1128 | const size_t num_chunks = GetNumChunks(length); |
1129 | leftover_samples = GetLeftoverSamples(length); |
1130 | SimdVector* channel_0_vec = reinterpret_cast<SimdVector*>(channel_0); |
1131 | SimdVector* channel_1_vec = reinterpret_cast<SimdVector*>(channel_1); |
1132 | float32x4x2_t deinterleaved_pair; |
1133 | |
1134 | const SimdVector* interleaved_vec = |
1135 | reinterpret_cast<const SimdVector*>(interleaved_buffer); |
1136 | for (size_t chunk = 0; chunk < num_chunks; ++chunk) { |
1137 | const size_t interleaved_index = chunk * kNumStereoChannels; |
1138 | deinterleaved_pair = vuzpq_f32(interleaved_vec[interleaved_index], |
1139 | interleaved_vec[interleaved_index + 1]); |
1140 | channel_0_vec[chunk] = deinterleaved_pair.val[0]; |
1141 | channel_1_vec[chunk] = deinterleaved_pair.val[1]; |
1142 | } |
1143 | } |
1144 | #endif // SIMD_NEON |
1145 | for (size_t i = length - leftover_samples; i < length; ++i) { |
1146 | const size_t interleaved_index = kNumStereoChannels * i; |
1147 | channel_0[i] = interleaved_buffer[interleaved_index]; |
1148 | channel_1[i] = interleaved_buffer[interleaved_index + 1]; |
1149 | } |
1150 | } |
1151 | |
1152 | void DeinterleaveStereo(size_t length, const int16_t* interleaved_buffer, |
1153 | float* channel_0, float* channel_1) { |
1154 | DCHECK(interleaved_buffer); |
1155 | DCHECK(channel_0); |
1156 | DCHECK(channel_1); |
1157 | |
1158 | size_t leftover_samples = length; |
1159 | #ifdef SIMD_NEON |
1160 | if (IsAligned(interleaved_buffer) && IsAligned(channel_0) && |
1161 | IsAligned(channel_1)) { |
1162 | const size_t num_chunks = GetNumChunks(length); |
1163 | leftover_samples = GetLeftoverSamples(length); |
1164 | SimdVector* channel_0_vec = reinterpret_cast<SimdVector*>(channel_0); |
1165 | SimdVector* channel_1_vec = reinterpret_cast<SimdVector*>(channel_1); |
1166 | int16x4x2_t deinterleaved_pair; |
1167 | int32x4_t temporary_wide; |
1168 | const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kFloatFromInt16); |
1169 | |
1170 | const int16x4_t* interleaved_vec = |
1171 | reinterpret_cast<const int16x4_t*>(interleaved_buffer); |
1172 | for (size_t chunk = 0; chunk < num_chunks; ++chunk) { |
1173 | const size_t interleaved_index = chunk * kNumStereoChannels; |
1174 | deinterleaved_pair = vuzp_s16(interleaved_vec[interleaved_index], |
1175 | interleaved_vec[interleaved_index + 1]); |
1176 | temporary_wide = vmovl_s16(deinterleaved_pair.val[0]); |
1177 | channel_0_vec[chunk] = vcvtq_f32_s32(temporary_wide); |
1178 | channel_0_vec[chunk] = |
1179 | SIMD_MULTIPLY(scaling_vector, channel_0_vec[chunk]); |
1180 | temporary_wide = vmovl_s16(deinterleaved_pair.val[1]); |
1181 | channel_1_vec[chunk] = vcvtq_f32_s32(temporary_wide); |
1182 | channel_1_vec[chunk] = |
1183 | SIMD_MULTIPLY(scaling_vector, channel_1_vec[chunk]); |
1184 | } |
1185 | } |
1186 | #endif // SIMD_NEON |
1187 | for (size_t i = length - leftover_samples; i < length; ++i) { |
1188 | const size_t interleaved_index = kNumStereoChannels * i; |
1189 | channel_0[i] = static_cast<float>(interleaved_buffer[interleaved_index]) * |
1190 | kFloatFromInt16; |
1191 | channel_1[i] = |
1192 | static_cast<float>(interleaved_buffer[interleaved_index + 1]) * |
1193 | kFloatFromInt16; |
1194 | } |
1195 | } |
1196 | |
1197 | void InterleaveQuad(size_t length, const int16_t* channel_0, |
1198 | const int16_t* channel_1, const int16_t* channel_2, |
1199 | const int16_t* channel_3, int16_t* workspace, |
1200 | int16_t* interleaved_buffer) { |
1201 | #ifdef SIMD_NEON |
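  // Interleave (channel_0, channel_2) and (channel_1, channel_3) into the two
  // aligned halves of |workspace|, then interleave those halves again: this
  // yields the fully interleaved order 0, 1, 2, 3 using the stereo NEON path.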
1202 | DCHECK(IsAligned(workspace)); |
1203 | const size_t double_length = length * 2; |
1204 | int16_t* workspace_half_point = |
1205 | workspace + FindNextAlignedArrayIndex(double_length, sizeof(int16_t), |
1206 | kMemoryAlignmentBytes); |
1207 | InterleaveStereo(length, channel_0, channel_2, workspace); |
1208 | InterleaveStereo(length, channel_1, channel_3, workspace_half_point); |
1209 | InterleaveStereo(double_length, workspace, workspace_half_point, |
1210 | interleaved_buffer); |
1211 | #else |
1212 | for (size_t i = 0; i < length; ++i) { |
1213 | const size_t interleaved_index = kNumFirstOrderAmbisonicChannels * i; |
1214 | interleaved_buffer[interleaved_index] = channel_0[i]; |
1215 | interleaved_buffer[interleaved_index + 1] = channel_1[i]; |
1216 | interleaved_buffer[interleaved_index + 2] = channel_2[i]; |
1217 | interleaved_buffer[interleaved_index + 3] = channel_3[i]; |
1218 | } |
1219 | #endif // SIMD_NEON |
1220 | } |
1221 | |
1222 | void InterleaveQuad(size_t length, const float* channel_0, |
1223 | const float* channel_1, const float* channel_2, |
1224 | const float* channel_3, float* workspace, |
1225 | float* interleaved_buffer) { |
1226 | #ifdef SIMD_NEON |
1227 | DCHECK(IsAligned(workspace)); |
1228 | const size_t double_length = length * 2; |
1229 | float* workspace_half_point = |
1230 | workspace + FindNextAlignedArrayIndex(double_length, sizeof(float), |
1231 | kMemoryAlignmentBytes); |
1232 | DCHECK(IsAligned(workspace_half_point)); |
1233 | InterleaveStereo(length, channel_0, channel_2, workspace); |
1234 | InterleaveStereo(length, channel_1, channel_3, workspace_half_point); |
1235 | InterleaveStereo(double_length, workspace, workspace_half_point, |
1236 | interleaved_buffer); |
1237 | #else |
1238 | for (size_t i = 0; i < length; ++i) { |
1239 | const size_t interleaved_index = kNumFirstOrderAmbisonicChannels * i; |
1240 | interleaved_buffer[interleaved_index] = channel_0[i]; |
1241 | interleaved_buffer[interleaved_index + 1] = channel_1[i]; |
1242 | interleaved_buffer[interleaved_index + 2] = channel_2[i]; |
1243 | interleaved_buffer[interleaved_index + 3] = channel_3[i]; |
1244 | } |
1245 | #endif // SIMD_NEON |
1246 | } |
1247 | |
1248 | void DeinterleaveQuad(size_t length, const int16_t* interleaved_buffer, |
1249 | int16_t* workspace, int16_t* channel_0, |
1250 | int16_t* channel_1, int16_t* channel_2, |
1251 | int16_t* channel_3) { |
1252 | #ifdef SIMD_NEON |
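  // Reverse of InterleaveQuad(): de-interleave the buffer into two stereo
  // halves of |workspace|, then de-interleave each half to recover channel
  // pairs (0, 2) and (1, 3).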
1253 | DCHECK(IsAligned(workspace)); |
1254 | const size_t double_length = length * 2; |
1255 | int16_t* workspace_half_point = |
1256 | workspace + FindNextAlignedArrayIndex(double_length, sizeof(int16_t), |
1257 | kMemoryAlignmentBytes); |
1258 | DCHECK(IsAligned(workspace_half_point)); |
1259 | DeinterleaveStereo(double_length, interleaved_buffer, workspace, |
1260 | workspace_half_point); |
1261 | DeinterleaveStereo(length, workspace, channel_0, channel_2); |
1262 | DeinterleaveStereo(length, workspace_half_point, channel_1, channel_3); |
1263 | #else |
1264 | for (size_t i = 0; i < length; ++i) { |
1265 | const size_t interleaved_index = kNumFirstOrderAmbisonicChannels * i; |
1266 | channel_0[i] = interleaved_buffer[interleaved_index]; |
1267 | channel_1[i] = interleaved_buffer[interleaved_index + 1]; |
1268 | channel_2[i] = interleaved_buffer[interleaved_index + 2]; |
1269 | channel_3[i] = interleaved_buffer[interleaved_index + 3]; |
1270 | } |
1271 | #endif // SIMD_NEON |
1272 | } |
1273 | |
1274 | void DeinterleaveQuad(size_t length, const float* interleaved_buffer, |
1275 | float* workspace, float* channel_0, float* channel_1, |
1276 | float* channel_2, float* channel_3) { |
1277 | #ifdef SIMD_NEON |
1278 | DCHECK(IsAligned(workspace)); |
1279 | const size_t double_length = length * 2; |
1280 | float* workspace_half_point = |
1281 | workspace + FindNextAlignedArrayIndex(double_length, sizeof(float), |
1282 | kMemoryAlignmentBytes); |
1283 | DCHECK(IsAligned(workspace_half_point)); |
1284 | DeinterleaveStereo(double_length, interleaved_buffer, workspace, |
1285 | workspace_half_point); |
1286 | DeinterleaveStereo(length, workspace, channel_0, channel_2); |
1287 | DeinterleaveStereo(length, workspace_half_point, channel_1, channel_3); |
1288 | #else |
1289 | for (size_t i = 0; i < length; ++i) { |
1290 | const size_t interleaved_index = kNumFirstOrderAmbisonicChannels * i; |
1291 | channel_0[i] = interleaved_buffer[interleaved_index]; |
1292 | channel_1[i] = interleaved_buffer[interleaved_index + 1]; |
1293 | channel_2[i] = interleaved_buffer[interleaved_index + 2]; |
1294 | channel_3[i] = interleaved_buffer[interleaved_index + 3]; |
1295 | } |
1296 | #endif // SIMD_NEON |
1297 | } |
1298 | |
1299 | } // namespace vraudio |
1300 | |