Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_cos_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
58
59#include <inttypes.h>
60#include <math.h>
61#include <stdio.h>
62
63#ifndef INCLUDED_volk_32f_cos_32f_a_H
64#define INCLUDED_volk_32f_cos_32f_a_H
65
66#ifdef LV_HAVE_GENERIC
67
68static inline void
69volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
70{
71 float* bPtr = bVector;
72 const float* aPtr = aVector;
73 unsigned int number = 0;
74
75 for (; number < num_points; number++) {
76 *bPtr++ = cosf(*aPtr++);
77 }
78}
79
80#endif /* LV_HAVE_GENERIC */
81
82#ifdef LV_HAVE_GENERIC
83#include <volk/volk_common.h>
84
85static inline void
86volk_32f_cos_32f_polynomial(float* bVector, const float* aVector, unsigned int num_points)
87{
88 for (unsigned int number = 0; number < num_points; number++) {
89 *bVector++ = volk_cos(*aVector++);
90 }
91}
92#endif /* LV_HAVE_GENERIC */
93
94#ifdef LV_HAVE_AVX512F
95#include <immintrin.h>
97
98static inline void volk_32f_cos_32f_a_avx512f(float* cosVector,
99 const float* inVector,
100 unsigned int num_points)
101{
102 float* cosPtr = cosVector;
103 const float* inPtr = inVector;
104
105 unsigned int number = 0;
106 unsigned int sixteenPoints = num_points / 16;
107
108 // Constants for Cody-Waite argument reduction
109 // n = round(x * 2/pi), then r = x - n * pi/2
110 const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f); // 2/pi
111 const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f); // pi/2 high
112 const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f); // pi/2 low
113
114 const __m512i ones = _mm512_set1_epi32(1);
115 const __m512i twos = _mm512_set1_epi32(2);
116 const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
117
118 for (; number < sixteenPoints; number++) {
119 __m512 x = _mm512_load_ps(inPtr);
120
121 // Argument reduction: n = round(x * 2/pi)
122 __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
123 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
124 __m512i n = _mm512_cvtps_epi32(n_f);
125
126 // r = x - n * (pi/2), using extended precision
127 __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
128 r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
129
130 // Evaluate both sin and cos polynomials
131 __m512 sin_r = _mm512_sin_poly_avx512(r);
132 __m512 cos_r = _mm512_cos_poly_avx512(r);
133
134 // Reconstruct cos(x) based on quadrant (n mod 4):
135 // n&1 == 0: use cos_r, n&1 == 1: use sin_r
136 // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
137 __m512i n_and_1 = _mm512_and_si512(n, ones);
138 __m512i n_plus_1_and_2 = _mm512_and_si512(_mm512_add_epi32(n, ones), twos);
139
140 // swap_mask: where n&1 != 0, we use sin instead of cos
141 __mmask16 swap_mask = _mm512_cmpeq_epi32_mask(n_and_1, ones);
142 __m512 result = _mm512_mask_blend_ps(swap_mask, cos_r, sin_r);
143
144 // neg_mask: where (n+1)&2 != 0, we negate the result (use integer xor for
145 // AVX512F)
146 __mmask16 neg_mask = _mm512_cmpeq_epi32_mask(n_plus_1_and_2, twos);
147 result = _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(result),
148 neg_mask,
149 _mm512_castps_si512(result),
150 sign_bit));
151
152 _mm512_store_ps(cosPtr, result);
153 inPtr += 16;
154 cosPtr += 16;
155 }
156
157 number = sixteenPoints * 16;
158 for (; number < num_points; number++) {
159 *cosPtr++ = cosf(*inPtr++);
160 }
161}
162#endif /* LV_HAVE_AVX512F */
163
164#if LV_HAVE_AVX2 && LV_HAVE_FMA
165#include <immintrin.h>
167
168static inline void
169volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
170{
171 float* bPtr = bVector;
172 const float* aPtr = aVector;
173
174 unsigned int number = 0;
175 unsigned int eighthPoints = num_points / 8;
176
177 // Constants for Cody-Waite argument reduction
178 // n = round(x * 2/pi), then r = x - n * pi/2
179 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
180 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
181 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
182
183 const __m256i ones = _mm256_set1_epi32(1);
184 const __m256i twos = _mm256_set1_epi32(2);
185 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
186
187 for (; number < eighthPoints; number++) {
188 __m256 x = _mm256_load_ps(aPtr);
189
190 // Argument reduction: n = round(x * 2/pi)
191 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
192 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
193 __m256i n = _mm256_cvtps_epi32(n_f);
194
195 // r = x - n * (pi/2), using extended precision
196 __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
197 r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
198
199 // Evaluate both sin and cos polynomials
200 __m256 sin_r = _mm256_sin_poly_avx2_fma(r);
201 __m256 cos_r = _mm256_cos_poly_avx2_fma(r);
202
203 // Reconstruct cos(x) based on quadrant (n mod 4):
204 // n&1 == 0: use cos_r, n&1 == 1: use sin_r
205 // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
206 __m256i n_and_1 = _mm256_and_si256(n, ones);
207 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
208
209 // swap_mask: where n&1 != 0, we use sin instead of cos
210 __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
211 __m256 result = _mm256_blendv_ps(cos_r, sin_r, swap_mask);
212
213 // neg_mask: where (n+1)&2 != 0, we negate the result
214 __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
215 result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
216
217 _mm256_store_ps(bPtr, result);
218 aPtr += 8;
219 bPtr += 8;
220 }
221
222 number = eighthPoints * 8;
223 for (; number < num_points; number++) {
224 *bPtr++ = cosf(*aPtr++);
225 }
226}
227
228#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
229
230#ifdef LV_HAVE_AVX2
231#include <immintrin.h>
233
234static inline void
235volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
236{
237 float* bPtr = bVector;
238 const float* aPtr = aVector;
239
240 unsigned int number = 0;
241 unsigned int eighthPoints = num_points / 8;
242
243 // Constants for Cody-Waite argument reduction
244 // n = round(x * 2/pi), then r = x - n * pi/2
245 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
246 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
247 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
248
249 const __m256i ones = _mm256_set1_epi32(1);
250 const __m256i twos = _mm256_set1_epi32(2);
251 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
252
253 for (; number < eighthPoints; number++) {
254 __m256 x = _mm256_load_ps(aPtr);
255
256 // Argument reduction: n = round(x * 2/pi)
257 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
258 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
259 __m256i n = _mm256_cvtps_epi32(n_f);
260
261 // r = x - n * (pi/2), using extended precision
262 __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
263 r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
264
265 // Evaluate both sin and cos polynomials
266 __m256 sin_r = _mm256_sin_poly_avx2(r);
267 __m256 cos_r = _mm256_cos_poly_avx2(r);
268
269 // Reconstruct cos(x) based on quadrant (n mod 4):
270 // n&1 == 0: use cos_r, n&1 == 1: use sin_r
271 // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
272 __m256i n_and_1 = _mm256_and_si256(n, ones);
273 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
274
275 // swap_mask: where n&1 != 0, we use sin instead of cos
276 __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
277 __m256 result = _mm256_blendv_ps(cos_r, sin_r, swap_mask);
278
279 // neg_mask: where (n+1)&2 != 0, we negate the result
280 __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
281 result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
282
283 _mm256_store_ps(bPtr, result);
284 aPtr += 8;
285 bPtr += 8;
286 }
287
288 number = eighthPoints * 8;
289 for (; number < num_points; number++) {
290 *bPtr++ = cosf(*aPtr++);
291 }
292}
293
294#endif /* LV_HAVE_AVX2 */
295
296#ifdef LV_HAVE_SSE4_1
297#include <smmintrin.h>
299
300static inline void
301volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
302{
303 float* bPtr = bVector;
304 const float* aPtr = aVector;
305
306 unsigned int number = 0;
307 unsigned int quarterPoints = num_points / 4;
308
309 // Constants for Cody-Waite argument reduction
310 // n = round(x * 2/pi), then r = x - n * pi/2
311 const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f); // 2/pi
312 const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f); // pi/2 high
313 const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f); // pi/2 low
314
315 const __m128i ones = _mm_set1_epi32(1);
316 const __m128i twos = _mm_set1_epi32(2);
317 const __m128 sign_bit = _mm_set1_ps(-0.0f);
318
319 for (; number < quarterPoints; number++) {
320 __m128 x = _mm_load_ps(aPtr);
321
322 // Argument reduction: n = round(x * 2/pi)
323 __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
324 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
325 __m128i n = _mm_cvtps_epi32(n_f);
326
327 // r = x - n * (pi/2), using extended precision
328 __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
329 r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
330
331 // Evaluate both sin and cos polynomials
332 __m128 sin_r = _mm_sin_poly_sse(r);
333 __m128 cos_r = _mm_cos_poly_sse(r);
334
335 // Reconstruct cos(x) based on quadrant (n mod 4):
336 // n&1 == 0: use cos_r, n&1 == 1: use sin_r
337 // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
338 __m128i n_and_1 = _mm_and_si128(n, ones);
339 __m128i n_plus_1_and_2 = _mm_and_si128(_mm_add_epi32(n, ones), twos);
340
341 // swap_mask: where n&1 != 0, we use sin instead of cos
342 __m128 swap_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
343 __m128 result = _mm_blendv_ps(cos_r, sin_r, swap_mask);
344
345 // neg_mask: where (n+1)&2 != 0, we negate the result
346 __m128 neg_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_plus_1_and_2, twos));
347 result = _mm_xor_ps(result, _mm_and_ps(neg_mask, sign_bit));
348
349 _mm_store_ps(bPtr, result);
350 aPtr += 4;
351 bPtr += 4;
352 }
353
354 number = quarterPoints * 4;
355 for (; number < num_points; number++) {
356 *bPtr++ = cosf(*aPtr++);
357 }
358}
359
360#endif /* LV_HAVE_SSE4_1 */
361
362#endif /* INCLUDED_volk_32f_cos_32f_a_H */
363
364
365#ifndef INCLUDED_volk_32f_cos_32f_u_H
366#define INCLUDED_volk_32f_cos_32f_u_H
367
368#ifdef LV_HAVE_AVX512F
369#include <immintrin.h>
371
372static inline void volk_32f_cos_32f_u_avx512f(float* cosVector,
373 const float* inVector,
374 unsigned int num_points)
375{
376 float* cosPtr = cosVector;
377 const float* inPtr = inVector;
378
379 unsigned int number = 0;
380 unsigned int sixteenPoints = num_points / 16;
381
382 // Constants for Cody-Waite argument reduction
383 // n = round(x * 2/pi), then r = x - n * pi/2
384 const __m512 two_over_pi = _mm512_set1_ps(0x1.45f306p-1f); // 2/pi
385 const __m512 pi_over_2_hi = _mm512_set1_ps(0x1.921fb6p+0f); // pi/2 high
386 const __m512 pi_over_2_lo = _mm512_set1_ps(-0x1.777a5cp-25f); // pi/2 low
387
388 const __m512i ones = _mm512_set1_epi32(1);
389 const __m512i twos = _mm512_set1_epi32(2);
390 const __m512i sign_bit = _mm512_set1_epi32(0x80000000);
391
392 for (; number < sixteenPoints; number++) {
393 __m512 x = _mm512_loadu_ps(inPtr);
394
395 // Argument reduction: n = round(x * 2/pi)
396 __m512 n_f = _mm512_roundscale_ps(_mm512_mul_ps(x, two_over_pi),
397 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
398 __m512i n = _mm512_cvtps_epi32(n_f);
399
400 // r = x - n * (pi/2), using extended precision
401 __m512 r = _mm512_fnmadd_ps(n_f, pi_over_2_hi, x);
402 r = _mm512_fnmadd_ps(n_f, pi_over_2_lo, r);
403
404 // Evaluate both sin and cos polynomials
405 __m512 sin_r = _mm512_sin_poly_avx512(r);
406 __m512 cos_r = _mm512_cos_poly_avx512(r);
407
408 // Reconstruct cos(x) based on quadrant (n mod 4):
409 // n&1 == 0: use cos_r, n&1 == 1: use sin_r
410 // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
411 __m512i n_and_1 = _mm512_and_si512(n, ones);
412 __m512i n_plus_1_and_2 = _mm512_and_si512(_mm512_add_epi32(n, ones), twos);
413
414 // swap_mask: where n&1 != 0, we use sin instead of cos
415 __mmask16 swap_mask = _mm512_cmpeq_epi32_mask(n_and_1, ones);
416 __m512 result = _mm512_mask_blend_ps(swap_mask, cos_r, sin_r);
417
418 // neg_mask: where (n+1)&2 != 0, we negate the result (use integer xor for
419 // AVX512F)
420 __mmask16 neg_mask = _mm512_cmpeq_epi32_mask(n_plus_1_and_2, twos);
421 result = _mm512_castsi512_ps(_mm512_mask_xor_epi32(_mm512_castps_si512(result),
422 neg_mask,
423 _mm512_castps_si512(result),
424 sign_bit));
425
426 _mm512_storeu_ps(cosPtr, result);
427 inPtr += 16;
428 cosPtr += 16;
429 }
430
431 number = sixteenPoints * 16;
432 for (; number < num_points; number++) {
433 *cosPtr++ = cosf(*inPtr++);
434 }
435}
436#endif /* LV_HAVE_AVX512F */
437
438#if LV_HAVE_AVX2 && LV_HAVE_FMA
439#include <immintrin.h>
441
442static inline void
443volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
444{
445 float* bPtr = bVector;
446 const float* aPtr = aVector;
447
448 unsigned int number = 0;
449 unsigned int eighthPoints = num_points / 8;
450
451 // Constants for Cody-Waite argument reduction
452 // n = round(x * 2/pi), then r = x - n * pi/2
453 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
454 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
455 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
456
457 const __m256i ones = _mm256_set1_epi32(1);
458 const __m256i twos = _mm256_set1_epi32(2);
459 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
460
461 for (; number < eighthPoints; number++) {
462 __m256 x = _mm256_loadu_ps(aPtr);
463
464 // Argument reduction: n = round(x * 2/pi)
465 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
466 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
467 __m256i n = _mm256_cvtps_epi32(n_f);
468
469 // r = x - n * (pi/2), using extended precision
470 __m256 r = _mm256_fnmadd_ps(n_f, pi_over_2_hi, x);
471 r = _mm256_fnmadd_ps(n_f, pi_over_2_lo, r);
472
473 // Evaluate both sin and cos polynomials
474 __m256 sin_r = _mm256_sin_poly_avx2_fma(r);
475 __m256 cos_r = _mm256_cos_poly_avx2_fma(r);
476
477 // Reconstruct cos(x) based on quadrant (n mod 4):
478 // n&1 == 0: use cos_r, n&1 == 1: use sin_r
479 // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
480 __m256i n_and_1 = _mm256_and_si256(n, ones);
481 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
482
483 // swap_mask: where n&1 != 0, we use sin instead of cos
484 __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
485 __m256 result = _mm256_blendv_ps(cos_r, sin_r, swap_mask);
486
487 // neg_mask: where (n+1)&2 != 0, we negate the result
488 __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
489 result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
490
491 _mm256_storeu_ps(bPtr, result);
492 aPtr += 8;
493 bPtr += 8;
494 }
495
496 number = eighthPoints * 8;
497 for (; number < num_points; number++) {
498 *bPtr++ = cosf(*aPtr++);
499 }
500}
501
502#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
503
504#ifdef LV_HAVE_AVX2
505#include <immintrin.h>
507
508static inline void
509volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
510{
511 float* bPtr = bVector;
512 const float* aPtr = aVector;
513
514 unsigned int number = 0;
515 unsigned int eighthPoints = num_points / 8;
516
517 // Constants for Cody-Waite argument reduction
518 // n = round(x * 2/pi), then r = x - n * pi/2
519 const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f); // 2/pi
520 const __m256 pi_over_2_hi = _mm256_set1_ps(0x1.921fb6p+0f); // pi/2 high
521 const __m256 pi_over_2_lo = _mm256_set1_ps(-0x1.777a5cp-25f); // pi/2 low
522
523 const __m256i ones = _mm256_set1_epi32(1);
524 const __m256i twos = _mm256_set1_epi32(2);
525 const __m256 sign_bit = _mm256_set1_ps(-0.0f);
526
527 for (; number < eighthPoints; number++) {
528 __m256 x = _mm256_loadu_ps(aPtr);
529
530 // Argument reduction: n = round(x * 2/pi)
531 __m256 n_f = _mm256_round_ps(_mm256_mul_ps(x, two_over_pi),
532 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
533 __m256i n = _mm256_cvtps_epi32(n_f);
534
535 // r = x - n * (pi/2), using extended precision
536 __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(n_f, pi_over_2_hi));
537 r = _mm256_sub_ps(r, _mm256_mul_ps(n_f, pi_over_2_lo));
538
539 // Evaluate both sin and cos polynomials
540 __m256 sin_r = _mm256_sin_poly_avx2(r);
541 __m256 cos_r = _mm256_cos_poly_avx2(r);
542
543 // Reconstruct cos(x) based on quadrant (n mod 4):
544 // n&1 == 0: use cos_r, n&1 == 1: use sin_r
545 // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
546 __m256i n_and_1 = _mm256_and_si256(n, ones);
547 __m256i n_plus_1_and_2 = _mm256_and_si256(_mm256_add_epi32(n, ones), twos);
548
549 // swap_mask: where n&1 != 0, we use sin instead of cos
550 __m256 swap_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_and_1, ones));
551 __m256 result = _mm256_blendv_ps(cos_r, sin_r, swap_mask);
552
553 // neg_mask: where (n+1)&2 != 0, we negate the result
554 __m256 neg_mask = _mm256_castsi256_ps(_mm256_cmpeq_epi32(n_plus_1_and_2, twos));
555 result = _mm256_xor_ps(result, _mm256_and_ps(neg_mask, sign_bit));
556
557 _mm256_storeu_ps(bPtr, result);
558 aPtr += 8;
559 bPtr += 8;
560 }
561
562 number = eighthPoints * 8;
563 for (; number < num_points; number++) {
564 *bPtr++ = cosf(*aPtr++);
565 }
566}
567
568#endif /* LV_HAVE_AVX2 */
569
570#ifdef LV_HAVE_SSE4_1
571#include <smmintrin.h>
573
574static inline void
575volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
576{
577 float* bPtr = bVector;
578 const float* aPtr = aVector;
579
580 unsigned int number = 0;
581 unsigned int quarterPoints = num_points / 4;
582
583 // Constants for Cody-Waite argument reduction
584 // n = round(x * 2/pi), then r = x - n * pi/2
585 const __m128 two_over_pi = _mm_set1_ps(0x1.45f306p-1f); // 2/pi
586 const __m128 pi_over_2_hi = _mm_set1_ps(0x1.921fb6p+0f); // pi/2 high
587 const __m128 pi_over_2_lo = _mm_set1_ps(-0x1.777a5cp-25f); // pi/2 low
588
589 const __m128i ones = _mm_set1_epi32(1);
590 const __m128i twos = _mm_set1_epi32(2);
591 const __m128 sign_bit = _mm_set1_ps(-0.0f);
592
593 for (; number < quarterPoints; number++) {
594 __m128 x = _mm_loadu_ps(aPtr);
595
596 // Argument reduction: n = round(x * 2/pi)
597 __m128 n_f = _mm_round_ps(_mm_mul_ps(x, two_over_pi),
598 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
599 __m128i n = _mm_cvtps_epi32(n_f);
600
601 // r = x - n * (pi/2), using extended precision
602 __m128 r = _mm_sub_ps(x, _mm_mul_ps(n_f, pi_over_2_hi));
603 r = _mm_sub_ps(r, _mm_mul_ps(n_f, pi_over_2_lo));
604
605 // Evaluate both sin and cos polynomials
606 __m128 sin_r = _mm_sin_poly_sse(r);
607 __m128 cos_r = _mm_cos_poly_sse(r);
608
609 // Reconstruct cos(x) based on quadrant (n mod 4):
610 // n&1 == 0: use cos_r, n&1 == 1: use sin_r
611 // (n+1)&2 == 0: positive, (n+1)&2 != 0: negative
612 __m128i n_and_1 = _mm_and_si128(n, ones);
613 __m128i n_plus_1_and_2 = _mm_and_si128(_mm_add_epi32(n, ones), twos);
614
615 // swap_mask: where n&1 != 0, we use sin instead of cos
616 __m128 swap_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_and_1, ones));
617 __m128 result = _mm_blendv_ps(cos_r, sin_r, swap_mask);
618
619 // neg_mask: where (n+1)&2 != 0, we negate the result
620 __m128 neg_mask = _mm_castsi128_ps(_mm_cmpeq_epi32(n_plus_1_and_2, twos));
621 result = _mm_xor_ps(result, _mm_and_ps(neg_mask, sign_bit));
622
623 _mm_storeu_ps(bPtr, result);
624 aPtr += 4;
625 bPtr += 4;
626 }
627
628 number = quarterPoints * 4;
629 for (; number < num_points; number++) {
630 *bPtr++ = cosf(*aPtr++);
631 }
632}
633
634#endif /* LV_HAVE_SSE4_1 */
635
636
637#ifdef LV_HAVE_NEON
638#include <arm_neon.h>
640
641/* NEON polynomial-based cos using Cody-Waite argument reduction */
642static inline void
643volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
644{
645 // Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
646 const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f); // 2/pi
647 const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f); // pi/2 high
648 const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f); // pi/2 low
649
650 const int32x4_t ones = vdupq_n_s32(1);
651 const int32x4_t twos = vdupq_n_s32(2);
652 const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
653 const float32x4_t half = vdupq_n_f32(0.5f);
654 const float32x4_t neg_half = vdupq_n_f32(-0.5f);
655 const float32x4_t fzeroes = vdupq_n_f32(0.0f);
656
657 unsigned int number = 0;
658 const unsigned int quarterPoints = num_points / 4;
659
660 for (; number < quarterPoints; number++) {
661 float32x4_t x = vld1q_f32(aVector);
662 aVector += 4;
663
664 // n = round(x * 2/pi) - emulate round-to-nearest for ARMv7
665 float32x4_t scaled = vmulq_f32(x, two_over_pi);
666 uint32x4_t is_neg = vcltq_f32(scaled, fzeroes);
667 float32x4_t adj = vbslq_f32(is_neg, neg_half, half);
668 float32x4_t n_f = vcvtq_f32_s32(vcvtq_s32_f32(vaddq_f32(scaled, adj)));
669 int32x4_t n = vcvtq_s32_f32(n_f);
670
671 // r = x - n * (pi/2) using extended precision
672 float32x4_t r = vmlsq_f32(x, n_f, pi_over_2_hi);
673 r = vmlsq_f32(r, n_f, pi_over_2_lo);
674
675 // Evaluate sin and cos polynomials
676 float32x4_t sin_r = _vsin_poly_f32(r);
677 float32x4_t cos_r = _vcos_poly_f32(r);
678
679 // Quadrant-based reconstruction for cos:
680 // n&1 == 0: use cos_r, n&1 == 1: use sin_r
681 // (n+1)&2 == 0: positive, (n+1)&2 == 2: negative
682 int32x4_t n_and_1 = vandq_s32(n, ones);
683 int32x4_t n_plus_1_and_2 = vandq_s32(vaddq_s32(n, ones), twos);
684
685 uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
686 float32x4_t result = vbslq_f32(swap_mask, sin_r, cos_r);
687
688 uint32x4_t neg_mask = vceqq_s32(n_plus_1_and_2, twos);
689 result = vreinterpretq_f32_u32(
690 veorq_u32(vreinterpretq_u32_f32(result),
691 vandq_u32(neg_mask, vreinterpretq_u32_f32(sign_bit))));
692
693 vst1q_f32(bVector, result);
694 bVector += 4;
695 }
696
697 for (number = quarterPoints * 4; number < num_points; number++) {
698 *bVector++ = cosf(*aVector++);
699 }
700}
701#endif /* LV_HAVE_NEON */
702
703#ifdef LV_HAVE_NEONV8
704#include <arm_neon.h>
706
707/* NEONv8 polynomial-based cos using Cody-Waite argument reduction with FMA */
708static inline void
709volk_32f_cos_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
710{
711 // Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
712 const float32x4_t two_over_pi = vdupq_n_f32(0x1.45f306p-1f); // 2/pi
713 const float32x4_t pi_over_2_hi = vdupq_n_f32(0x1.921fb6p+0f); // pi/2 high
714 const float32x4_t pi_over_2_lo = vdupq_n_f32(-0x1.777a5cp-25f); // pi/2 low
715
716 const int32x4_t ones = vdupq_n_s32(1);
717 const int32x4_t twos = vdupq_n_s32(2);
718 const float32x4_t sign_bit = vdupq_n_f32(-0.0f);
719
720 unsigned int number = 0;
721 const unsigned int quarterPoints = num_points / 4;
722
723 for (; number < quarterPoints; number++) {
724 float32x4_t x = vld1q_f32(aVector);
725 aVector += 4;
726
727 // n = round(x * 2/pi) using ARMv8 vrndnq_f32
728 float32x4_t n_f = vrndnq_f32(vmulq_f32(x, two_over_pi));
729 int32x4_t n = vcvtq_s32_f32(n_f);
730
731 // r = x - n * (pi/2) using FMA for extended precision
732 float32x4_t r = vfmsq_f32(x, n_f, pi_over_2_hi);
733 r = vfmsq_f32(r, n_f, pi_over_2_lo);
734
735 // Evaluate sin and cos polynomials using FMA
736 float32x4_t sin_r = _vsin_poly_neonv8(r);
737 float32x4_t cos_r = _vcos_poly_neonv8(r);
738
739 // Quadrant-based reconstruction for cos:
740 // n&1 == 0: use cos_r, n&1 == 1: use sin_r
741 // (n+1)&2 == 0: positive, (n+1)&2 == 2: negative
742 int32x4_t n_and_1 = vandq_s32(n, ones);
743 int32x4_t n_plus_1_and_2 = vandq_s32(vaddq_s32(n, ones), twos);
744
745 uint32x4_t swap_mask = vceqq_s32(n_and_1, ones);
746 float32x4_t result = vbslq_f32(swap_mask, sin_r, cos_r);
747
748 uint32x4_t neg_mask = vceqq_s32(n_plus_1_and_2, twos);
749 result = vreinterpretq_f32_u32(
750 veorq_u32(vreinterpretq_u32_f32(result),
751 vandq_u32(neg_mask, vreinterpretq_u32_f32(sign_bit))));
752
753 vst1q_f32(bVector, result);
754 bVector += 4;
755 }
756
757 for (number = quarterPoints * 4; number < num_points; number++) {
758 *bVector++ = cosf(*aVector++);
759 }
760}
761
762#endif /* LV_HAVE_NEONV8 */
763
764#ifdef LV_HAVE_RVV
765#include <riscv_vector.h>
766
767static inline void
768volk_32f_cos_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
769{
770 size_t vlmax = __riscv_vsetvlmax_e32m2();
771
772 const vfloat32m2_t c4oPi = __riscv_vfmv_v_f_f32m2(1.2732395f, vlmax);
773 const vfloat32m2_t cPio4a = __riscv_vfmv_v_f_f32m2(0.7853982f, vlmax);
774 const vfloat32m2_t cPio4b = __riscv_vfmv_v_f_f32m2(7.946627e-09f, vlmax);
775 const vfloat32m2_t cPio4c = __riscv_vfmv_v_f_f32m2(3.061617e-17f, vlmax);
776
777 const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
778 const vfloat32m2_t cf4 = __riscv_vfmv_v_f_f32m2(4.0f, vlmax);
779
780 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(0.0833333333f, vlmax);
781 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(0.0027777778f, vlmax);
782 const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(4.9603175e-05f, vlmax);
783 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.5114638e-07f, vlmax);
784
785 size_t n = num_points;
786 for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
787 vl = __riscv_vsetvl_e32m2(n);
788 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
789 vfloat32m2_t s = __riscv_vfabs(v, vl);
790 vint32m2_t q = __riscv_vfcvt_x(__riscv_vfmul(s, c4oPi, vl), vl);
791 vfloat32m2_t r = __riscv_vfcvt_f(__riscv_vadd(q, __riscv_vand(q, 1, vl), vl), vl);
792
793 s = __riscv_vfnmsac(s, cPio4a, r, vl);
794 s = __riscv_vfnmsac(s, cPio4b, r, vl);
795 s = __riscv_vfnmsac(s, cPio4c, r, vl);
796
797 s = __riscv_vfmul(s, 1 / 8.0f, vl);
798 s = __riscv_vfmul(s, s, vl);
799 vfloat32m2_t t = s;
800 s = __riscv_vfmsub(s, c5, c4, vl);
801 s = __riscv_vfmadd(s, t, c3, vl);
802 s = __riscv_vfmsub(s, t, c2, vl);
803 s = __riscv_vfmadd(s, t, cf1, vl);
804 s = __riscv_vfmul(s, t, vl);
805 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
806 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
807 s = __riscv_vfmul(s, __riscv_vfsub(cf4, s, vl), vl);
808 s = __riscv_vfmul(s, 1 / 2.0f, vl);
809
810 vfloat32m2_t sine =
811 __riscv_vfsqrt(__riscv_vfmul(__riscv_vfrsub(s, 2.0f, vl), s, vl), vl);
812 vfloat32m2_t cosine = __riscv_vfsub(cf1, s, vl);
813
814 vbool16_t m1 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 1, vl), 2, vl), 0, vl);
815 vbool16_t m2 = __riscv_vmsne(__riscv_vand(__riscv_vadd(q, 2, vl), 4, vl), 0, vl);
816
817 cosine = __riscv_vmerge(cosine, sine, m1, vl);
818 cosine = __riscv_vfneg_mu(m2, cosine, cosine, vl);
819
820 __riscv_vse32(bVector, cosine, vl);
821 }
822}
823#endif /*LV_HAVE_RVV*/
824
825#endif /* INCLUDED_volk_32f_cos_32f_u_H */