/*
 * Vector Optimized Library of Kernels (VOLK) 3.3.0
 * Architecture-tuned implementations of math kernels
 * File: volk_32fc_s32fc_x2_rotator2_32fc.h
 */
/* -*- c++ -*- */
/*
 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
68#ifndef INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H
69#define INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H
70
71
72#include <math.h>
73#include <stdio.h>
74#include <stdlib.h>
75#include <volk/volk_complex.h>
76#define ROTATOR_RELOAD 512
77#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
78#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
79#define ROTATOR_RELOAD_8 (ROTATOR_RELOAD / 8)
80
81
#ifdef LV_HAVE_GENERIC

/*!
 * Scalar reference kernel: out[k] = in[k] * phase, with the running phase
 * multiplied by *phase_inc after every sample. The final phase is written
 * back to *phase so successive calls continue the rotation seamlessly.
 */
// NOTE(review): the opening signature line was lost in extraction; it is
// reconstructed from the kernel naming convention — confirm against upstream.
static inline void volk_32fc_s32fc_x2_rotator2_32fc_generic(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t* phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    unsigned int i = 0;
    int j = 0;
    // Full blocks: renormalize |*phase| back to 1 after every ROTATOR_RELOAD
    // samples to bound single-precision magnitude drift.
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= *phase_inc;
        }

        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
    // Remaining samples (fewer than ROTATOR_RELOAD).
    for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= *phase_inc;
    }
    if (i) {
        // Make sure, we normalize phase on every call!
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
}

#endif /* LV_HAVE_GENERIC */
111
112
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
// NOTE(review): reconstructed include (dropped in extraction) —
// _vmultiply_complexq_f32 is declared here; confirm against upstream.
#include <volk/volk_neon_intrinsics.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/*!
 * NEON kernel. Tracks the rotation angle in double precision with Kahan
 * (compensated) summation and recomputes the 4-lane phase vector from the
 * exact angle every ROTATOR_RELOAD samples, eliminating recursive drift.
 */
// NOTE(review): the opening signature line was lost in extraction; it is
// reconstructed from the kernel naming convention — confirm against upstream.
static inline void volk_32fc_s32fc_x2_rotator2_32fc_neon(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t* phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* outputVectorPtr = outVector;
    const lv_32fc_t* inputVectorPtr = inVector;

    // Extract angles using double precision
    const double initial_angle =
        atan2((double)lv_cimag(*phase), (double)lv_creal(*phase));
    const double delta_angle =
        atan2((double)lv_cimag(*phase_inc), (double)lv_creal(*phase_inc));

    // Kahan summation state for angle accumulation
    double angle_sum = initial_angle;
    double angle_c = 0.0; // Compensation for lost low-order bits

    // Precompute block increment
    const double block_delta = (double)ROTATOR_RELOAD * delta_angle;

    // Compute incr = phase_inc^4 from exact angle (more accurate than multiplication)
    const double delta4 = 4.0 * delta_angle;
    lv_32fc_t incr = lv_cmake((float)cos(delta4), (float)sin(delta4));

    float32x4x2_t input_vec;
    float32x4x2_t output_vec;
    lv_32fc_t phasePtr[4];

    const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
    const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
    float32x4x2_t phase_vec;

// Helper macro: wrap an angle into (-pi, pi] before calling sin/cos.
#define REDUCE_ANGLE(a)             \
    do {                            \
        (a) = fmod((a), 2.0 * M_PI); \
        if ((a) > M_PI)             \
            (a) -= 2.0 * M_PI;      \
        else if ((a) < -M_PI)       \
            (a) += 2.0 * M_PI;      \
    } while (0)

    // Initialize phase vector from exact angles
    for (unsigned int k = 0; k < 4; ++k) {
        double a = angle_sum + (double)k * delta_angle;
        REDUCE_ANGLE(a);
        phasePtr[k] = lv_cmake((float)cos(a), (float)sin(a));
    }
    phase_vec = vld2q_f32((float*)phasePtr);

    unsigned int i, j;

    // Main loop with periodic resync
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        // Inner loop: use fast SIMD multiply
        for (j = 0; j < ROTATOR_RELOAD_4; j++) {
            input_vec = vld2q_f32((float*)inputVectorPtr);
            __VOLK_PREFETCH(inputVectorPtr + 4);

            // Rotate
            output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
            // Increase phase
            phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
            // Store output
            vst2q_f32((float*)outputVectorPtr, output_vec);

            outputVectorPtr += 4;
            inputVectorPtr += 4;
        }

        // Advance angle using Kahan summation (compensated for rounding error)
        {
            double y = block_delta - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }

        // Resync: recompute phase from accumulated angle (eliminates drift)
        for (unsigned int k = 0; k < 4; ++k) {
            double a = angle_sum + (double)k * delta_angle;
            REDUCE_ANGLE(a);
            phasePtr[k] = lv_cmake((float)cos(a), (float)sin(a));
        }
        phase_vec = vld2q_f32((float*)phasePtr);
    }

    // Handle remainder (SIMD, 4 samples at a time)
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) {
        input_vec = vld2q_f32((float*)inputVectorPtr);
        __VOLK_PREFETCH(inputVectorPtr + 4);

        output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
        phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
        vst2q_f32((float*)outputVectorPtr, output_vec);

        outputVectorPtr += 4;
        inputVectorPtr += 4;

        // Update angle for remainder
        {
            double y = (4.0 * delta_angle) - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }
    }

    // Final phase output from exact angle
    {
        double a = angle_sum;
        REDUCE_ANGLE(a);
        *phase = lv_cmake((float)cos(a), (float)sin(a));
    }

    // Scalar remainder (fewer than 4 samples)
    for (i = 0; i < num_points % 4; i++) {
        *outputVectorPtr++ = *inputVectorPtr++ * (*phase);
        {
            double y = delta_angle - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }
        double a = angle_sum;
        REDUCE_ANGLE(a);
        *phase = lv_cmake((float)cos(a), (float)sin(a));
    }

#undef REDUCE_ANGLE
}

#endif /* LV_HAVE_NEON */
262
263
#ifdef LV_HAVE_AVX
#include <immintrin.h>
// NOTE(review): reconstructed include (dropped in extraction) —
// _mm256_complexmul_ps is declared here; confirm against upstream.
#include <volk/volk_avx_intrinsics.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/*!
 * AVX kernel. Tracks the rotation angle in double precision with Kahan
 * summation and resyncs the 4-lane phase vector from the exact angle every
 * ROTATOR_RELOAD samples.
 */
// NOTE(review): the opening signature line was lost in extraction; the name
// is inferred from the a-before-u ordering used by the AVX512 pair below —
// confirm against upstream.
static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx(lv_32fc_t* outVector,
                                                          const lv_32fc_t* inVector,
                                                          const lv_32fc_t* phase_inc,
                                                          lv_32fc_t* phase,
                                                          unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;

    // Extract angles using double precision
    const double initial_angle =
        atan2((double)lv_cimag(*phase), (double)lv_creal(*phase));
    const double delta_angle =
        atan2((double)lv_cimag(*phase_inc), (double)lv_creal(*phase_inc));

    // Kahan summation state for angle accumulation
    double angle_sum = initial_angle;
    double angle_c = 0.0; // Compensation for lost low-order bits

    // Precompute block increment
    const double block_delta = (double)ROTATOR_RELOAD * delta_angle;

    // Compute incr = phase_inc^4 from exact angle (more accurate than multiplication)
    const double delta4 = 4.0 * delta_angle;
    lv_32fc_t incr = lv_cmake((float)cos(delta4), (float)sin(delta4));

    __m256 aVal, phase_Val, z;
    lv_32fc_t phase_Ptr[4];

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

// Helper macro: wrap an angle into (-pi, pi] before calling sin/cos.
#define REDUCE_ANGLE(a)             \
    do {                            \
        (a) = fmod((a), 2.0 * M_PI); \
        if ((a) > M_PI)             \
            (a) -= 2.0 * M_PI;      \
        else if ((a) < -M_PI)       \
            (a) += 2.0 * M_PI;      \
    } while (0)

    // Initialize phase vector from exact angles
    for (unsigned int k = 0; k < 4; ++k) {
        double a = angle_sum + (double)k * delta_angle;
        REDUCE_ANGLE(a);
        phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
    }
    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    unsigned int i, j;

    // Main loop with periodic resync
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        // Inner loop: use fast SIMD multiply
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
            aVal = _mm256_loadu_ps((float*)aPtr);

            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }

        // Advance angle using Kahan summation (compensated for rounding error)
        {
            double y = block_delta - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }

        // Resync: recompute phase from accumulated angle (eliminates drift)
        for (unsigned int k = 0; k < 4; ++k) {
            double a = angle_sum + (double)k * delta_angle;
            REDUCE_ANGLE(a);
            phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
        }
        phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    }

    // Handle remainder (SIMD, 4 samples at a time)
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;

        // Update angle for remainder
        {
            double y = (4.0 * delta_angle) - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }
    }

    // Final phase output from exact angle
    {
        double a = angle_sum;
        REDUCE_ANGLE(a);
        *phase = lv_cmake((float)cos(a), (float)sin(a));
    }

    // Scalar remainder (fewer than 4 samples)
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * (*phase);
        {
            double y = delta_angle - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }
        double a = angle_sum;
        REDUCE_ANGLE(a);
        *phase = lv_cmake((float)cos(a), (float)sin(a));
    }

#undef REDUCE_ANGLE
}

#endif /* LV_HAVE_AVX */
414
415
#ifdef LV_HAVE_AVX
#include <immintrin.h>
// NOTE(review): reconstructed include (dropped in extraction) —
// _mm256_complexmul_ps is declared here; confirm against upstream.
#include <volk/volk_avx_intrinsics.h>

// REDUCE_ANGLE below uses M_PI; guard it in case <math.h> did not define it.
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/*!
 * AVX kernel (unaligned variant). Same angle-tracking scheme as the aligned
 * AVX kernel: double-precision Kahan-summed angle with a resync every
 * ROTATOR_RELOAD samples.
 */
// NOTE(review): the opening signature line was lost in extraction; the name
// is inferred from the a-before-u ordering used by the AVX512 pair below —
// confirm against upstream.
static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx(lv_32fc_t* outVector,
                                                          const lv_32fc_t* inVector,
                                                          const lv_32fc_t* phase_inc,
                                                          lv_32fc_t* phase,
                                                          unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;

    // Extract angles using double precision
    const double initial_angle =
        atan2((double)lv_cimag(*phase), (double)lv_creal(*phase));
    const double delta_angle =
        atan2((double)lv_cimag(*phase_inc), (double)lv_creal(*phase_inc));

    // Kahan summation state for angle accumulation
    double angle_sum = initial_angle;
    double angle_c = 0.0;

    // Precompute block increment
    const double block_delta = (double)ROTATOR_RELOAD * delta_angle;

    // Compute incr = phase_inc^4 from exact angle
    const double delta4 = 4.0 * delta_angle;
    lv_32fc_t incr = lv_cmake((float)cos(delta4), (float)sin(delta4));

    __m256 aVal, phase_Val, z;
    lv_32fc_t phase_Ptr[4];

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

// Helper macro: wrap an angle into (-pi, pi] before calling sin/cos.
#define REDUCE_ANGLE(a)             \
    do {                            \
        (a) = fmod((a), 2.0 * M_PI); \
        if ((a) > M_PI)             \
            (a) -= 2.0 * M_PI;      \
        else if ((a) < -M_PI)       \
            (a) += 2.0 * M_PI;      \
    } while (0)

    // Initialize phase vector from exact angles
    for (unsigned int k = 0; k < 4; ++k) {
        double a = angle_sum + (double)k * delta_angle;
        REDUCE_ANGLE(a);
        phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
    }
    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    unsigned int i, j;

    // Main loop with periodic resync
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
            aVal = _mm256_loadu_ps((float*)aPtr);
            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
            _mm256_storeu_ps((float*)cPtr, z);
            aPtr += 4;
            cPtr += 4;
        }

        // Advance angle using Kahan summation
        {
            double y = block_delta - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }

        // Resync: recompute phase from accumulated angle
        for (unsigned int k = 0; k < 4; ++k) {
            double a = angle_sum + (double)k * delta_angle;
            REDUCE_ANGLE(a);
            phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
        }
        phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    }

    // Handle remainder (SIMD, 4 samples at a time)
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);
        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
        _mm256_storeu_ps((float*)cPtr, z);
        aPtr += 4;
        cPtr += 4;

        {
            double y = (4.0 * delta_angle) - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }
    }

    // Final phase output
    {
        double a = angle_sum;
        REDUCE_ANGLE(a);
        *phase = lv_cmake((float)cos(a), (float)sin(a));
    }

    // Scalar remainder (fewer than 4 samples)
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * (*phase);
        {
            double y = delta_angle - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }
        double a = angle_sum;
        REDUCE_ANGLE(a);
        *phase = lv_cmake((float)cos(a), (float)sin(a));
    }

#undef REDUCE_ANGLE
}

#endif /* LV_HAVE_AVX */
552
553
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
// NOTE(review): reconstructed include (dropped in extraction) —
// _mm512_complexmul_ps is declared here; confirm against upstream.
#include <volk/volk_avx512_intrinsics.h>

// REDUCE_ANGLE below uses M_PI; guard it in case <math.h> did not define it.
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/*!
 * AVX512F kernel (aligned in/out buffers). Double-precision Kahan-summed
 * angle with a resync of the 8-lane phase vector every ROTATOR_RELOAD
 * samples.
 */
static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx512f(lv_32fc_t* outVector,
                                                              const lv_32fc_t* inVector,
                                                              const lv_32fc_t* phase_inc,
                                                              lv_32fc_t* phase,
                                                              unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;

    // Extract angles using double precision
    const double initial_angle =
        atan2((double)lv_cimag(*phase), (double)lv_creal(*phase));
    const double delta_angle =
        atan2((double)lv_cimag(*phase_inc), (double)lv_creal(*phase_inc));

    // Kahan summation state for angle accumulation
    double angle_sum = initial_angle;
    double angle_c = 0.0;

    // Precompute block increment
    const double block_delta = (double)ROTATOR_RELOAD * delta_angle;

    // Compute incr = phase_inc^8 from exact angle
    const double delta8 = 8.0 * delta_angle;
    lv_32fc_t incr = lv_cmake((float)cos(delta8), (float)sin(delta8));

    __m512 aVal, phase_Val, z;
    lv_32fc_t phase_Ptr[8];

    const __m512 inc_Val = _mm512_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

// Helper macro: wrap an angle into (-pi, pi] before calling sin/cos.
#define REDUCE_ANGLE(a)             \
    do {                            \
        (a) = fmod((a), 2.0 * M_PI); \
        if ((a) > M_PI)             \
            (a) -= 2.0 * M_PI;      \
        else if ((a) < -M_PI)       \
            (a) += 2.0 * M_PI;      \
    } while (0)

    // Initialize phase vector from exact angles.
    // phase_Ptr is a plain stack array with no 64-byte alignment guarantee,
    // so it must be read with an unaligned load (the aligned-load contract
    // of this kernel applies only to inVector/outVector).
    for (unsigned int k = 0; k < 8; ++k) {
        double a = angle_sum + (double)k * delta_angle;
        REDUCE_ANGLE(a);
        phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
    }
    phase_Val = _mm512_loadu_ps((float*)phase_Ptr);

    unsigned int i, j;

    // Main loop with periodic resync
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD_8; ++j) {
            aVal = _mm512_load_ps((float*)aPtr);
            z = _mm512_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm512_complexmul_ps(phase_Val, inc_Val);
            _mm512_store_ps((float*)cPtr, z);
            aPtr += 8;
            cPtr += 8;
        }

        // Advance angle using Kahan summation
        {
            double y = block_delta - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }

        // Resync: recompute phase from accumulated angle
        for (unsigned int k = 0; k < 8; ++k) {
            double a = angle_sum + (double)k * delta_angle;
            REDUCE_ANGLE(a);
            phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
        }
        phase_Val = _mm512_loadu_ps((float*)phase_Ptr);
    }

    // Handle remainder (SIMD, 8 samples at a time)
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 8; ++i) {
        aVal = _mm512_load_ps((float*)aPtr);
        z = _mm512_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm512_complexmul_ps(phase_Val, inc_Val);
        _mm512_store_ps((float*)cPtr, z);
        aPtr += 8;
        cPtr += 8;

        {
            double y = (8.0 * delta_angle) - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }
    }

    // Final phase output
    {
        double a = angle_sum;
        REDUCE_ANGLE(a);
        *phase = lv_cmake((float)cos(a), (float)sin(a));
    }

    // Scalar remainder (fewer than 8 samples)
    for (i = 0; i < num_points % 8; ++i) {
        *cPtr++ = *aPtr++ * (*phase);
        {
            double y = delta_angle - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }
        double a = angle_sum;
        REDUCE_ANGLE(a);
        *phase = lv_cmake((float)cos(a), (float)sin(a));
    }

#undef REDUCE_ANGLE
}

#endif /* LV_HAVE_AVX512F */
700
701
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
// NOTE(review): reconstructed include (dropped in extraction) —
// _mm512_complexmul_ps is declared here; confirm against upstream.
#include <volk/volk_avx512_intrinsics.h>

// REDUCE_ANGLE below uses M_PI; guard it in case <math.h> did not define it.
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/*!
 * AVX512F kernel (unaligned variant). Same angle-tracking scheme as the
 * aligned AVX512F kernel.
 */
static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx512f(lv_32fc_t* outVector,
                                                              const lv_32fc_t* inVector,
                                                              const lv_32fc_t* phase_inc,
                                                              lv_32fc_t* phase,
                                                              unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;

    // Extract angles using double precision
    const double initial_angle =
        atan2((double)lv_cimag(*phase), (double)lv_creal(*phase));
    const double delta_angle =
        atan2((double)lv_cimag(*phase_inc), (double)lv_creal(*phase_inc));

    // Kahan summation state for angle accumulation
    double angle_sum = initial_angle;
    double angle_c = 0.0;

    // Precompute block increment
    const double block_delta = (double)ROTATOR_RELOAD * delta_angle;

    // Compute incr = phase_inc^8 from exact angle
    const double delta8 = 8.0 * delta_angle;
    lv_32fc_t incr = lv_cmake((float)cos(delta8), (float)sin(delta8));

    __m512 aVal, phase_Val, z;
    lv_32fc_t phase_Ptr[8];

    const __m512 inc_Val = _mm512_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

// Helper macro: wrap an angle into (-pi, pi] before calling sin/cos.
#define REDUCE_ANGLE(a)             \
    do {                            \
        (a) = fmod((a), 2.0 * M_PI); \
        if ((a) > M_PI)             \
            (a) -= 2.0 * M_PI;      \
        else if ((a) < -M_PI)       \
            (a) += 2.0 * M_PI;      \
    } while (0)

    // Initialize phase vector from exact angles
    for (unsigned int k = 0; k < 8; ++k) {
        double a = angle_sum + (double)k * delta_angle;
        REDUCE_ANGLE(a);
        phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
    }
    phase_Val = _mm512_loadu_ps((float*)phase_Ptr);

    unsigned int i, j;

    // Main loop with periodic resync
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD_8; ++j) {
            aVal = _mm512_loadu_ps((float*)aPtr);
            z = _mm512_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm512_complexmul_ps(phase_Val, inc_Val);
            _mm512_storeu_ps((float*)cPtr, z);
            aPtr += 8;
            cPtr += 8;
        }

        // Advance angle using Kahan summation
        {
            double y = block_delta - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }

        // Resync: recompute phase from accumulated angle
        for (unsigned int k = 0; k < 8; ++k) {
            double a = angle_sum + (double)k * delta_angle;
            REDUCE_ANGLE(a);
            phase_Ptr[k] = lv_cmake((float)cos(a), (float)sin(a));
        }
        phase_Val = _mm512_loadu_ps((float*)phase_Ptr);
    }

    // Handle remainder (SIMD, 8 samples at a time)
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 8; ++i) {
        aVal = _mm512_loadu_ps((float*)aPtr);
        z = _mm512_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm512_complexmul_ps(phase_Val, inc_Val);
        _mm512_storeu_ps((float*)cPtr, z);
        aPtr += 8;
        cPtr += 8;

        {
            double y = (8.0 * delta_angle) - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }
    }

    // Final phase output
    {
        double a = angle_sum;
        REDUCE_ANGLE(a);
        *phase = lv_cmake((float)cos(a), (float)sin(a));
    }

    // Scalar remainder (fewer than 8 samples)
    for (i = 0; i < num_points % 8; ++i) {
        *cPtr++ = *aPtr++ * (*phase);
        {
            double y = delta_angle - angle_c;
            double t = angle_sum + y;
            angle_c = (t - angle_sum) - y;
            angle_sum = t;
        }
        double a = angle_sum;
        REDUCE_ANGLE(a);
        *phase = lv_cmake((float)cos(a), (float)sin(a));
    }

#undef REDUCE_ANGLE
}

#endif /* LV_HAVE_AVX512F */
834
835
/* Note on the RVV implementation:
 * The complex multiply was expanded, because we don't care about the corner cases.
 * Otherwise, without -ffast-math, the compiler would insert function calls,
 * which invalidates all vector registers and spills them on each loop iteration. */
840
#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

/*!
 * RVV kernel. Seeds a vector of per-lane phases phase*inc^k, advances all
 * lanes by inc^vlmax per iteration, renormalizes the lane magnitudes every
 * ROTATOR_RELOAD samples, and writes the continuation phase back to *phase.
 */
static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvv(lv_32fc_t* outVector,
                                                        const lv_32fc_t* inVector,
                                                        const lv_32fc_t* phase_inc,
                                                        lv_32fc_t* phase,
                                                        unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();
    vlmax = vlmax < ROTATOR_RELOAD ? vlmax : ROTATOR_RELOAD;

    // Build the lane phases: lane k holds phase * phase_inc^k, using
    // explicitly expanded complex multiplies (see the note above).
    lv_32fc_t inc = 1.0f;
    vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr;
    for (size_t i = 0; i < vlmax; ++i) {
        lv_32fc_t ph =
            lv_cmake(lv_creal(*phase) * lv_creal(inc) - lv_cimag(*phase) * lv_cimag(inc),
                     lv_creal(*phase) * lv_cimag(inc) + lv_cimag(*phase) * lv_creal(inc));
        phr = __riscv_vfslide1down(phr, lv_creal(ph), vlmax);
        phi = __riscv_vfslide1down(phi, lv_cimag(ph), vlmax);
        inc = lv_cmake(
            lv_creal(*phase_inc) * lv_creal(inc) - lv_cimag(*phase_inc) * lv_cimag(inc),
            lv_creal(*phase_inc) * lv_cimag(inc) + lv_cimag(*phase_inc) * lv_creal(inc));
    }
    // After the loop inc == phase_inc^vlmax: the per-iteration lane step.
    vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(lv_creal(inc), vlmax);
    vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(lv_cimag(inc), vlmax);

    size_t vl = 0;
    if (num_points > 0)
        while (1) {
            // One ROTATOR_RELOAD chunk, then renormalize the lane phases.
            size_t n = num_points < ROTATOR_RELOAD ? num_points : ROTATOR_RELOAD;
            num_points -= n;

            for (; n > 0; n -= vl, inVector += vl, outVector += vl) {
                // vl<vlmax can only happen on the last iteration of the loops
                vl = __riscv_vsetvl_e32m2(n < vlmax ? n : vlmax);

                // Deinterleave re/im via 64-bit loads + narrowing shifts.
                vuint64m4_t va = __riscv_vle64_v_u64m4((const uint64_t*)inVector, vl);
                vfloat32m2_t var = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 0, vl));
                vfloat32m2_t vai = __riscv_vreinterpret_f32m2(__riscv_vnsrl(va, 32, vl));

                // Expanded complex multiply: out = in * lane_phase.
                vfloat32m2_t vr =
                    __riscv_vfnmsac(__riscv_vfmul(var, phr, vl), vai, phi, vl);
                vfloat32m2_t vi =
                    __riscv_vfmacc(__riscv_vfmul(var, phi, vl), vai, phr, vl);

                // Re-interleave re/im into 64-bit elements and store.
                vuint32m2_t vru = __riscv_vreinterpret_u32m2(vr);
                vuint32m2_t viu = __riscv_vreinterpret_u32m2(vi);
                vuint64m4_t res =
                    __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
                __riscv_vse64((uint64_t*)outVector, res, vl);

                // Advance every lane phase by phase_inc^vlmax.
                vfloat32m2_t tmp = phr;
                phr = __riscv_vfnmsac(__riscv_vfmul(tmp, incr, vl), phi, inci, vl);
                phi = __riscv_vfmacc(__riscv_vfmul(tmp, inci, vl), phi, incr, vl);
            }

            if (num_points <= 0)
                break;

            // normalize
            vfloat32m2_t scale =
                __riscv_vfmacc(__riscv_vfmul(phr, phr, vl), phi, phi, vl);
            scale = __riscv_vfsqrt(scale, vl);
            phr = __riscv_vfdiv(phr, scale, vl);
            phi = __riscv_vfdiv(phi, scale, vl);
        }

    // Lane 0 holds the phase vlmax steps ahead of the true continuation
    // phase minus whatever the last partial vl consumed; back it up.
    lv_32fc_t ph = lv_cmake(__riscv_vfmv_f(phr), __riscv_vfmv_f(phi));
    for (size_t i = 0; i < vlmax - vl; ++i) {
        ph /= *phase_inc; // we're going backwards
    }
    *phase = ph * 1.0f / hypotf(lv_creal(ph), lv_cimag(ph));
}
#endif /*LV_HAVE_RVV*/
916
917#ifdef LV_HAVE_RVVSEG
918#include <riscv_vector.h>
919
920static inline void volk_32fc_s32fc_x2_rotator2_32fc_rvvseg(lv_32fc_t* outVector,
921 const lv_32fc_t* inVector,
922 const lv_32fc_t* phase_inc,
923 lv_32fc_t* phase,
924 unsigned int num_points)
925{
926 size_t vlmax = __riscv_vsetvlmax_e32m2();
927 vlmax = vlmax < ROTATOR_RELOAD ? vlmax : ROTATOR_RELOAD;
928
929 lv_32fc_t inc = 1.0f;
930 vfloat32m2_t phr = __riscv_vfmv_v_f_f32m2(0, vlmax), phi = phr;
931 for (size_t i = 0; i < vlmax; ++i) {
932 lv_32fc_t ph =
933 lv_cmake(lv_creal(*phase) * lv_creal(inc) - lv_cimag(*phase) * lv_cimag(inc),
934 lv_creal(*phase) * lv_cimag(inc) + lv_cimag(*phase) * lv_creal(inc));
935 phr = __riscv_vfslide1down(phr, lv_creal(ph), vlmax);
936 phi = __riscv_vfslide1down(phi, lv_cimag(ph), vlmax);
937 inc = lv_cmake(
938 lv_creal(*phase_inc) * lv_creal(inc) - lv_cimag(*phase_inc) * lv_cimag(inc),
939 lv_creal(*phase_inc) * lv_cimag(inc) + lv_cimag(*phase_inc) * lv_creal(inc));
940 }
941 vfloat32m2_t incr = __riscv_vfmv_v_f_f32m2(lv_creal(inc), vlmax);
942 vfloat32m2_t inci = __riscv_vfmv_v_f_f32m2(lv_cimag(inc), vlmax);
943
944 size_t vl = 0;
945 if (num_points > 0)
946 while (1) {
947 size_t n = num_points < ROTATOR_RELOAD ? num_points : ROTATOR_RELOAD;
948 num_points -= n;
949
950 for (; n > 0; n -= vl, inVector += vl, outVector += vl) {
951 // vl<vlmax can only happen on the last iteration of the loops
952 vl = __riscv_vsetvl_e32m2(n < vlmax ? n : vlmax);
953
954 vfloat32m2x2_t va =
955 __riscv_vlseg2e32_v_f32m2x2((const float*)inVector, vl);
956 vfloat32m2_t var = __riscv_vget_f32m2(va, 0);
957 vfloat32m2_t vai = __riscv_vget_f32m2(va, 1);
958
959 vfloat32m2_t vr =
960 __riscv_vfnmsac(__riscv_vfmul(var, phr, vl), vai, phi, vl);
961 vfloat32m2_t vi =
962 __riscv_vfmacc(__riscv_vfmul(var, phi, vl), vai, phr, vl);
963 vfloat32m2x2_t vc = __riscv_vcreate_v_f32m2x2(vr, vi);
964 __riscv_vsseg2e32_v_f32m2x2((float*)outVector, vc, vl);
965
966 vfloat32m2_t tmp = phr;
967 phr = __riscv_vfnmsac(__riscv_vfmul(tmp, incr, vl), phi, inci, vl);
968 phi = __riscv_vfmacc(__riscv_vfmul(tmp, inci, vl), phi, incr, vl);
969 }
970
971 if (num_points <= 0)
972 break;
973
974 // normalize
975 vfloat32m2_t scale =
976 __riscv_vfmacc(__riscv_vfmul(phr, phr, vl), phi, phi, vl);
977 scale = __riscv_vfsqrt(scale, vl);
978 phr = __riscv_vfdiv(phr, scale, vl);
979 phi = __riscv_vfdiv(phi, scale, vl);
980 }
981
982 lv_32fc_t ph = lv_cmake(__riscv_vfmv_f(phr), __riscv_vfmv_f(phi));
983 for (size_t i = 0; i < vlmax - vl; ++i) {
984 ph /= *phase_inc; // we're going backwards
985 }
986 *phase = ph * 1.0f / hypotf(lv_creal(ph), lv_cimag(ph));
987}
988#endif /*LV_HAVE_RVVSEG*/
989
990#endif /* INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H */