Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_x2_divide_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2016 Free Software Foundation, Inc.
4 * Copyright 2025 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
97
98#ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
99#define INCLUDED_volk_32fc_x2_divide_32fc_u_H
100
101#include <float.h>
102#include <inttypes.h>
103#include <volk/volk_complex.h>
104
105
106#ifdef LV_HAVE_GENERIC
107
108static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
109 const lv_32fc_t* aVector,
110 const lv_32fc_t* bVector,
111 unsigned int num_points)
112{
113 lv_32fc_t* cPtr = cVector;
114 const lv_32fc_t* aPtr = aVector;
115 const lv_32fc_t* bPtr = bVector;
116
117 for (unsigned int number = 0; number < num_points; number++) {
118 *cPtr++ = (*aPtr++) / (*bPtr++);
119 }
120}
121#endif /* LV_HAVE_GENERIC */
122
123
124#ifdef LV_HAVE_SSE3
125#include <pmmintrin.h>
127
/*!
 * \brief Element-wise complex division c = a / b (unaligned SSE3).
 *
 * \param cVector           output, cVector[i] = numerator[i] / denominator[i]
 * \param numeratorVector   numerator input samples
 * \param denumeratorVector denominator input samples
 * \param num_points        number of complex samples to process
 */
static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
                                                   const lv_32fc_t* numeratorVector,
                                                   const lv_32fc_t* denumeratorVector,
                                                   unsigned int num_points)
{
    /*
     * we'll do the "classical"
     *  a     a b*
     * --- = -------
     *  b     |b|^2
     */
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 num01, num23, den01, den23, norm, result;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    for (; number < quarterPoints; number++) {
        num01 = _mm_loadu_ps((float*)a); // first pair
        den01 = _mm_loadu_ps((float*)b); // first pair
        // a * conj(b) via VOLK SSE3 helper intrinsic
        num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
        a += 2;
        b += 2;

        num23 = _mm_loadu_ps((float*)a); // second pair
        den23 = _mm_loadu_ps((float*)b); // second pair
        num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
        a += 2;
        b += 2;

        // |b|^2 for all four samples packed into one register (VOLK helper)
        norm = _mm_magnitudesquared_ps_sse3(den01, den23);
        // duplicate each |b|^2 so it lines up with the re/im lanes
        den01 = _mm_unpacklo_ps(norm, norm);
        den23 = _mm_unpackhi_ps(norm, norm);

        result = _mm_div_ps(num01, den01);
        _mm_storeu_ps((float*)c, result); // Store the results back into the C container
        c += 2;
        result = _mm_div_ps(num23, den23);
        _mm_storeu_ps((float*)c, result); // Store the results back into the C container
        c += 2;
    }

    // scalar tail for the remaining num_points % 4 samples
    number *= 4;
    for (; number < num_points; number++) {
        *c = (*a) / (*b);
        a++;
        b++;
        c++;
    }
}
180#endif /* LV_HAVE_SSE3 */
181
182
183#ifdef LV_HAVE_AVX
184#include <immintrin.h>
186
/*!
 * \brief Element-wise complex division c = a / b (unaligned AVX).
 *
 * \param cVector           output, cVector[i] = numerator[i] / denominator[i]
 * \param numeratorVector   numerator input samples
 * \param denumeratorVector denominator input samples
 * \param num_points        number of complex samples to process
 */
static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* numeratorVector,
                                                  const lv_32fc_t* denumeratorVector,
                                                  unsigned int num_points)
{
    /*
     * we'll do the "classical"
     *  a     a b*
     * --- = -------
     *  b     |b|^2
     */
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    for (; number < quarterPoints; number++) {
        num = _mm256_loadu_ps(
            (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
        denum = _mm256_loadu_ps(
            (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
        mul_conj = _mm256_complexconjugatemul_ps(num, denum); // a * conj(b)
        sq = _mm256_mul_ps(denum, denum); // Square the values
        mag_sq_un = _mm256_hadd_ps(
            sq, sq); // obtain the actual squared magnitude, although out of order
        // permute 0xd8 reorders each lane so every |b|^2 sits in both
        // the real and imaginary slot of its sample
        mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
        // best guide I found on using these functions:
        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
        div = _mm256_div_ps(mul_conj, mag_sq);

        _mm256_storeu_ps((float*)c, div); // Store the results back into the C container

        a += 4;
        b += 4;
        c += 4;
    }

    // scalar tail for the remaining num_points % 4 samples
    number = quarterPoints * 4;

    for (; number < num_points; number++) {
        *c++ = (*a++) / (*b++);
    }
}
233#endif /* LV_HAVE_AVX */
234
235#if LV_HAVE_AVX2 && LV_HAVE_FMA
236#include <immintrin.h>
238#include <volk/volk_complex.h>
239
/*!
 * \brief Element-wise complex division c = a / b (unaligned AVX2 + FMA).
 *
 * Computes a * conj(b) / |b|^2 for eight complex samples per iteration.
 *
 * \param cVector           output, cVector[i] = numerator[i] / denominator[i]
 * \param numeratorVector   numerator input samples
 * \param denumeratorVector denominator input samples
 * \param num_points        number of complex samples to process
 */
static inline void volk_32fc_x2_divide_32fc_u_avx2_fma(lv_32fc_t* cVector,
                                                       const lv_32fc_t* numeratorVector,
                                                       const lv_32fc_t* denumeratorVector,
                                                       unsigned int num_points)
{
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    const unsigned int eighthPoints = num_points / 8;

    __m256 num01, num23, denum01, denum23, complex_result, result0, result1;

    for (unsigned int number = 0; number < eighthPoints; number++) {
        // first group of four complex samples: a * conj(b)
        num01 = _mm256_loadu_ps((float*)a);
        denum01 = _mm256_loadu_ps((float*)b);
        num01 = _mm256_complexconjugatemul_ps(num01, denum01);
        a += 4;
        b += 4;

        // second group of four complex samples: a * conj(b)
        num23 = _mm256_loadu_ps((float*)a);
        denum23 = _mm256_loadu_ps((float*)b);
        num23 = _mm256_complexconjugatemul_ps(num23, denum23);
        a += 4;
        b += 4;

        // pairwise-add the squared components: |b|^2 of all eight samples
        complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
                                        _mm256_mul_ps(denum23, denum23));

        // duplicate each |b|^2 into the matching re/im lanes of its group
        denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
        denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);

        result0 = _mm256_div_ps(num01, denum01);
        result1 = _mm256_div_ps(num23, denum23);

        _mm256_storeu_ps((float*)c, result0);
        c += 4;
        _mm256_storeu_ps((float*)c, result1);
        c += 4;
    }

    // scalar fallback for the remaining num_points % 8 samples
    volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eighthPoints * 8);
}
283#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
284
285#ifdef LV_HAVE_AVX512F
286#include <immintrin.h>
288#include <volk/volk_complex.h>
289
/*!
 * \brief Element-wise complex division c = a / b (unaligned AVX-512F).
 *
 * Processes sixteen complex samples per iteration as a * conj(b) / |b|^2.
 *
 * \param cVector           output, cVector[i] = numerator[i] / denominator[i]
 * \param numeratorVector   numerator input samples
 * \param denumeratorVector denominator input samples
 * \param num_points        number of complex samples to process
 */
static inline void volk_32fc_x2_divide_32fc_u_avx512(lv_32fc_t* cVector,
                                                     const lv_32fc_t* numeratorVector,
                                                     const lv_32fc_t* denumeratorVector,
                                                     unsigned int num_points)
{
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    const unsigned int sixteenthPoints = num_points / 16;

    __m512 num01, num23, denum01, denum23;
    __m512 mag_sq01_shuf, mag_sq23_shuf, mag_sq01, mag_sq23;
    __m512 result0, result1;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        num01 = _mm512_loadu_ps((float*)a); // eight complex samples
        denum01 = _mm512_loadu_ps((float*)b);
        num01 = _mm512_complexconjugatemul_ps(num01, denum01); // a * conj(b)
        a += 8;
        b += 8;

        num23 = _mm512_loadu_ps((float*)a); // next eight complex samples
        denum23 = _mm512_loadu_ps((float*)b);
        num23 = _mm512_complexconjugatemul_ps(num23, denum23); // a * conj(b)
        a += 8;
        b += 8;

        // Compute magnitude squared for both sets.
        // Shuffle 0xb1 swaps re/im within each complex pair, so
        // x*x + swap(x)*swap(x) leaves |b|^2 in both the re and im slot.
        mag_sq01_shuf = _mm512_shuffle_ps(denum01, denum01, 0xb1);
        mag_sq01 = _mm512_add_ps(_mm512_mul_ps(denum01, denum01),
                                 _mm512_mul_ps(mag_sq01_shuf, mag_sq01_shuf));

        mag_sq23_shuf = _mm512_shuffle_ps(denum23, denum23, 0xb1);
        mag_sq23 = _mm512_add_ps(_mm512_mul_ps(denum23, denum23),
                                 _mm512_mul_ps(mag_sq23_shuf, mag_sq23_shuf));

        result0 = _mm512_div_ps(num01, mag_sq01);
        result1 = _mm512_div_ps(num23, mag_sq23);

        _mm512_storeu_ps((float*)c, result0);
        c += 8;
        _mm512_storeu_ps((float*)c, result1);
        c += 8;
    }

    // scalar fallback for the remaining num_points % 16 samples
    volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - sixteenthPoints * 16);
}
338#endif /* LV_HAVE_AVX512F */
339
340
341#endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
342
343
344#ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
345#define INCLUDED_volk_32fc_x2_divide_32fc_a_H
346
347#include <float.h>
348#include <inttypes.h>
349#include <stdio.h>
350#include <volk/volk_complex.h>
351
352#ifdef LV_HAVE_SSE3
353#include <pmmintrin.h>
355
/*!
 * \brief Element-wise complex division c = a / b (aligned SSE3).
 *
 * Same algorithm as the unaligned variant but uses aligned loads/stores,
 * so all three buffers must be 16-byte aligned.
 *
 * \param cVector           output, cVector[i] = numerator[i] / denominator[i]
 * \param numeratorVector   numerator input samples
 * \param denumeratorVector denominator input samples
 * \param num_points        number of complex samples to process
 */
static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
                                                   const lv_32fc_t* numeratorVector,
                                                   const lv_32fc_t* denumeratorVector,
                                                   unsigned int num_points)
{
    /*
     * we'll do the "classical"
     *  a     a b*
     * --- = -------
     *  b     |b|^2
     */
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 num01, num23, den01, den23, norm, result;
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    for (; number < quarterPoints; number++) {
        num01 = _mm_load_ps((float*)a); // first pair
        den01 = _mm_load_ps((float*)b); // first pair
        // a * conj(b) via VOLK SSE3 helper intrinsic
        num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
        a += 2;
        b += 2;

        num23 = _mm_load_ps((float*)a); // second pair
        den23 = _mm_load_ps((float*)b); // second pair
        num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
        a += 2;
        b += 2;

        // |b|^2 for all four samples packed into one register (VOLK helper)
        norm = _mm_magnitudesquared_ps_sse3(den01, den23);

        den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
        den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice

        result = _mm_div_ps(num01, den01);
        _mm_store_ps((float*)c, result); // Store the results back into the C container
        c += 2;
        result = _mm_div_ps(num23, den23);
        _mm_store_ps((float*)c, result); // Store the results back into the C container
        c += 2;
    }

    // scalar tail for the remaining num_points % 4 samples
    number *= 4;
    for (; number < num_points; number++) {
        *c = (*a) / (*b);
        a++;
        b++;
        c++;
    }
}
#endif /* LV_HAVE_SSE3 */
410
411#ifdef LV_HAVE_AVX
412#include <immintrin.h>
414
/*!
 * \brief Element-wise complex division c = a / b (aligned AVX).
 *
 * All three buffers must be 32-byte aligned.
 *
 * \param cVector           output, cVector[i] = numerator[i] / denominator[i]
 * \param numeratorVector   numerator input samples
 * \param denumeratorVector denominator input samples
 * \param num_points        number of complex samples to process
 */
static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* numeratorVector,
                                                  const lv_32fc_t* denumeratorVector,
                                                  unsigned int num_points)
{
    /*
     * Guide to AVX intrisics:
     * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
     *
     * we'll do the "classical"
     *  a     a b*
     * --- = -------
     *  b     |b|^2
     *
     */
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    const unsigned int eigthPoints = num_points / 8;

    __m256 num01, num23, denum01, denum23, complex_result, result0, result1;

    for (unsigned int number = 0; number < eigthPoints; number++) {
        // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
        num01 = _mm256_load_ps((float*)a);
        denum01 = _mm256_load_ps((float*)b);

        num01 = _mm256_complexconjugatemul_ps(num01, denum01); // a * conj(b)
        a += 4;
        b += 4;

        num23 = _mm256_load_ps((float*)a); // next four complex samples
        denum23 = _mm256_load_ps((float*)b);
        num23 = _mm256_complexconjugatemul_ps(num23, denum23);
        a += 4;
        b += 4;

        // pairwise-add the squared components: |b|^2 of all eight samples
        complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
                                        _mm256_mul_ps(denum23, denum23));

        // duplicate each |b|^2 into the matching re/im lanes of its group
        denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
        denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);

        result0 = _mm256_div_ps(num01, denum01);
        result1 = _mm256_div_ps(num23, denum23);

        _mm256_store_ps((float*)c, result0);
        c += 4;
        _mm256_store_ps((float*)c, result1);
        c += 4;
    }

    // scalar fallback for the remaining num_points % 8 samples
    volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eigthPoints * 8);
}
470#endif /* LV_HAVE_AVX */
471
472#if LV_HAVE_AVX2 && LV_HAVE_FMA
473#include <immintrin.h>
475#include <volk/volk_complex.h>
476
477static inline void volk_32fc_x2_divide_32fc_a_avx2_fma(lv_32fc_t* cVector,
478 const lv_32fc_t* numeratorVector,
479 const lv_32fc_t* denumeratorVector,
480 unsigned int num_points)
481{
482 lv_32fc_t* c = cVector;
483 const lv_32fc_t* a = numeratorVector;
484 const lv_32fc_t* b = denumeratorVector;
485
486 const unsigned int eighthPoints = num_points / 8;
487
488 __m256 num01, num23, denum01, denum23, complex_result, result0, result1;
489
490 for (unsigned int number = 0; number < eighthPoints; number++) {
491 num01 = _mm256_load_ps((float*)a);
492 denum01 = _mm256_load_ps((float*)b);
493 num01 = _mm256_complexconjugatemul_ps(num01, denum01);
494 a += 4;
495 b += 4;
496
497 num23 = _mm256_load_ps((float*)a);
498 denum23 = _mm256_load_ps((float*)b);
499 num23 = _mm256_complexconjugatemul_ps(num23, denum23);
500 a += 4;
501 b += 4;
502
503 complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
504 _mm256_mul_ps(denum23, denum23));
505
506 denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
507 denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
508
509 result0 = _mm256_div_ps(num01, denum01);
510 result1 = _mm256_div_ps(num23, denum23);
511
512 _mm256_store_ps((float*)c, result0);
513 c += 4;
514 _mm256_store_ps((float*)c, result1);
515 c += 4;
516 }
517
518 volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eighthPoints * 8);
519}
520#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
521
522#ifdef LV_HAVE_AVX512F
523#include <immintrin.h>
525#include <volk/volk_complex.h>
526
/*!
 * \brief Element-wise complex division c = a / b (aligned AVX-512F).
 *
 * Processes sixteen complex samples per iteration as a * conj(b) / |b|^2;
 * all three buffers must be 64-byte aligned.
 *
 * \param cVector           output, cVector[i] = numerator[i] / denominator[i]
 * \param numeratorVector   numerator input samples
 * \param denumeratorVector denominator input samples
 * \param num_points        number of complex samples to process
 */
static inline void volk_32fc_x2_divide_32fc_a_avx512(lv_32fc_t* cVector,
                                                     const lv_32fc_t* numeratorVector,
                                                     const lv_32fc_t* denumeratorVector,
                                                     unsigned int num_points)
{
    lv_32fc_t* c = cVector;
    const lv_32fc_t* a = numeratorVector;
    const lv_32fc_t* b = denumeratorVector;

    const unsigned int sixteenthPoints = num_points / 16;

    __m512 num01, num23, denum01, denum23;
    __m512 mag_sq01_shuf, mag_sq23_shuf, mag_sq01, mag_sq23;
    __m512 result0, result1;

    for (unsigned int number = 0; number < sixteenthPoints; number++) {
        num01 = _mm512_load_ps((float*)a); // eight complex samples
        denum01 = _mm512_load_ps((float*)b);
        num01 = _mm512_complexconjugatemul_ps(num01, denum01); // a * conj(b)
        a += 8;
        b += 8;

        num23 = _mm512_load_ps((float*)a); // next eight complex samples
        denum23 = _mm512_load_ps((float*)b);
        num23 = _mm512_complexconjugatemul_ps(num23, denum23); // a * conj(b)
        a += 8;
        b += 8;

        // Compute magnitude squared for both sets.
        // Shuffle 0xb1 swaps re/im within each complex pair, so
        // x*x + swap(x)*swap(x) leaves |b|^2 in both the re and im slot.
        mag_sq01_shuf = _mm512_shuffle_ps(denum01, denum01, 0xb1);
        mag_sq01 = _mm512_add_ps(_mm512_mul_ps(denum01, denum01),
                                 _mm512_mul_ps(mag_sq01_shuf, mag_sq01_shuf));

        mag_sq23_shuf = _mm512_shuffle_ps(denum23, denum23, 0xb1);
        mag_sq23 = _mm512_add_ps(_mm512_mul_ps(denum23, denum23),
                                 _mm512_mul_ps(mag_sq23_shuf, mag_sq23_shuf));

        result0 = _mm512_div_ps(num01, mag_sq01);
        result1 = _mm512_div_ps(num23, mag_sq23);

        _mm512_store_ps((float*)c, result0);
        c += 8;
        _mm512_store_ps((float*)c, result1);
        c += 8;
    }

    // scalar fallback for the remaining num_points % 16 samples
    volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - sixteenthPoints * 16);
}
575#endif /* LV_HAVE_AVX512F */
576
577
578#ifdef LV_HAVE_NEON
579#include <arm_neon.h>
580
/*!
 * \brief Element-wise complex division c = a / b (ARM NEON).
 *
 * Uses a reciprocal estimate refined by two Newton-Raphson steps for
 * 1/|b|^2, so results may differ in the last bits from exact division.
 *
 * \param cVector    output, cVector[i] = aVector[i] / bVector[i]
 * \param aVector    numerator input samples
 * \param bVector    denominator input samples
 * \param num_points number of complex samples to process
 */
static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
                                                 const lv_32fc_t* aVector,
                                                 const lv_32fc_t* bVector,
                                                 unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    float32x4x2_t aVal, bVal, cVal; // val[0] = real lanes, val[1] = imag lanes
    float32x4_t bAbs, bAbsInv;

    const unsigned int quarterPoints = num_points / 4;
    unsigned int number = 0;
    for (; number < quarterPoints; number++) {
        aVal = vld2q_f32((const float*)(aPtr)); // de-interleave re/im
        bVal = vld2q_f32((const float*)(bPtr));
        aPtr += 4;
        bPtr += 4;
        __VOLK_PREFETCH(aPtr + 4);
        __VOLK_PREFETCH(bPtr + 4);

        // |b|^2 = br*br + bi*bi
        bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
        bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);

        // reciprocal estimate of 1/|b|^2, refined twice with vrecpsq
        bAbsInv = vrecpeq_f32(bAbs);
        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
        bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));

        // real = (ar*br + ai*bi) / |b|^2
        cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
        cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
        cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);

        // imag = (ai*br - ar*bi) / |b|^2
        cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
        cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
        cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);

        vst2q_f32((float*)(cPtr), cVal); // re-interleave and store
        cPtr += 4;
    }

    // scalar tail for the remaining num_points % 4 samples
    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
626#endif /* LV_HAVE_NEON */
627
628#ifdef LV_HAVE_NEONV8
629#include <arm_neon.h>
630
/*!
 * \brief Element-wise complex division c = a / b (ARMv8 NEON).
 *
 * Unlike the ARMv7 NEON kernel, this uses the native vdivq_f32 division
 * instruction for 1/|b|^2 instead of a reciprocal estimate.
 *
 * \param cVector    output, cVector[i] = aVector[i] / bVector[i]
 * \param aVector    numerator input samples
 * \param bVector    denominator input samples
 * \param num_points number of complex samples to process
 */
static inline void volk_32fc_x2_divide_32fc_neonv8(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const lv_32fc_t* bVector,
                                                   unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;

    float32x4x2_t aVal, bVal, cVal; // val[0] = real lanes, val[1] = imag lanes
    float32x4_t bMagSq;

    const unsigned int quarterPoints = num_points / 4;
    unsigned int number = 0;

    for (; number < quarterPoints; number++) {
        aVal = vld2q_f32((const float*)(aPtr)); // de-interleave re/im
        bVal = vld2q_f32((const float*)(bPtr));
        aPtr += 4;
        bPtr += 4;
        __VOLK_PREFETCH(aPtr + 4);
        __VOLK_PREFETCH(bPtr + 4);

        /* Compute |b|^2 = br^2 + bi^2 using FMA */
        bMagSq = vfmaq_f32(vmulq_f32(bVal.val[0], bVal.val[0]), bVal.val[1], bVal.val[1]);

        /* Use ARMv8 native division for 1/|b|^2 */
        float32x4_t bMagSqInv = vdivq_f32(vdupq_n_f32(1.0f), bMagSq);

        /* real = (ar*br + ai*bi) / |b|^2 */
        cVal.val[0] =
            vfmaq_f32(vmulq_f32(aVal.val[0], bVal.val[0]), aVal.val[1], bVal.val[1]);
        cVal.val[0] = vmulq_f32(cVal.val[0], bMagSqInv);

        /* imag = (ai*br - ar*bi) / |b|^2 */
        cVal.val[1] =
            vfmsq_f32(vmulq_f32(aVal.val[1], bVal.val[0]), aVal.val[0], bVal.val[1]);
        cVal.val[1] = vmulq_f32(cVal.val[1], bMagSqInv);

        vst2q_f32((float*)(cPtr), cVal); // re-interleave and store
        cPtr += 4;
    }

    // scalar tail for the remaining num_points % 4 samples
    for (number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
678#endif /* LV_HAVE_NEONV8 */
679
680#ifdef LV_HAVE_RVV
681#include <riscv_vector.h>
682
683
/*!
 * \brief Element-wise complex division c = a / b (RISC-V Vector).
 *
 * Each complex sample is treated as one 64-bit word: the real part lives
 * in the low 32 bits, the imaginary part in the high 32 bits.
 *
 * \param cVector    output, cVector[i] = aVector[i] / bVector[i]
 * \param aVector    numerator input samples
 * \param bVector    denominator input samples
 * \param num_points number of complex samples to process
 */
static inline void volk_32fc_x2_divide_32fc_rvv(lv_32fc_t* cVector,
                                                const lv_32fc_t* aVector,
                                                const lv_32fc_t* bVector,
                                                unsigned int num_points)
{
    uint64_t* out = (uint64_t*)cVector;
    size_t n = num_points;
    // strip-mine over the input; vl = samples handled this iteration
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, out += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint64m8_t va = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl);
        vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)bVector, vl);
        // narrowing shifts split each 64-bit word into real (low 32 bits)
        // and imaginary (high 32 bits) float vectors
        vfloat32m4_t var = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 0, vl));
        vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
        vfloat32m4_t vai = __riscv_vreinterpret_f32m4(__riscv_vnsrl(va, 32, vl));
        vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
        // mul = 1 / (br*br + bi*bi) = 1 / |b|^2
        vfloat32m4_t mul = __riscv_vfrdiv(
            __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl);
        // real = (ar*br + ai*bi) / |b|^2
        vfloat32m4_t vr = __riscv_vfmul(
            __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl);
        // imag = (ai*br - ar*bi) / |b|^2
        vfloat32m4_t vi = __riscv_vfmul(
            __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl);
        vuint32m4_t vru = __riscv_vreinterpret_u32m4(vr);
        vuint32m4_t viu = __riscv_vreinterpret_u32m4(vi);
        // re-interleave: vru + viu + (2^32 - 1)*viu == vru + (viu << 32),
        // placing real in the low and imag in the high half of each word
        vuint64m8_t v =
            __riscv_vwmaccu(__riscv_vwaddu_vv(vru, viu, vl), 0xFFFFFFFF, viu, vl);
        __riscv_vse64(out, v, vl);
    }
}
712#endif /*LV_HAVE_RVV*/
713
714#ifdef LV_HAVE_RVVSEG
715#include <riscv_vector.h>
716
/*!
 * \brief Element-wise complex division c = a / b (RISC-V Vector, segment loads).
 *
 * Uses segmented load/store to de-interleave real and imaginary parts
 * instead of the bit-twiddling in the plain RVV kernel.
 *
 * \param cVector    output, cVector[i] = aVector[i] / bVector[i]
 * \param aVector    numerator input samples
 * \param bVector    denominator input samples
 * \param num_points number of complex samples to process
 */
static inline void volk_32fc_x2_divide_32fc_rvvseg(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const lv_32fc_t* bVector,
                                                   unsigned int num_points)
{
    size_t n = num_points;
    // strip-mine over the input; vl = samples handled this iteration
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        // segment loads split interleaved re/im into two register groups
        vfloat32m4x2_t va = __riscv_vlseg2e32_v_f32m4x2((const float*)aVector, vl);
        vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)bVector, vl);
        vfloat32m4_t var = __riscv_vget_f32m4(va, 0), vai = __riscv_vget_f32m4(va, 1);
        vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0), vbi = __riscv_vget_f32m4(vb, 1);
        // mul = 1 / (br*br + bi*bi) = 1 / |b|^2
        vfloat32m4_t mul = __riscv_vfrdiv(
            __riscv_vfmacc(__riscv_vfmul(vbi, vbi, vl), vbr, vbr, vl), 1.0f, vl);
        // real = (ar*br + ai*bi) / |b|^2
        vfloat32m4_t vr = __riscv_vfmul(
            __riscv_vfmacc(__riscv_vfmul(var, vbr, vl), vai, vbi, vl), mul, vl);
        // imag = (ai*br - ar*bi) / |b|^2
        vfloat32m4_t vi = __riscv_vfmul(
            __riscv_vfnmsac(__riscv_vfmul(vai, vbr, vl), var, vbi, vl), mul, vl);
        // segment store re-interleaves the quotient back into memory
        __riscv_vsseg2e32_v_f32m4x2(
            (float*)cVector, __riscv_vcreate_v_f32m4x2(vr, vi), vl);
    }
}
739
740#endif /*LV_HAVE_RVVSEG*/
741
742#endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */