Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014, 2019 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H

#include <volk/volk_complex.h>

72static inline void calculate_scaled_distances(float* target,
73 const lv_32fc_t symbol,
74 const lv_32fc_t* points,
75 const float scalar,
76 const unsigned int num_points)
77{
78 lv_32fc_t diff;
79 for (unsigned int i = 0; i < num_points; ++i) {
80 /*
81 * Calculate: |y - x|^2 * SNR_lin
82 * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
83 */
84 diff = symbol - *points++;
85 *target++ =
86 scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
87 }
88}
89
90
91#ifdef LV_HAVE_AVX2
92#include <immintrin.h>
94
95static inline void
96volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
97 const lv_32fc_t* src0,
98 const lv_32fc_t* points,
99 float scalar,
100 unsigned int num_points)
101{
102 const unsigned int num_bytes = num_points * 8;
103 __m128 xmm9, xmm10;
104 __m256 xmm4, xmm6;
105 __m256 xmm_points0, xmm_points1, xmm_result;
106
107 const unsigned int bound = num_bytes >> 6;
108
109 // load complex value into all parts of the register.
110 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
111 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
112
113 // Load scalar into all 8 parts of the register
114 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
115 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
116
117 // Set permutation constant
118 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
119
120 for (unsigned int i = 0; i < bound; ++i) {
121 xmm_points0 = _mm256_load_ps((float*)points);
122 xmm_points1 = _mm256_load_ps((float*)(points + 4));
123 points += 8;
124 __VOLK_PREFETCH(points);
125
127 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
128
129 _mm256_store_ps(target, xmm_result);
130 target += 8;
131 }
132
133 if (num_bytes >> 5 & 1) {
134 xmm_points0 = _mm256_load_ps((float*)points);
135
136 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
137
138 points += 4;
139
140 xmm6 = _mm256_mul_ps(xmm4, xmm4);
141
142 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
143 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
144
145 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
146
147 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
148 _mm_store_ps(target, xmm9);
149 target += 4;
150 }
151
152 if (num_bytes >> 4 & 1) {
153 xmm9 = _mm_load_ps((float*)points);
154
155 xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
156
157 points += 2;
158
159 xmm9 = _mm_mul_ps(xmm10, xmm10);
160
161 xmm10 = _mm_hadd_ps(xmm9, xmm9);
162
163 xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
164
165 _mm_storeh_pi((__m64*)target, xmm10);
166 target += 2;
167 }
168
169 calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
170}
171
172#endif /*LV_HAVE_AVX2*/
173
174
175#ifdef LV_HAVE_AVX
176#include <immintrin.h>
178
179static inline void
181 const lv_32fc_t* src0,
182 const lv_32fc_t* points,
183 float scalar,
184 unsigned int num_points)
185{
186 const int eightsPoints = num_points / 8;
187 const int remainder = num_points - 8 * eightsPoints;
188
189 __m256 xmm_points0, xmm_points1, xmm_result;
190
191 // load complex value into all parts of the register.
192 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
193
194 // Load scalar into all 8 parts of the register
195 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
196
197 for (int i = 0; i < eightsPoints; ++i) {
198 xmm_points0 = _mm256_load_ps((float*)points);
199 xmm_points1 = _mm256_load_ps((float*)(points + 4));
200 points += 8;
201
202 xmm_result = _mm256_scaled_norm_dist_ps(
203 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
204
205 _mm256_store_ps(target, xmm_result);
206 target += 8;
207 }
208
209 const lv_32fc_t symbol = *src0;
210 calculate_scaled_distances(target, symbol, points, scalar, remainder);
211}
212
213#endif /* LV_HAVE_AVX */
214
215
216#ifdef LV_HAVE_SSE3
217#include <pmmintrin.h>
219
220static inline void
222 const lv_32fc_t* src0,
223 const lv_32fc_t* points,
224 float scalar,
225 unsigned int num_points)
226{
227 __m128 xmm_points0, xmm_points1, xmm_result;
228
229 /*
230 * First do 4 values in every loop iteration.
231 * There may be up to 3 values left.
232 * leftovers0 indicates if at least 2 more are available for SSE execution.
233 * leftovers1 indicates if there is a single element left.
234 */
235 const int quarterPoints = num_points / 4;
236 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
237 const int leftovers1 = num_points % 2;
238
239 // load complex value into both parts of the register.
240 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
241
242 // Load scalar into all 4 parts of the register
243 const __m128 xmm_scalar = _mm_load1_ps(&scalar);
244
245 for (int i = 0; i < quarterPoints; ++i) {
246 xmm_points0 = _mm_load_ps((float*)points);
247 xmm_points1 = _mm_load_ps((float*)(points + 2));
248 points += 4;
249 __VOLK_PREFETCH(points);
250 // calculate distances
251 xmm_result = _mm_scaled_norm_dist_ps_sse3(
252 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
253
254 _mm_store_ps(target, xmm_result);
255 target += 4;
256 }
257
258 for (int i = 0; i < leftovers0; ++i) {
259 xmm_points0 = _mm_load_ps((float*)points);
260 points += 2;
261
262 xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
263 xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
264 xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
265 xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
266
267 _mm_storeh_pi((__m64*)target, xmm_result);
268 target += 2;
269 }
270
271 calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
272}
273
274#endif /*LV_HAVE_SSE3*/
275
276#ifdef LV_HAVE_SSE
278#include <xmmintrin.h>
279static inline void
281 const lv_32fc_t* src0,
282 const lv_32fc_t* points,
283 float scalar,
284 unsigned int num_points)
285{
286 const __m128 xmm_scalar = _mm_set1_ps(scalar);
287 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
288
289 for (unsigned i = 0; i < num_points / 4; ++i) {
290 __m128 xmm_points0 = _mm_load_ps((float*)points);
291 __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
292 points += 4;
293 __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
294 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
295 _mm_store_ps((float*)target, xmm_result);
296 target += 4;
297 }
298
299 calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
300}
301#endif // LV_HAVE_SSE
302
303#ifdef LV_HAVE_GENERIC
304static inline void
306 const lv_32fc_t* src0,
307 const lv_32fc_t* points,
308 float scalar,
309 unsigned int num_points)
310{
311 const lv_32fc_t symbol = *src0;
312 calculate_scaled_distances(target, symbol, points, scalar, num_points);
313}
314
315#endif /*LV_HAVE_GENERIC*/
316
317#ifdef LV_HAVE_NEON
318#include <arm_neon.h>
319
320static inline void
322 const lv_32fc_t* src0,
323 const lv_32fc_t* points,
324 float scalar,
325 unsigned int num_points)
326{
327 unsigned int number = 0;
328 const unsigned int quarterPoints = num_points / 4;
329
330 // Load the reference symbol real and imag into vectors
331 const float32x4_t symbolReal = vdupq_n_f32(lv_creal(*src0));
332 const float32x4_t symbolImag = vdupq_n_f32(lv_cimag(*src0));
333 const float32x4_t vScalar = vdupq_n_f32(scalar);
334
335 for (; number < quarterPoints; number++) {
336 // Load 4 complex points (8 floats) and deinterleave
337 float32x4x2_t pts = vld2q_f32((const float*)points);
338 points += 4;
339
340 // Calculate difference
341 float32x4_t diffReal = vsubq_f32(symbolReal, pts.val[0]);
342 float32x4_t diffImag = vsubq_f32(symbolImag, pts.val[1]);
343
344 // Calculate squared magnitude and scale
345 float32x4_t result = vmulq_f32(diffReal, diffReal);
346 result = vmlaq_f32(result, diffImag, diffImag);
347 result = vmulq_f32(result, vScalar);
348
349 vst1q_f32(target, result);
350 target += 4;
351 }
352
353 // Handle remaining points
355 target, *src0, points, scalar, num_points - quarterPoints * 4);
356}
357
358#endif /*LV_HAVE_NEON*/
359
360#ifdef LV_HAVE_NEONV8
361#include <arm_neon.h>
362
363static inline void
364volk_32fc_x2_s32f_square_dist_scalar_mult_32f_neonv8(float* target,
365 const lv_32fc_t* src0,
366 const lv_32fc_t* points,
367 float scalar,
368 unsigned int num_points)
369{
370 unsigned int number = 0;
371 const unsigned int eighthPoints = num_points / 8;
372
373 // Load the reference symbol real and imag into vectors
374 const float32x4_t symbolReal = vdupq_n_f32(lv_creal(*src0));
375 const float32x4_t symbolImag = vdupq_n_f32(lv_cimag(*src0));
376 const float32x4_t vScalar = vdupq_n_f32(scalar);
377
378 for (; number < eighthPoints; number++) {
379 __VOLK_PREFETCH(points + 16);
380
381 // Load 8 complex points (16 floats) and deinterleave
382 float32x4x2_t pts0 = vld2q_f32((const float*)points);
383 float32x4x2_t pts1 = vld2q_f32((const float*)(points + 4));
384 points += 8;
385
386 // Calculate difference
387 float32x4_t diffReal0 = vsubq_f32(symbolReal, pts0.val[0]);
388 float32x4_t diffImag0 = vsubq_f32(symbolImag, pts0.val[1]);
389 float32x4_t diffReal1 = vsubq_f32(symbolReal, pts1.val[0]);
390 float32x4_t diffImag1 = vsubq_f32(symbolImag, pts1.val[1]);
391
392 // Calculate squared magnitude: real^2 + imag^2 using FMA
393 float32x4_t result0 =
394 vfmaq_f32(vmulq_f32(diffReal0, diffReal0), diffImag0, diffImag0);
395 float32x4_t result1 =
396 vfmaq_f32(vmulq_f32(diffReal1, diffReal1), diffImag1, diffImag1);
397
398 // Scale
399 result0 = vmulq_f32(result0, vScalar);
400 result1 = vmulq_f32(result1, vScalar);
401
402 vst1q_f32(target, result0);
403 vst1q_f32(target + 4, result1);
404 target += 8;
405 }
406
407 // Handle remaining points
408 const unsigned int remaining = num_points - eighthPoints * 8;
409 calculate_scaled_distances(target, *src0, points, scalar, remaining);
410}
411
412#endif /*LV_HAVE_NEONV8*/
413
#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/

#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H

#include <volk/volk_complex.h>
420
421
422#ifdef LV_HAVE_AVX2
423#include <immintrin.h>
425
426static inline void
427volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
428 const lv_32fc_t* src0,
429 const lv_32fc_t* points,
430 float scalar,
431 unsigned int num_points)
432{
433 const unsigned int num_bytes = num_points * 8;
434 __m128 xmm9, xmm10;
435 __m256 xmm4, xmm6;
436 __m256 xmm_points0, xmm_points1, xmm_result;
437
438 const unsigned int bound = num_bytes >> 6;
439
440 // load complex value into all parts of the register.
441 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
442 const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
443
444 // Load scalar into all 8 parts of the register
445 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
446 const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
447
448 // Set permutation constant
449 const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
450
451 for (unsigned int i = 0; i < bound; ++i) {
452 xmm_points0 = _mm256_loadu_ps((float*)points);
453 xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
454 points += 8;
455 __VOLK_PREFETCH(points);
456
458 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
459
460 _mm256_storeu_ps(target, xmm_result);
461 target += 8;
462 }
463
464 if (num_bytes >> 5 & 1) {
465 xmm_points0 = _mm256_loadu_ps((float*)points);
466
467 xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
468
469 points += 4;
470
471 xmm6 = _mm256_mul_ps(xmm4, xmm4);
472
473 xmm4 = _mm256_hadd_ps(xmm6, xmm6);
474 xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
475
476 xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
477
478 xmm9 = _mm256_extractf128_ps(xmm_result, 1);
479 _mm_storeu_ps(target, xmm9);
480 target += 4;
481 }
482
483 if (num_bytes >> 4 & 1) {
484 xmm9 = _mm_loadu_ps((float*)points);
485
486 xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
487
488 points += 2;
489
490 xmm9 = _mm_mul_ps(xmm10, xmm10);
491
492 xmm10 = _mm_hadd_ps(xmm9, xmm9);
493
494 xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
495
496 _mm_storeh_pi((__m64*)target, xmm10);
497 target += 2;
498 }
499
500 calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
501}
502
503#endif /*LV_HAVE_AVX2*/
504
505
506#ifdef LV_HAVE_AVX
507#include <immintrin.h>
509
510static inline void
512 const lv_32fc_t* src0,
513 const lv_32fc_t* points,
514 float scalar,
515 unsigned int num_points)
516{
517 const int eightsPoints = num_points / 8;
518 const int remainder = num_points - 8 * eightsPoints;
519
520 __m256 xmm_points0, xmm_points1, xmm_result;
521
522 // load complex value into all parts of the register.
523 const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
524
525 // Load scalar into all 8 parts of the register
526 const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
527
528 for (int i = 0; i < eightsPoints; ++i) {
529 xmm_points0 = _mm256_loadu_ps((float*)points);
530 xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
531 points += 8;
532
533 xmm_result = _mm256_scaled_norm_dist_ps(
534 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
535
536 _mm256_storeu_ps(target, xmm_result);
537 target += 8;
538 }
539
540 const lv_32fc_t symbol = *src0;
541 calculate_scaled_distances(target, symbol, points, scalar, remainder);
542}
543
544#endif /* LV_HAVE_AVX */
545
546
547#ifdef LV_HAVE_SSE3
548#include <pmmintrin.h>
550
551static inline void
553 const lv_32fc_t* src0,
554 const lv_32fc_t* points,
555 float scalar,
556 unsigned int num_points)
557{
558 __m128 xmm_points0, xmm_points1, xmm_result;
559
560 /*
561 * First do 4 values in every loop iteration.
562 * There may be up to 3 values left.
563 * leftovers0 indicates if at least 2 more are available for SSE execution.
564 * leftovers1 indicates if there is a single element left.
565 */
566 const int quarterPoints = num_points / 4;
567 const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
568 const int leftovers1 = num_points % 2;
569
570 // load complex value into both parts of the register.
571 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
572
573 // Load scalar into all 4 parts of the register
574 const __m128 xmm_scalar = _mm_load1_ps(&scalar);
575
576 for (int i = 0; i < quarterPoints; ++i) {
577 xmm_points0 = _mm_loadu_ps((float*)points);
578 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
579 points += 4;
580 __VOLK_PREFETCH(points);
581 // calculate distances
582 xmm_result = _mm_scaled_norm_dist_ps_sse3(
583 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
584
585 _mm_storeu_ps(target, xmm_result);
586 target += 4;
587 }
588
589 for (int i = 0; i < leftovers0; ++i) {
590 xmm_points0 = _mm_loadu_ps((float*)points);
591 points += 2;
592
593 xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
594 xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
595 xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
596 xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
597
598 _mm_storeh_pi((__m64*)target, xmm_result);
599 target += 2;
600 }
601
602 calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
603}
604
605#endif /*LV_HAVE_SSE3*/
606
607#ifdef LV_HAVE_SSE
609#include <xmmintrin.h>
610static inline void
612 const lv_32fc_t* src0,
613 const lv_32fc_t* points,
614 float scalar,
615 unsigned int num_points)
616{
617 const __m128 xmm_scalar = _mm_set1_ps(scalar);
618 const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
619
620 for (unsigned i = 0; i < num_points / 4; ++i) {
621 __m128 xmm_points0 = _mm_loadu_ps((float*)points);
622 __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
623 points += 4;
624 __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
625 xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
626 _mm_storeu_ps((float*)target, xmm_result);
627 target += 4;
628 }
629
630 calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
631}
632#endif // LV_HAVE_SSE
633
634#ifdef LV_HAVE_RVV
635#include <riscv_vector.h>
636
637static inline void
638volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvv(float* target,
639 const lv_32fc_t* src0,
640 const lv_32fc_t* points,
641 float scalar,
642 unsigned int num_points)
643{
644 size_t vlmax = __riscv_vsetvlmax_e32m4();
645 vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
646 vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
647 vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax);
648
649 size_t n = num_points;
650 for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
651 vl = __riscv_vsetvl_e32m4(n);
652 vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)points, vl);
653 vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
654 vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
655 vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
656 vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
657 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
658 __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl);
659 }
660}
661#endif /*LV_HAVE_RVV*/
662
663#ifdef LV_HAVE_RVVSEG
664#include <riscv_vector.h>
665
666static inline void
667volk_32fc_x2_s32f_square_dist_scalar_mult_32f_rvvseg(float* target,
668 const lv_32fc_t* src0,
669 const lv_32fc_t* points,
670 float scalar,
671 unsigned int num_points)
672{
673 size_t vlmax = __riscv_vsetvlmax_e32m4();
674 vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
675 vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);
676 vfloat32m4_t vscale = __riscv_vfmv_v_f_f32m4(scalar, vlmax);
677
678 size_t n = num_points;
679 for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
680 vl = __riscv_vsetvl_e32m4(n);
681 vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)points, vl);
682 vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0);
683 vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1);
684 vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
685 vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
686 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
687 __riscv_vse32(target, __riscv_vfmul(v, vscale, vl), vl);
688 }
689}
690#endif /*LV_HAVE_RVVSEG*/
691
#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/