Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_s32f_magnitude_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
59
60#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
61#define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
62
63#include <inttypes.h>
64#include <math.h>
65#include <stdio.h>
66#include <volk/volk_common.h>
67
68#ifdef LV_HAVE_GENERIC
69
70static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector,
71 const lv_32fc_t* complexVector,
72 const float scalar,
73 unsigned int num_points)
74{
75 const float* complexVectorPtr = (float*)complexVector;
76 int16_t* magnitudeVectorPtr = magnitudeVector;
77 unsigned int number = 0;
78 for (number = 0; number < num_points; number++) {
79 float real = *complexVectorPtr++;
80 float imag = *complexVectorPtr++;
81 *magnitudeVectorPtr++ =
82 (int16_t)rintf(scalar * sqrtf((real * real) + (imag * imag)));
83 }
84}
85#endif /* LV_HAVE_GENERIC */
86
87#ifdef LV_HAVE_AVX2
88#include <immintrin.h>
89
90static inline void volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector,
91 const lv_32fc_t* complexVector,
92 const float scalar,
93 unsigned int num_points)
94{
95 unsigned int number = 0;
96 const unsigned int eighthPoints = num_points / 8;
97
98 const float* complexVectorPtr = (const float*)complexVector;
99 int16_t* magnitudeVectorPtr = magnitudeVector;
100
101 __m256 vScalar = _mm256_set1_ps(scalar);
102 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
103 __m256 cplxValue1, cplxValue2, result;
104 __m256i resultInt;
105 __m128i resultShort;
106
107 for (; number < eighthPoints; number++) {
108 cplxValue1 = _mm256_load_ps(complexVectorPtr);
109 complexVectorPtr += 8;
110
111 cplxValue2 = _mm256_load_ps(complexVectorPtr);
112 complexVectorPtr += 8;
113
114 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
115 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
116
117 result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
118
119 result = _mm256_sqrt_ps(result);
120
121 result = _mm256_mul_ps(result, vScalar);
122
123 resultInt = _mm256_cvtps_epi32(result);
124 resultInt = _mm256_packs_epi32(resultInt, resultInt);
125 resultInt = _mm256_permutevar8x32_epi32(
126 resultInt, idx); // permute to compensate for shuffling in hadd and packs
127 resultShort = _mm256_extracti128_si256(resultInt, 0);
128 _mm_store_si128((__m128i*)magnitudeVectorPtr, resultShort);
129 magnitudeVectorPtr += 8;
130 }
131
132 number = eighthPoints * 8;
134 magnitudeVector + number, complexVector + number, scalar, num_points - number);
135}
136#endif /* LV_HAVE_AVX2 */
137
138#ifdef LV_HAVE_SSE3
139#include <pmmintrin.h>
140
141static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector,
142 const lv_32fc_t* complexVector,
143 const float scalar,
144 unsigned int num_points)
145{
146 unsigned int number = 0;
147 const unsigned int quarterPoints = num_points / 4;
148
149 const float* complexVectorPtr = (const float*)complexVector;
150 int16_t* magnitudeVectorPtr = magnitudeVector;
151
152 __m128 vScalar = _mm_set_ps1(scalar);
153
154 __m128 cplxValue1, cplxValue2, result;
155
156 __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
157
158 for (; number < quarterPoints; number++) {
159 cplxValue1 = _mm_load_ps(complexVectorPtr);
160 complexVectorPtr += 4;
161
162 cplxValue2 = _mm_load_ps(complexVectorPtr);
163 complexVectorPtr += 4;
164
165 cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
166 cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
167
168 result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
169
170 result = _mm_sqrt_ps(result);
171
172 result = _mm_mul_ps(result, vScalar);
173
174 _mm_store_ps(floatBuffer, result);
175 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
176 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
177 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
178 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
179 }
180
181 number = quarterPoints * 4;
183 magnitudeVector + number, complexVector + number, scalar, num_points - number);
184}
185#endif /* LV_HAVE_SSE3 */
186
187
188#ifdef LV_HAVE_SSE
189#include <xmmintrin.h>
190
191static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector,
192 const lv_32fc_t* complexVector,
193 const float scalar,
194 unsigned int num_points)
195{
196 unsigned int number = 0;
197 const unsigned int quarterPoints = num_points / 4;
198
199 const float* complexVectorPtr = (const float*)complexVector;
200 int16_t* magnitudeVectorPtr = magnitudeVector;
201
202 __m128 vScalar = _mm_set_ps1(scalar);
203
204 __m128 cplxValue1, cplxValue2, result;
205 __m128 iValue, qValue;
206
207 __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
208
209 for (; number < quarterPoints; number++) {
210 cplxValue1 = _mm_load_ps(complexVectorPtr);
211 complexVectorPtr += 4;
212
213 cplxValue2 = _mm_load_ps(complexVectorPtr);
214 complexVectorPtr += 4;
215
216 // Arrange in i1i2i3i4 format
217 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
218 // Arrange in q1q2q3q4 format
219 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
220
221 __m128 iValue2 = _mm_mul_ps(iValue, iValue); // Square the I values
222 __m128 qValue2 = _mm_mul_ps(qValue, qValue); // Square the Q Values
223
224 result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values
225
226 result = _mm_sqrt_ps(result);
227
228 result = _mm_mul_ps(result, vScalar);
229
230 _mm_store_ps(floatBuffer, result);
231 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
232 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
233 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
234 *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
235 }
236
237 number = quarterPoints * 4;
239 magnitudeVector + number, complexVector + number, scalar, num_points - number);
240}
241#endif /* LV_HAVE_SSE */
242
243
244#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a_H */
245
246#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
247#define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
248
249#include <inttypes.h>
250#include <math.h>
251#include <stdio.h>
252#include <volk/volk_common.h>
253
254#ifdef LV_HAVE_AVX2
255#include <immintrin.h>
256
257static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector,
258 const lv_32fc_t* complexVector,
259 const float scalar,
260 unsigned int num_points)
261{
262 unsigned int number = 0;
263 const unsigned int eighthPoints = num_points / 8;
264
265 const float* complexVectorPtr = (const float*)complexVector;
266 int16_t* magnitudeVectorPtr = magnitudeVector;
267
268 __m256 vScalar = _mm256_set1_ps(scalar);
269 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
270 __m256 cplxValue1, cplxValue2, result;
271 __m256i resultInt;
272 __m128i resultShort;
273
274 for (; number < eighthPoints; number++) {
275 cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
276 complexVectorPtr += 8;
277
278 cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
279 complexVectorPtr += 8;
280
281 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
282 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
283
284 result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
285
286 result = _mm256_sqrt_ps(result);
287
288 result = _mm256_mul_ps(result, vScalar);
289
290 resultInt = _mm256_cvtps_epi32(result);
291 resultInt = _mm256_packs_epi32(resultInt, resultInt);
292 resultInt = _mm256_permutevar8x32_epi32(
293 resultInt, idx); // permute to compensate for shuffling in hadd and packs
294 resultShort = _mm256_extracti128_si256(resultInt, 0);
295 _mm_storeu_si128((__m128i*)magnitudeVectorPtr, resultShort);
296 magnitudeVectorPtr += 8;
297 }
298
299 number = eighthPoints * 8;
301 magnitudeVector + number, complexVector + number, scalar, num_points - number);
302}
303#endif /* LV_HAVE_AVX2 */
304
305#ifdef LV_HAVE_NEON
306#include <arm_neon.h>
307
308static inline void volk_32fc_s32f_magnitude_16i_neon(int16_t* magnitudeVector,
309 const lv_32fc_t* complexVector,
310 const float scalar,
311 unsigned int num_points)
312{
313 unsigned int number = 0;
314 const unsigned int quarter_points = num_points / 4;
315
316 const float* complexVectorPtr = (const float*)complexVector;
317 int16_t* magnitudeVectorPtr = magnitudeVector;
318 float32x4_t vScalar = vdupq_n_f32(scalar);
319
320 float32x4_t half = vdupq_n_f32(0.5f);
321
322 for (; number < quarter_points; number++) {
323 float32x4x2_t input = vld2q_f32(complexVectorPtr);
324 complexVectorPtr += 8;
325
326 float32x4_t realSquared = vmulq_f32(input.val[0], input.val[0]);
327 float32x4_t imagSquared = vmulq_f32(input.val[1], input.val[1]);
328 float32x4_t sumSquared = vaddq_f32(realSquared, imagSquared);
329
330 /* Use reciprocal square root estimate with Newton-Raphson refinement */
331 float32x4_t rsqrt = vrsqrteq_f32(sumSquared);
332 rsqrt = vmulq_f32(rsqrt, vrsqrtsq_f32(vmulq_f32(sumSquared, rsqrt), rsqrt));
333 rsqrt = vmulq_f32(rsqrt, vrsqrtsq_f32(vmulq_f32(sumSquared, rsqrt), rsqrt));
334 float32x4_t magnitude = vmulq_f32(sumSquared, rsqrt);
335
336 /* Handle zero case */
337 uint32x4_t zero_mask = vceqq_f32(sumSquared, vdupq_n_f32(0.0f));
338 magnitude = vbslq_f32(zero_mask, sumSquared, magnitude);
339
340 // Magnitude is always non-negative, so just add 0.5 for rounding
341 float32x4_t scaled = vaddq_f32(vmulq_f32(magnitude, vScalar), half);
342 int32x4_t intVal = vcvtq_s32_f32(scaled);
343 int16x4_t shortVal = vqmovn_s32(intVal);
344
345 vst1_s16(magnitudeVectorPtr, shortVal);
346 magnitudeVectorPtr += 4;
347 }
348
349 number = quarter_points * 4;
350 for (; number < num_points; number++) {
351 float real = *complexVectorPtr++;
352 float imag = *complexVectorPtr++;
353 *magnitudeVectorPtr++ =
354 (int16_t)rintf(scalar * sqrtf((real * real) + (imag * imag)));
355 }
356}
357#endif /* LV_HAVE_NEON */
358
359#ifdef LV_HAVE_NEONV8
360#include <arm_neon.h>
361
362static inline void volk_32fc_s32f_magnitude_16i_neonv8(int16_t* magnitudeVector,
363 const lv_32fc_t* complexVector,
364 const float scalar,
365 unsigned int num_points)
366{
367 unsigned int number = 0;
368 const unsigned int eighth_points = num_points / 8;
369
370 const float* complexVectorPtr = (const float*)complexVector;
371 int16_t* magnitudeVectorPtr = magnitudeVector;
372 float32x4_t vScalar = vdupq_n_f32(scalar);
373
374 for (; number < eighth_points; number++) {
375 float32x4x2_t input0 = vld2q_f32(complexVectorPtr);
376 float32x4x2_t input1 = vld2q_f32(complexVectorPtr + 8);
377 complexVectorPtr += 16;
378 __VOLK_PREFETCH(complexVectorPtr + 16);
379
380 float32x4_t sumSquared0 = vfmaq_f32(
381 vmulq_f32(input0.val[1], input0.val[1]), input0.val[0], input0.val[0]);
382 float32x4_t sumSquared1 = vfmaq_f32(
383 vmulq_f32(input1.val[1], input1.val[1]), input1.val[0], input1.val[0]);
384
385 float32x4_t magnitude0 = vsqrtq_f32(sumSquared0);
386 float32x4_t magnitude1 = vsqrtq_f32(sumSquared1);
387
388 float32x4_t scaled0 = vmulq_f32(magnitude0, vScalar);
389 float32x4_t scaled1 = vmulq_f32(magnitude1, vScalar);
390
391 int32x4_t intVal0 = vcvtnq_s32_f32(scaled0);
392 int32x4_t intVal1 = vcvtnq_s32_f32(scaled1);
393
394 int16x4_t shortVal0 = vqmovn_s32(intVal0);
395 int16x4_t shortVal1 = vqmovn_s32(intVal1);
396
397 vst1_s16(magnitudeVectorPtr, shortVal0);
398 vst1_s16(magnitudeVectorPtr + 4, shortVal1);
399 magnitudeVectorPtr += 8;
400 }
401
402 number = eighth_points * 8;
403 for (; number < num_points; number++) {
404 float real = *complexVectorPtr++;
405 float imag = *complexVectorPtr++;
406 *magnitudeVectorPtr++ =
407 (int16_t)rintf(scalar * sqrtf((real * real) + (imag * imag)));
408 }
409}
410#endif /* LV_HAVE_NEONV8 */
411
412#ifdef LV_HAVE_RVV
413#include <riscv_vector.h>
414
415static inline void volk_32fc_s32f_magnitude_16i_rvv(int16_t* magnitudeVector,
416 const lv_32fc_t* complexVector,
417 const float scalar,
418 unsigned int num_points)
419{
420 size_t n = num_points;
421 for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
422 vl = __riscv_vsetvl_e32m4(n);
423 vuint64m8_t vc = __riscv_vle64_v_u64m8((const uint64_t*)complexVector, vl);
424 vfloat32m4_t vr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 0, vl));
425 vfloat32m4_t vi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vc, 32, vl));
426 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
427 v = __riscv_vfmul(__riscv_vfsqrt(v, vl), scalar, vl);
428 __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(v, vl), vl);
429 }
430}
431#endif /*LV_HAVE_RVV*/
432
433#ifdef LV_HAVE_RVVSEG
434#include <riscv_vector.h>
435
436static inline void volk_32fc_s32f_magnitude_16i_rvvseg(int16_t* magnitudeVector,
437 const lv_32fc_t* complexVector,
438 const float scalar,
439 unsigned int num_points)
440{
441 size_t n = num_points;
442 for (size_t vl; n > 0; n -= vl, complexVector += vl, magnitudeVector += vl) {
443 vl = __riscv_vsetvl_e32m4(n);
444 vfloat32m4x2_t vc = __riscv_vlseg2e32_v_f32m4x2((const float*)complexVector, vl);
445 vfloat32m4_t vr = __riscv_vget_f32m4(vc, 0);
446 vfloat32m4_t vi = __riscv_vget_f32m4(vc, 1);
447 vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
448 v = __riscv_vfmul(__riscv_vfsqrt(v, vl), scalar, vl);
449 __riscv_vse16(magnitudeVector, __riscv_vfncvt_x(v, vl), vl);
450 }
451}
452#endif /*LV_HAVE_RVVSEG*/
453
454#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_u_H */