Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_log2_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 * Copyright 2025-2026 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
79
80#ifndef INCLUDED_volk_32f_log2_32f_a_H
81#define INCLUDED_volk_32f_log2_32f_a_H
82
83#include <inttypes.h>
84#include <math.h>
85#include <stdio.h>
86#include <stdlib.h>
87
88#ifdef LV_HAVE_GENERIC
89
90static inline void
91volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
92{
93 float* bPtr = bVector;
94 const float* aPtr = aVector;
95 unsigned int number = 0;
96
97 for (number = 0; number < num_points; number++) {
98 *bPtr++ = log2f_non_ieee(*aPtr++);
99 }
100}
101#endif /* LV_HAVE_GENERIC */
102
103#ifdef LV_HAVE_SSE4_1
104#include <smmintrin.h>
106
107static inline void
108volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
109{
110 float* bPtr = bVector;
111 const float* aPtr = aVector;
112
113 unsigned int number = 0;
114 const unsigned int quarterPoints = num_points / 4;
115
116 const __m128i exp_mask = _mm_set1_epi32(0x7f800000);
117 const __m128i mant_mask = _mm_set1_epi32(0x007fffff);
118 const __m128i one_bits = _mm_set1_epi32(0x3f800000);
119 const __m128i exp_bias = _mm_set1_epi32(127);
120 const __m128 one = _mm_set1_ps(1.0f);
121
122 for (; number < quarterPoints; number++) {
123 __m128 aVal = _mm_loadu_ps(aPtr);
124
125 // Check for special values
126 __m128 zero_mask = _mm_cmpeq_ps(aVal, _mm_setzero_ps());
127 __m128 neg_mask = _mm_cmplt_ps(aVal, _mm_setzero_ps());
128 __m128 inf_mask = _mm_cmpeq_ps(aVal, _mm_set1_ps(INFINITY));
129 __m128 nan_mask = _mm_cmpunord_ps(aVal, aVal);
130 __m128 invalid_mask = _mm_or_ps(neg_mask, nan_mask);
131
132 __m128i aVal_i = _mm_castps_si128(aVal);
133
134 // Extract exponent: (aVal_i & exp_mask) >> 23 - bias
135 __m128i exp_i = _mm_srli_epi32(_mm_and_si128(aVal_i, exp_mask), 23);
136 exp_i = _mm_sub_epi32(exp_i, exp_bias);
137 __m128 exp_f = _mm_cvtepi32_ps(exp_i);
138
139 // Extract mantissa as float in [1, 2)
140 __m128 frac =
141 _mm_castsi128_ps(_mm_or_si128(_mm_and_si128(aVal_i, mant_mask), one_bits));
142
143 // Evaluate degree-6 polynomial
144 __m128 poly = _mm_log2_poly_sse(frac);
145
146 // result = exp + poly * (frac - 1)
147 __m128 bVal = _mm_add_ps(exp_f, _mm_mul_ps(poly, _mm_sub_ps(frac, one)));
148
149 // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
150 bVal = _mm_blendv_ps(bVal, _mm_set1_ps(-127.0f), zero_mask);
151 bVal = _mm_blendv_ps(bVal, _mm_set1_ps(127.0f), inf_mask);
152 bVal = _mm_blendv_ps(bVal, _mm_set1_ps(NAN), invalid_mask);
153
154 _mm_storeu_ps(bPtr, bVal);
155
156 aPtr += 4;
157 bPtr += 4;
158 }
159
160 number = quarterPoints * 4;
161 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
162}
163
164#endif /* LV_HAVE_SSE4_1 for unaligned */
165
166#ifdef LV_HAVE_SSE4_1
167
168static inline void
169volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
170{
171 float* bPtr = bVector;
172 const float* aPtr = aVector;
173
174 unsigned int number = 0;
175 const unsigned int quarterPoints = num_points / 4;
176
177 const __m128i exp_mask = _mm_set1_epi32(0x7f800000);
178 const __m128i mant_mask = _mm_set1_epi32(0x007fffff);
179 const __m128i one_bits = _mm_set1_epi32(0x3f800000);
180 const __m128i exp_bias = _mm_set1_epi32(127);
181 const __m128 one = _mm_set1_ps(1.0f);
182
183 for (; number < quarterPoints; number++) {
184 __m128 aVal = _mm_load_ps(aPtr);
185
186 // Check for special values
187 __m128 zero_mask = _mm_cmpeq_ps(aVal, _mm_setzero_ps());
188 __m128 neg_mask = _mm_cmplt_ps(aVal, _mm_setzero_ps());
189 __m128 inf_mask = _mm_cmpeq_ps(aVal, _mm_set1_ps(INFINITY));
190 __m128 nan_mask = _mm_cmpunord_ps(aVal, aVal);
191 __m128 invalid_mask = _mm_or_ps(neg_mask, nan_mask);
192
193 __m128i aVal_i = _mm_castps_si128(aVal);
194
195 // Extract exponent: (aVal_i & exp_mask) >> 23 - bias
196 __m128i exp_i = _mm_srli_epi32(_mm_and_si128(aVal_i, exp_mask), 23);
197 exp_i = _mm_sub_epi32(exp_i, exp_bias);
198 __m128 exp_f = _mm_cvtepi32_ps(exp_i);
199
200 // Extract mantissa as float in [1, 2)
201 __m128 frac =
202 _mm_castsi128_ps(_mm_or_si128(_mm_and_si128(aVal_i, mant_mask), one_bits));
203
204 // Evaluate degree-6 polynomial
205 __m128 poly = _mm_log2_poly_sse(frac);
206
207 // result = exp + poly * (frac - 1)
208 __m128 bVal = _mm_add_ps(exp_f, _mm_mul_ps(poly, _mm_sub_ps(frac, one)));
209
210 // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
211 bVal = _mm_blendv_ps(bVal, _mm_set1_ps(-127.0f), zero_mask);
212 bVal = _mm_blendv_ps(bVal, _mm_set1_ps(127.0f), inf_mask);
213 bVal = _mm_blendv_ps(bVal, _mm_set1_ps(NAN), invalid_mask);
214
215 _mm_store_ps(bPtr, bVal);
216
217 aPtr += 4;
218 bPtr += 4;
219 }
220
221 number = quarterPoints * 4;
222 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
223}
224
225#endif /* LV_HAVE_SSE4_1 */
226
227#ifdef LV_HAVE_AVX2
228#include <immintrin.h>
230
231static inline void
232volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
233{
234 float* bPtr = bVector;
235 const float* aPtr = aVector;
236
237 unsigned int number = 0;
238 const unsigned int eighthPoints = num_points / 8;
239
240 const __m256i exp_mask = _mm256_set1_epi32(0x7f800000);
241 const __m256i mant_mask = _mm256_set1_epi32(0x007fffff);
242 const __m256i one_bits = _mm256_set1_epi32(0x3f800000);
243 const __m256i exp_bias = _mm256_set1_epi32(127);
244 const __m256 one = _mm256_set1_ps(1.0f);
245
246 for (; number < eighthPoints; number++) {
247 __m256 aVal = _mm256_loadu_ps(aPtr);
248
249 // Check for special values
250 __m256 zero_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_EQ_OQ);
251 __m256 neg_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LT_OQ);
252 __m256 inf_mask = _mm256_cmp_ps(aVal, _mm256_set1_ps(INFINITY), _CMP_EQ_OQ);
253 __m256 nan_mask = _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q);
254 __m256 invalid_mask = _mm256_or_ps(neg_mask, nan_mask);
255
256 __m256i aVal_i = _mm256_castps_si256(aVal);
257
258 // Extract exponent
259 __m256i exp_i = _mm256_srli_epi32(_mm256_and_si256(aVal_i, exp_mask), 23);
260 exp_i = _mm256_sub_epi32(exp_i, exp_bias);
261 __m256 exp_f = _mm256_cvtepi32_ps(exp_i);
262
263 // Extract mantissa as float in [1, 2)
264 __m256 frac = _mm256_castsi256_ps(
265 _mm256_or_si256(_mm256_and_si256(aVal_i, mant_mask), one_bits));
266
267 // Evaluate degree-6 polynomial
268 __m256 poly = _mm256_log2_poly_avx2(frac);
269
270 // result = exp + poly * (frac - 1)
271 __m256 bVal = _mm256_add_ps(exp_f, _mm256_mul_ps(poly, _mm256_sub_ps(frac, one)));
272
273 // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
274 bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(-127.0f), zero_mask);
275 bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(127.0f), inf_mask);
276 bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(NAN), invalid_mask);
277
278 _mm256_storeu_ps(bPtr, bVal);
279
280 aPtr += 8;
281 bPtr += 8;
282 }
283
284 number = eighthPoints * 8;
285 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
286}
287
288#endif /* LV_HAVE_AVX2 for unaligned */
289
290#ifdef LV_HAVE_AVX2
291
292static inline void
293volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
294{
295 float* bPtr = bVector;
296 const float* aPtr = aVector;
297
298 unsigned int number = 0;
299 const unsigned int eighthPoints = num_points / 8;
300
301 const __m256i exp_mask = _mm256_set1_epi32(0x7f800000);
302 const __m256i mant_mask = _mm256_set1_epi32(0x007fffff);
303 const __m256i one_bits = _mm256_set1_epi32(0x3f800000);
304 const __m256i exp_bias = _mm256_set1_epi32(127);
305 const __m256 one = _mm256_set1_ps(1.0f);
306
307 for (; number < eighthPoints; number++) {
308 __m256 aVal = _mm256_load_ps(aPtr);
309
310 // Check for special values
311 __m256 zero_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_EQ_OQ);
312 __m256 neg_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LT_OQ);
313 __m256 inf_mask = _mm256_cmp_ps(aVal, _mm256_set1_ps(INFINITY), _CMP_EQ_OQ);
314 __m256 nan_mask = _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q);
315 __m256 invalid_mask = _mm256_or_ps(neg_mask, nan_mask);
316
317 __m256i aVal_i = _mm256_castps_si256(aVal);
318
319 // Extract exponent
320 __m256i exp_i = _mm256_srli_epi32(_mm256_and_si256(aVal_i, exp_mask), 23);
321 exp_i = _mm256_sub_epi32(exp_i, exp_bias);
322 __m256 exp_f = _mm256_cvtepi32_ps(exp_i);
323
324 // Extract mantissa as float in [1, 2)
325 __m256 frac = _mm256_castsi256_ps(
326 _mm256_or_si256(_mm256_and_si256(aVal_i, mant_mask), one_bits));
327
328 // Evaluate degree-6 polynomial
329 __m256 poly = _mm256_log2_poly_avx2(frac);
330
331 // result = exp + poly * (frac - 1)
332 __m256 bVal = _mm256_add_ps(exp_f, _mm256_mul_ps(poly, _mm256_sub_ps(frac, one)));
333
334 // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
335 bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(-127.0f), zero_mask);
336 bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(127.0f), inf_mask);
337 bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(NAN), invalid_mask);
338
339 _mm256_store_ps(bPtr, bVal);
340
341 aPtr += 8;
342 bPtr += 8;
343 }
344
345 number = eighthPoints * 8;
346 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
347}
348
349#endif /* LV_HAVE_AVX2 */
350
351#if LV_HAVE_AVX2 && LV_HAVE_FMA
352#include <immintrin.h>
354
355static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
356 const float* aVector,
357 unsigned int num_points)
358{
359 float* bPtr = bVector;
360 const float* aPtr = aVector;
361
362 unsigned int number = 0;
363 const unsigned int eighthPoints = num_points / 8;
364
365 const __m256i exp_mask = _mm256_set1_epi32(0x7f800000);
366 const __m256i mant_mask = _mm256_set1_epi32(0x007fffff);
367 const __m256i one_bits = _mm256_set1_epi32(0x3f800000);
368 const __m256i exp_bias = _mm256_set1_epi32(127);
369 const __m256 one = _mm256_set1_ps(1.0f);
370
371 for (; number < eighthPoints; number++) {
372 __m256 aVal = _mm256_loadu_ps(aPtr);
373
374 // Check for special values
375 __m256 zero_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_EQ_OQ);
376 __m256 neg_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LT_OQ);
377 __m256 inf_mask = _mm256_cmp_ps(aVal, _mm256_set1_ps(INFINITY), _CMP_EQ_OQ);
378 __m256 nan_mask = _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q);
379 __m256 invalid_mask = _mm256_or_ps(neg_mask, nan_mask);
380
381 __m256i aVal_i = _mm256_castps_si256(aVal);
382
383 // Extract exponent
384 __m256i exp_i = _mm256_srli_epi32(_mm256_and_si256(aVal_i, exp_mask), 23);
385 exp_i = _mm256_sub_epi32(exp_i, exp_bias);
386 __m256 exp_f = _mm256_cvtepi32_ps(exp_i);
387
388 // Extract mantissa as float in [1, 2)
389 __m256 frac = _mm256_castsi256_ps(
390 _mm256_or_si256(_mm256_and_si256(aVal_i, mant_mask), one_bits));
391
392 // Evaluate degree-6 polynomial with FMA
393 __m256 poly = _mm256_log2_poly_avx2_fma(frac);
394
395 // result = exp + poly * (frac - 1)
396 __m256 bVal = _mm256_fmadd_ps(poly, _mm256_sub_ps(frac, one), exp_f);
397
398 // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
399 bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(-127.0f), zero_mask);
400 bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(127.0f), inf_mask);
401 bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(NAN), invalid_mask);
402
403 _mm256_storeu_ps(bPtr, bVal);
404
405 aPtr += 8;
406 bPtr += 8;
407 }
408
409 number = eighthPoints * 8;
410 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
411}
412
413#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
414
415#if LV_HAVE_AVX2 && LV_HAVE_FMA
416
417static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
418 const float* aVector,
419 unsigned int num_points)
420{
421 float* bPtr = bVector;
422 const float* aPtr = aVector;
423
424 unsigned int number = 0;
425 const unsigned int eighthPoints = num_points / 8;
426
427 const __m256i exp_mask = _mm256_set1_epi32(0x7f800000);
428 const __m256i mant_mask = _mm256_set1_epi32(0x007fffff);
429 const __m256i one_bits = _mm256_set1_epi32(0x3f800000);
430 const __m256i exp_bias = _mm256_set1_epi32(127);
431 const __m256 one = _mm256_set1_ps(1.0f);
432
433 for (; number < eighthPoints; number++) {
434 __m256 aVal = _mm256_load_ps(aPtr);
435
436 // Check for special values
437 __m256 zero_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_EQ_OQ);
438 __m256 neg_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LT_OQ);
439 __m256 inf_mask = _mm256_cmp_ps(aVal, _mm256_set1_ps(INFINITY), _CMP_EQ_OQ);
440 __m256 nan_mask = _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q);
441 __m256 invalid_mask = _mm256_or_ps(neg_mask, nan_mask);
442
443 __m256i aVal_i = _mm256_castps_si256(aVal);
444
445 // Extract exponent
446 __m256i exp_i = _mm256_srli_epi32(_mm256_and_si256(aVal_i, exp_mask), 23);
447 exp_i = _mm256_sub_epi32(exp_i, exp_bias);
448 __m256 exp_f = _mm256_cvtepi32_ps(exp_i);
449
450 // Extract mantissa as float in [1, 2)
451 __m256 frac = _mm256_castsi256_ps(
452 _mm256_or_si256(_mm256_and_si256(aVal_i, mant_mask), one_bits));
453
454 // Evaluate degree-6 polynomial with FMA
455 __m256 poly = _mm256_log2_poly_avx2_fma(frac);
456
457 // result = exp + poly * (frac - 1)
458 __m256 bVal = _mm256_fmadd_ps(poly, _mm256_sub_ps(frac, one), exp_f);
459
460 // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
461 bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(-127.0f), zero_mask);
462 bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(127.0f), inf_mask);
463 bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(NAN), invalid_mask);
464
465 _mm256_store_ps(bPtr, bVal);
466
467 aPtr += 8;
468 bPtr += 8;
469 }
470
471 number = eighthPoints * 8;
472 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
473}
474
475#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
476
477#ifdef LV_HAVE_AVX512F
478#include <immintrin.h>
480
481static inline void
482volk_32f_log2_32f_u_avx512(float* bVector, const float* aVector, unsigned int num_points)
483{
484 float* bPtr = bVector;
485 const float* aPtr = aVector;
486
487 unsigned int number = 0;
488 const unsigned int sixteenthPoints = num_points / 16;
489
490 const __m512i exp_mask = _mm512_set1_epi32(0x7f800000);
491 const __m512i mant_mask = _mm512_set1_epi32(0x007fffff);
492 const __m512i one_bits = _mm512_set1_epi32(0x3f800000);
493 const __m512i exp_bias = _mm512_set1_epi32(127);
494 const __m512 one = _mm512_set1_ps(1.0f);
495
496 for (; number < sixteenthPoints; number++) {
497 __m512 aVal = _mm512_loadu_ps(aPtr);
498
499 // Check for special values
500 __mmask16 zero_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_EQ_OQ);
501 __mmask16 neg_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OQ);
502 __mmask16 inf_mask =
503 _mm512_cmp_ps_mask(aVal, _mm512_set1_ps(INFINITY), _CMP_EQ_OQ);
504 __mmask16 nan_mask = _mm512_cmp_ps_mask(aVal, aVal, _CMP_UNORD_Q);
505 __mmask16 invalid_mask = _kor_mask16(neg_mask, nan_mask);
506
507 __m512i aVal_i = _mm512_castps_si512(aVal);
508
509 // Extract exponent
510 __m512i exp_i = _mm512_srli_epi32(_mm512_and_si512(aVal_i, exp_mask), 23);
511 exp_i = _mm512_sub_epi32(exp_i, exp_bias);
512 __m512 exp_f = _mm512_cvtepi32_ps(exp_i);
513
514 // Extract mantissa as float in [1, 2)
515 __m512 frac = _mm512_castsi512_ps(
516 _mm512_or_si512(_mm512_and_si512(aVal_i, mant_mask), one_bits));
517
518 // Evaluate degree-6 polynomial with FMA
519 __m512 poly = _mm512_log2_poly_avx512(frac);
520
521 // result = exp + poly * (frac - 1)
522 __m512 bVal = _mm512_fmadd_ps(poly, _mm512_sub_ps(frac, one), exp_f);
523
524 // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
525 bVal = _mm512_mask_blend_ps(zero_mask, bVal, _mm512_set1_ps(-127.0f));
526 bVal = _mm512_mask_blend_ps(inf_mask, bVal, _mm512_set1_ps(127.0f));
527 bVal = _mm512_mask_blend_ps(invalid_mask, bVal, _mm512_set1_ps(NAN));
528
529 _mm512_storeu_ps(bPtr, bVal);
530
531 aPtr += 16;
532 bPtr += 16;
533 }
534
535 number = sixteenthPoints * 16;
536 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
537}
538
539#endif /* LV_HAVE_AVX512F for unaligned */
540
541#ifdef LV_HAVE_AVX512F
542
543static inline void
544volk_32f_log2_32f_a_avx512(float* bVector, const float* aVector, unsigned int num_points)
545{
546 float* bPtr = bVector;
547 const float* aPtr = aVector;
548
549 unsigned int number = 0;
550 const unsigned int sixteenthPoints = num_points / 16;
551
552 const __m512i exp_mask = _mm512_set1_epi32(0x7f800000);
553 const __m512i mant_mask = _mm512_set1_epi32(0x007fffff);
554 const __m512i one_bits = _mm512_set1_epi32(0x3f800000);
555 const __m512i exp_bias = _mm512_set1_epi32(127);
556 const __m512 one = _mm512_set1_ps(1.0f);
557
558 for (; number < sixteenthPoints; number++) {
559 __m512 aVal = _mm512_load_ps(aPtr);
560
561 // Check for special values
562 __mmask16 zero_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_EQ_OQ);
563 __mmask16 neg_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OQ);
564 __mmask16 inf_mask =
565 _mm512_cmp_ps_mask(aVal, _mm512_set1_ps(INFINITY), _CMP_EQ_OQ);
566 __mmask16 nan_mask = _mm512_cmp_ps_mask(aVal, aVal, _CMP_UNORD_Q);
567 __mmask16 invalid_mask = _kor_mask16(neg_mask, nan_mask);
568
569 __m512i aVal_i = _mm512_castps_si512(aVal);
570
571 // Extract exponent
572 __m512i exp_i = _mm512_srli_epi32(_mm512_and_si512(aVal_i, exp_mask), 23);
573 exp_i = _mm512_sub_epi32(exp_i, exp_bias);
574 __m512 exp_f = _mm512_cvtepi32_ps(exp_i);
575
576 // Extract mantissa as float in [1, 2)
577 __m512 frac = _mm512_castsi512_ps(
578 _mm512_or_si512(_mm512_and_si512(aVal_i, mant_mask), one_bits));
579
580 // Evaluate degree-6 polynomial with FMA
581 __m512 poly = _mm512_log2_poly_avx512(frac);
582
583 // result = exp + poly * (frac - 1)
584 __m512 bVal = _mm512_fmadd_ps(poly, _mm512_sub_ps(frac, one), exp_f);
585
586 // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
587 bVal = _mm512_mask_blend_ps(zero_mask, bVal, _mm512_set1_ps(-127.0f));
588 bVal = _mm512_mask_blend_ps(inf_mask, bVal, _mm512_set1_ps(127.0f));
589 bVal = _mm512_mask_blend_ps(invalid_mask, bVal, _mm512_set1_ps(NAN));
590
591 _mm512_store_ps(bPtr, bVal);
592
593 aPtr += 16;
594 bPtr += 16;
595 }
596
597 number = sixteenthPoints * 16;
598 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
599}
600
601#endif /* LV_HAVE_AVX512F */
602
603#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
604
605static inline void volk_32f_log2_32f_u_avx512dq(float* bVector,
606 const float* aVector,
607 unsigned int num_points)
608{
609 float* bPtr = bVector;
610 const float* aPtr = aVector;
611
612 unsigned int number = 0;
613 const unsigned int sixteenthPoints = num_points / 16;
614
615 const __m512i exp_mask = _mm512_set1_epi32(0x7f800000);
616 const __m512i mant_mask = _mm512_set1_epi32(0x007fffff);
617 const __m512i one_bits = _mm512_set1_epi32(0x3f800000);
618 const __m512i exp_bias = _mm512_set1_epi32(127);
619 const __m512 one = _mm512_set1_ps(1.0f);
620
621 for (; number < sixteenthPoints; number++) {
622 __m512 aVal = _mm512_loadu_ps(aPtr);
623
624 // Use fpclass for special value detection (AVX512DQ feature)
625 // 0x01 = QNaN, 0x02 = +0, 0x04 = -0, 0x08 = +Inf, 0x10 = -Inf, 0x80 = SNaN
626 __mmask16 nan_mask = _mm512_fpclass_ps_mask(aVal, 0x81); // NaN (QNaN | SNaN)
627 __mmask16 zero_mask = _mm512_fpclass_ps_mask(aVal, 0x06); // Zero (+0 | -0)
628 __mmask16 inf_mask = _mm512_fpclass_ps_mask(aVal, 0x08); // +Inf only
629 __mmask16 neg_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OQ);
630 __mmask16 invalid_mask = _kor_mask16(nan_mask, neg_mask); // neg or NaN -> NaN
631
632 __m512i aVal_i = _mm512_castps_si512(aVal);
633
634 // Extract exponent
635 __m512i exp_i = _mm512_srli_epi32(_mm512_and_si512(aVal_i, exp_mask), 23);
636 exp_i = _mm512_sub_epi32(exp_i, exp_bias);
637 __m512 exp_f = _mm512_cvtepi32_ps(exp_i);
638
639 // Extract mantissa as float in [1, 2)
640 __m512 frac = _mm512_castsi512_ps(
641 _mm512_or_si512(_mm512_and_si512(aVal_i, mant_mask), one_bits));
642
643 // Evaluate degree-6 polynomial with FMA
644 __m512 poly = _mm512_log2_poly_avx512(frac);
645
646 // result = exp + poly * (frac - 1)
647 __m512 bVal = _mm512_fmadd_ps(poly, _mm512_sub_ps(frac, one), exp_f);
648
649 // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
650 bVal = _mm512_mask_blend_ps(zero_mask, bVal, _mm512_set1_ps(-127.0f));
651 bVal = _mm512_mask_blend_ps(inf_mask, bVal, _mm512_set1_ps(127.0f));
652 bVal = _mm512_mask_blend_ps(invalid_mask, bVal, _mm512_set1_ps(NAN));
653
654 _mm512_storeu_ps(bPtr, bVal);
655
656 aPtr += 16;
657 bPtr += 16;
658 }
659
660 number = sixteenthPoints * 16;
661 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
662}
663
664#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ for unaligned */
665
666#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
667
668static inline void volk_32f_log2_32f_a_avx512dq(float* bVector,
669 const float* aVector,
670 unsigned int num_points)
671{
672 float* bPtr = bVector;
673 const float* aPtr = aVector;
674
675 unsigned int number = 0;
676 const unsigned int sixteenthPoints = num_points / 16;
677
678 const __m512i exp_mask = _mm512_set1_epi32(0x7f800000);
679 const __m512i mant_mask = _mm512_set1_epi32(0x007fffff);
680 const __m512i one_bits = _mm512_set1_epi32(0x3f800000);
681 const __m512i exp_bias = _mm512_set1_epi32(127);
682 const __m512 one = _mm512_set1_ps(1.0f);
683
684 for (; number < sixteenthPoints; number++) {
685 __m512 aVal = _mm512_load_ps(aPtr);
686
687 // Use fpclass for special value detection (AVX512DQ feature)
688 // 0x01 = QNaN, 0x02 = +0, 0x04 = -0, 0x08 = +Inf, 0x10 = -Inf, 0x80 = SNaN
689 __mmask16 nan_mask = _mm512_fpclass_ps_mask(aVal, 0x81); // NaN (QNaN | SNaN)
690 __mmask16 zero_mask = _mm512_fpclass_ps_mask(aVal, 0x06); // Zero (+0 | -0)
691 __mmask16 inf_mask = _mm512_fpclass_ps_mask(aVal, 0x08); // +Inf only
692 __mmask16 neg_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OQ);
693 __mmask16 invalid_mask = _kor_mask16(nan_mask, neg_mask); // neg or NaN -> NaN
694
695 __m512i aVal_i = _mm512_castps_si512(aVal);
696
697 // Extract exponent
698 __m512i exp_i = _mm512_srli_epi32(_mm512_and_si512(aVal_i, exp_mask), 23);
699 exp_i = _mm512_sub_epi32(exp_i, exp_bias);
700 __m512 exp_f = _mm512_cvtepi32_ps(exp_i);
701
702 // Extract mantissa as float in [1, 2)
703 __m512 frac = _mm512_castsi512_ps(
704 _mm512_or_si512(_mm512_and_si512(aVal_i, mant_mask), one_bits));
705
706 // Evaluate degree-6 polynomial with FMA
707 __m512 poly = _mm512_log2_poly_avx512(frac);
708
709 // result = exp + poly * (frac - 1)
710 __m512 bVal = _mm512_fmadd_ps(poly, _mm512_sub_ps(frac, one), exp_f);
711
712 // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
713 bVal = _mm512_mask_blend_ps(zero_mask, bVal, _mm512_set1_ps(-127.0f));
714 bVal = _mm512_mask_blend_ps(inf_mask, bVal, _mm512_set1_ps(127.0f));
715 bVal = _mm512_mask_blend_ps(invalid_mask, bVal, _mm512_set1_ps(NAN));
716
717 _mm512_store_ps(bPtr, bVal);
718
719 aPtr += 16;
720 bPtr += 16;
721 }
722
723 number = sixteenthPoints * 16;
724 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
725}
726
727#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ */
728
729#ifdef LV_HAVE_NEON
730#include <arm_neon.h>
732
733static inline void
734volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
735{
736 float* bPtr = bVector;
737 const float* aPtr = aVector;
738 unsigned int number;
739 const unsigned int quarterPoints = num_points / 4;
740
741 const int32x4_t exp_mask = vdupq_n_s32(0x7f800000);
742 const int32x4_t mant_mask = vdupq_n_s32(0x007fffff);
743 const int32x4_t one_bits = vdupq_n_s32(0x3f800000);
744 const int32x4_t exp_bias = vdupq_n_s32(127);
745 const float32x4_t one = vdupq_n_f32(1.0f);
746 const float32x4_t zero = vdupq_n_f32(0.0f);
747 const float32x4_t inf_val = vdupq_n_f32(INFINITY);
748 const float32x4_t nan_val = vdupq_n_f32(NAN);
749 const float32x4_t neg_inf_out = vdupq_n_f32(-127.0f);
750 const float32x4_t pos_inf_out = vdupq_n_f32(127.0f);
751
752 for (number = 0; number < quarterPoints; ++number) {
753 float32x4_t aVal = vld1q_f32(aPtr);
754
755 // Check for special values
756 uint32x4_t neg_mask = vcltq_f32(aVal, zero);
757 uint32x4_t zero_mask = vceqq_f32(aVal, zero);
758 uint32x4_t inf_mask = vceqq_f32(aVal, inf_val);
759 uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aVal, aVal));
760 uint32x4_t invalid_mask = vorrq_u32(neg_mask, nan_mask);
761
762 int32x4_t aVal_i = vreinterpretq_s32_f32(aVal);
763
764 // Extract exponent
765 int32x4_t exp_i = vshrq_n_s32(vandq_s32(aVal_i, exp_mask), 23);
766 exp_i = vsubq_s32(exp_i, exp_bias);
767 float32x4_t exp_f = vcvtq_f32_s32(exp_i);
768
769 // Extract mantissa as float in [1, 2)
770 int32x4_t frac_i = vorrq_s32(vandq_s32(aVal_i, mant_mask), one_bits);
771 float32x4_t frac = vreinterpretq_f32_s32(frac_i);
772
773 // Evaluate degree-6 polynomial
774 float32x4_t poly = _vlog2_poly_f32(frac);
775
776 // result = exp + poly * (frac - 1)
777 float32x4_t bVal = vaddq_f32(exp_f, vmulq_f32(poly, vsubq_f32(frac, one)));
778
779 // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
780 bVal = vbslq_f32(zero_mask, neg_inf_out, bVal);
781 bVal = vbslq_f32(inf_mask, pos_inf_out, bVal);
782 bVal = vbslq_f32(invalid_mask, nan_val, bVal);
783
784 vst1q_f32(bPtr, bVal);
785
786 aPtr += 4;
787 bPtr += 4;
788 }
789
790 number = quarterPoints * 4;
791 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
792}
793
794#endif /* LV_HAVE_NEON */
795
796#ifdef LV_HAVE_NEONV8
797
798static inline void
799volk_32f_log2_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
800{
801 float* bPtr = bVector;
802 const float* aPtr = aVector;
803 unsigned int number;
804 const unsigned int quarterPoints = num_points / 4;
805
806 const int32x4_t exp_mask = vdupq_n_s32(0x7f800000);
807 const int32x4_t mant_mask = vdupq_n_s32(0x007fffff);
808 const int32x4_t one_bits = vdupq_n_s32(0x3f800000);
809 const int32x4_t exp_bias = vdupq_n_s32(127);
810 const float32x4_t one = vdupq_n_f32(1.0f);
811 const float32x4_t zero = vdupq_n_f32(0.0f);
812 const float32x4_t inf_val = vdupq_n_f32(INFINITY);
813 const float32x4_t nan_val = vdupq_n_f32(NAN);
814 const float32x4_t neg_inf_out = vdupq_n_f32(-127.0f);
815 const float32x4_t pos_inf_out = vdupq_n_f32(127.0f);
816
817 for (number = 0; number < quarterPoints; ++number) {
818 float32x4_t aVal = vld1q_f32(aPtr);
819
820 // Check for special values
821 uint32x4_t neg_mask = vcltq_f32(aVal, zero);
822 uint32x4_t zero_mask = vceqq_f32(aVal, zero);
823 uint32x4_t inf_mask = vceqq_f32(aVal, inf_val);
824 uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aVal, aVal));
825 uint32x4_t invalid_mask = vorrq_u32(neg_mask, nan_mask);
826
827 int32x4_t aVal_i = vreinterpretq_s32_f32(aVal);
828
829 // Extract exponent
830 int32x4_t exp_i = vshrq_n_s32(vandq_s32(aVal_i, exp_mask), 23);
831 exp_i = vsubq_s32(exp_i, exp_bias);
832 float32x4_t exp_f = vcvtq_f32_s32(exp_i);
833
834 // Extract mantissa as float in [1, 2)
835 int32x4_t frac_i = vorrq_s32(vandq_s32(aVal_i, mant_mask), one_bits);
836 float32x4_t frac = vreinterpretq_f32_s32(frac_i);
837
838 // Evaluate degree-6 polynomial with FMA
839 float32x4_t poly = _vlog2_poly_neonv8(frac);
840
841 // result = exp + poly * (frac - 1)
842 float32x4_t bVal = vfmaq_f32(exp_f, poly, vsubq_f32(frac, one));
843
844 // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
845 bVal = vbslq_f32(zero_mask, neg_inf_out, bVal);
846 bVal = vbslq_f32(inf_mask, pos_inf_out, bVal);
847 bVal = vbslq_f32(invalid_mask, nan_val, bVal);
848
849 vst1q_f32(bPtr, bVal);
850
851 aPtr += 4;
852 bPtr += 4;
853 }
854
855 number = quarterPoints * 4;
856 volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
857}
858
859#endif /* LV_HAVE_NEONV8 */
860
861
862#endif /* INCLUDED_volk_32f_log2_32f_a_H */
863
864#ifndef INCLUDED_volk_32f_log2_32f_u_H
865#define INCLUDED_volk_32f_log2_32f_u_H
866
867#ifdef LV_HAVE_RVV
868#include <riscv_vector.h>
870
871static inline void
872volk_32f_log2_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
873{
874 size_t vlmax = __riscv_vsetvlmax_e32m2();
875
876 const vfloat32m2_t one = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
877 const vint32m2_t one_bits = __riscv_vreinterpret_i32m2(one);
878 const vint32m2_t mant_mask = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax);
879 const vint32m2_t exp_bias = __riscv_vmv_v_x_i32m2(127, vlmax);
880
881 const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
882 const vfloat32m2_t inf_val = __riscv_vfmv_v_f_f32m2(INFINITY, vlmax);
883 const vfloat32m2_t nan_val = __riscv_vfmv_v_f_f32m2(NAN, vlmax);
884 const vfloat32m2_t neg_inf_out = __riscv_vfmv_v_f_f32m2(-127.0f, vlmax);
885 const vfloat32m2_t pos_inf_out = __riscv_vfmv_v_f_f32m2(127.0f, vlmax);
886
887 size_t n = num_points;
888 for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
889 vl = __riscv_vsetvl_e32m2(n);
890 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
891
892 // Check for special values
893 vbool16_t zero_mask = __riscv_vmfeq(v, zero, vl);
894 vbool16_t neg_mask = __riscv_vmflt(v, zero, vl);
895 vbool16_t inf_mask = __riscv_vmfeq(v, inf_val, vl);
896 vbool16_t nan_mask = __riscv_vmfne(v, v, vl);
897 vbool16_t invalid_mask = __riscv_vmor(neg_mask, nan_mask, vl);
898
899 vfloat32m2_t a = __riscv_vfabs(v, vl);
900 vfloat32m2_t exp_f = __riscv_vfcvt_f(
901 __riscv_vsub(
902 __riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), exp_bias, vl),
903 vl);
904 vfloat32m2_t frac = __riscv_vreinterpret_f32m2(__riscv_vor(
905 __riscv_vand(__riscv_vreinterpret_i32m2(v), mant_mask, vl), one_bits, vl));
906
907 // Evaluate degree-6 polynomial with FMA
908 vfloat32m2_t poly = __riscv_vlog2_poly_f32m2(frac, vl, vlmax);
909
910 // result = exp + poly * (frac - 1)
911 vfloat32m2_t result =
912 __riscv_vfmacc(exp_f, poly, __riscv_vfsub(frac, one, vl), vl);
913
914 // Replace special values: zero → -127, inf → 127, neg/NaN → NaN
915 result = __riscv_vmerge(result, neg_inf_out, zero_mask, vl);
916 result = __riscv_vmerge(result, pos_inf_out, inf_mask, vl);
917 result = __riscv_vmerge(result, nan_val, invalid_mask, vl);
918
919 __riscv_vse32(bVector, result, vl);
920 }
921}
922#endif /*LV_HAVE_RVV*/
923
924#endif /* INCLUDED_volk_32f_log2_32f_u_H */