#ifndef INCLUDED_volk_32f_log2_32f_a_H
#define INCLUDED_volk_32f_log2_32f_a_H

#include <math.h>

#ifdef LV_HAVE_GENERIC

static inline void
volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    const __m128i exp_mask = _mm_set1_epi32(0x7f800000);
    const __m128i mant_mask = _mm_set1_epi32(0x007fffff);
    const __m128i one_bits = _mm_set1_epi32(0x3f800000);
    const __m128i exp_bias = _mm_set1_epi32(127);
    const __m128 one = _mm_set1_ps(1.0f);

    for (; number < quarterPoints; number++) {
        __m128 aVal = _mm_loadu_ps(aPtr);
        /* Classify special inputs: zeros, negatives, infinities, NaNs. */
        __m128 zero_mask = _mm_cmpeq_ps(aVal, _mm_setzero_ps());
        __m128 neg_mask = _mm_cmplt_ps(aVal, _mm_setzero_ps());
        __m128 inf_mask = _mm_cmpeq_ps(aVal, _mm_set1_ps(INFINITY));
        __m128 nan_mask = _mm_cmpunord_ps(aVal, aVal);
        __m128 invalid_mask = _mm_or_ps(neg_mask, nan_mask);

        __m128i aVal_i = _mm_castps_si128(aVal);

        /* Unbiased exponent: (bits >> 23) - 127. */
        __m128i exp_i = _mm_srli_epi32(_mm_and_si128(aVal_i, exp_mask), 23);
        exp_i = _mm_sub_epi32(exp_i, exp_bias);
        __m128 exp_f = _mm_cvtepi32_ps(exp_i);

        /* Rebuild the mantissa as a float in [1.0, 2.0). */
        __m128 frac =
            _mm_castsi128_ps(_mm_or_si128(_mm_and_si128(aVal_i, mant_mask), one_bits));
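
        /* The source elides the polynomial evaluation here: `poly` must
           approximate log2(m) / (m - 1) for the mantissa m in [1, 2), so that
           exp + poly * (m - 1) ~= log2(x). As an illustrative stand-in (an
           assumption, not the original coefficients), evaluate the atanh
           series ln(m) = 2t * (1 + t^2/3 + t^4/5 + t^6/7), t = (m-1)/(m+1),
           giving poly = (2 / ln 2) * (1 + t^2/3 + t^4/5 + t^6/7) / (m + 1),
           good to a few 1e-5 relative over [1, 2). */
        __m128 mp1 = _mm_add_ps(frac, one);
        __m128 t = _mm_div_ps(_mm_sub_ps(frac, one), mp1);
        __m128 t2 = _mm_mul_ps(t, t);
        __m128 s = _mm_set1_ps(1.0f / 7.0f);
        s = _mm_add_ps(_mm_mul_ps(s, t2), _mm_set1_ps(1.0f / 5.0f));
        s = _mm_add_ps(_mm_mul_ps(s, t2), _mm_set1_ps(1.0f / 3.0f));
        s = _mm_add_ps(_mm_mul_ps(s, t2), one);
        __m128 poly = _mm_div_ps(_mm_mul_ps(_mm_set1_ps(2.88539008f), s), mp1);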

        /* log2(x) = exponent + poly * (mantissa - 1). */
        __m128 bVal = _mm_add_ps(exp_f, _mm_mul_ps(poly, _mm_sub_ps(frac, one)));

        /* Special cases: log2(0) -> -127, log2(inf) -> 127, negative or NaN -> NaN. */
        bVal = _mm_blendv_ps(bVal, _mm_set1_ps(-127.0f), zero_mask);
        bVal = _mm_blendv_ps(bVal, _mm_set1_ps(127.0f), inf_mask);
        bVal = _mm_blendv_ps(bVal, _mm_set1_ps(NAN), invalid_mask);

        _mm_storeu_ps(bPtr, bVal);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}

static inline void
volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    const __m128i exp_mask = _mm_set1_epi32(0x7f800000);
    const __m128i mant_mask = _mm_set1_epi32(0x007fffff);
    const __m128i one_bits = _mm_set1_epi32(0x3f800000);
    const __m128i exp_bias = _mm_set1_epi32(127);
    const __m128 one = _mm_set1_ps(1.0f);

    for (; number < quarterPoints; number++) {
        __m128 aVal = _mm_load_ps(aPtr);
        /* Classify special inputs: zeros, negatives, infinities, NaNs. */
        __m128 zero_mask = _mm_cmpeq_ps(aVal, _mm_setzero_ps());
        __m128 neg_mask = _mm_cmplt_ps(aVal, _mm_setzero_ps());
        __m128 inf_mask = _mm_cmpeq_ps(aVal, _mm_set1_ps(INFINITY));
        __m128 nan_mask = _mm_cmpunord_ps(aVal, aVal);
        __m128 invalid_mask = _mm_or_ps(neg_mask, nan_mask);

        __m128i aVal_i = _mm_castps_si128(aVal);

        /* Unbiased exponent: (bits >> 23) - 127. */
        __m128i exp_i = _mm_srli_epi32(_mm_and_si128(aVal_i, exp_mask), 23);
        exp_i = _mm_sub_epi32(exp_i, exp_bias);
        __m128 exp_f = _mm_cvtepi32_ps(exp_i);

        /* Rebuild the mantissa as a float in [1.0, 2.0). */
        __m128 frac =
            _mm_castsi128_ps(_mm_or_si128(_mm_and_si128(aVal_i, mant_mask), one_bits));
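
        /* Polynomial term for log2(m)/(m - 1): same illustrative atanh-series
           stand-in as in volk_32f_log2_32f_u_sse4_1 (the original evaluation
           is elided from this listing). */
        __m128 mp1 = _mm_add_ps(frac, one);
        __m128 t = _mm_div_ps(_mm_sub_ps(frac, one), mp1);
        __m128 t2 = _mm_mul_ps(t, t);
        __m128 s = _mm_set1_ps(1.0f / 7.0f);
        s = _mm_add_ps(_mm_mul_ps(s, t2), _mm_set1_ps(1.0f / 5.0f));
        s = _mm_add_ps(_mm_mul_ps(s, t2), _mm_set1_ps(1.0f / 3.0f));
        s = _mm_add_ps(_mm_mul_ps(s, t2), one);
        __m128 poly = _mm_div_ps(_mm_mul_ps(_mm_set1_ps(2.88539008f), s), mp1);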

        /* log2(x) = exponent + poly * (mantissa - 1). */
        __m128 bVal = _mm_add_ps(exp_f, _mm_mul_ps(poly, _mm_sub_ps(frac, one)));

        /* Special cases: log2(0) -> -127, log2(inf) -> 127, negative or NaN -> NaN. */
        bVal = _mm_blendv_ps(bVal, _mm_set1_ps(-127.0f), zero_mask);
        bVal = _mm_blendv_ps(bVal, _mm_set1_ps(127.0f), inf_mask);
        bVal = _mm_blendv_ps(bVal, _mm_set1_ps(NAN), invalid_mask);

        _mm_store_ps(bPtr, bVal);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}
#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void
volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    const __m256i exp_mask = _mm256_set1_epi32(0x7f800000);
    const __m256i mant_mask = _mm256_set1_epi32(0x007fffff);
    const __m256i one_bits = _mm256_set1_epi32(0x3f800000);
    const __m256i exp_bias = _mm256_set1_epi32(127);
    const __m256 one = _mm256_set1_ps(1.0f);

    for (; number < eighthPoints; number++) {
        __m256 aVal = _mm256_loadu_ps(aPtr);
        /* Classify special inputs: zeros, negatives, infinities, NaNs. */
        __m256 zero_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_EQ_OQ);
        __m256 neg_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LT_OQ);
        __m256 inf_mask = _mm256_cmp_ps(aVal, _mm256_set1_ps(INFINITY), _CMP_EQ_OQ);
        __m256 nan_mask = _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q);
        __m256 invalid_mask = _mm256_or_ps(neg_mask, nan_mask);

        __m256i aVal_i = _mm256_castps_si256(aVal);

        /* Unbiased exponent: (bits >> 23) - 127. */
        __m256i exp_i = _mm256_srli_epi32(_mm256_and_si256(aVal_i, exp_mask), 23);
        exp_i = _mm256_sub_epi32(exp_i, exp_bias);
        __m256 exp_f = _mm256_cvtepi32_ps(exp_i);

        /* Rebuild the mantissa as a float in [1.0, 2.0). */
        __m256 frac = _mm256_castsi256_ps(
            _mm256_or_si256(_mm256_and_si256(aVal_i, mant_mask), one_bits));
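
        /* Polynomial term for log2(m)/(m - 1): illustrative atanh-series
           stand-in (the original evaluation is elided from this listing). */
        __m256 mp1 = _mm256_add_ps(frac, one);
        __m256 t = _mm256_div_ps(_mm256_sub_ps(frac, one), mp1);
        __m256 t2 = _mm256_mul_ps(t, t);
        __m256 s = _mm256_set1_ps(1.0f / 7.0f);
        s = _mm256_add_ps(_mm256_mul_ps(s, t2), _mm256_set1_ps(1.0f / 5.0f));
        s = _mm256_add_ps(_mm256_mul_ps(s, t2), _mm256_set1_ps(1.0f / 3.0f));
        s = _mm256_add_ps(_mm256_mul_ps(s, t2), one);
        __m256 poly = _mm256_div_ps(_mm256_mul_ps(_mm256_set1_ps(2.88539008f), s), mp1);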

        /* log2(x) = exponent + poly * (mantissa - 1). */
        __m256 bVal = _mm256_add_ps(exp_f, _mm256_mul_ps(poly, _mm256_sub_ps(frac, one)));

        /* Special cases: log2(0) -> -127, log2(inf) -> 127, negative or NaN -> NaN. */
        bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(-127.0f), zero_mask);
        bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(127.0f), inf_mask);
        bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(NAN), invalid_mask);

        _mm256_storeu_ps(bPtr, bVal);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}

static inline void
volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    const __m256i exp_mask = _mm256_set1_epi32(0x7f800000);
    const __m256i mant_mask = _mm256_set1_epi32(0x007fffff);
    const __m256i one_bits = _mm256_set1_epi32(0x3f800000);
    const __m256i exp_bias = _mm256_set1_epi32(127);
    const __m256 one = _mm256_set1_ps(1.0f);

    for (; number < eighthPoints; number++) {
        __m256 aVal = _mm256_load_ps(aPtr);
        /* Classify special inputs: zeros, negatives, infinities, NaNs. */
        __m256 zero_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_EQ_OQ);
        __m256 neg_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LT_OQ);
        __m256 inf_mask = _mm256_cmp_ps(aVal, _mm256_set1_ps(INFINITY), _CMP_EQ_OQ);
        __m256 nan_mask = _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q);
        __m256 invalid_mask = _mm256_or_ps(neg_mask, nan_mask);

        __m256i aVal_i = _mm256_castps_si256(aVal);

        /* Unbiased exponent: (bits >> 23) - 127. */
        __m256i exp_i = _mm256_srli_epi32(_mm256_and_si256(aVal_i, exp_mask), 23);
        exp_i = _mm256_sub_epi32(exp_i, exp_bias);
        __m256 exp_f = _mm256_cvtepi32_ps(exp_i);

        /* Rebuild the mantissa as a float in [1.0, 2.0). */
        __m256 frac = _mm256_castsi256_ps(
            _mm256_or_si256(_mm256_and_si256(aVal_i, mant_mask), one_bits));
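
        /* Polynomial term for log2(m)/(m - 1): illustrative atanh-series
           stand-in (the original evaluation is elided from this listing). */
        __m256 mp1 = _mm256_add_ps(frac, one);
        __m256 t = _mm256_div_ps(_mm256_sub_ps(frac, one), mp1);
        __m256 t2 = _mm256_mul_ps(t, t);
        __m256 s = _mm256_set1_ps(1.0f / 7.0f);
        s = _mm256_add_ps(_mm256_mul_ps(s, t2), _mm256_set1_ps(1.0f / 5.0f));
        s = _mm256_add_ps(_mm256_mul_ps(s, t2), _mm256_set1_ps(1.0f / 3.0f));
        s = _mm256_add_ps(_mm256_mul_ps(s, t2), one);
        __m256 poly = _mm256_div_ps(_mm256_mul_ps(_mm256_set1_ps(2.88539008f), s), mp1);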

        /* log2(x) = exponent + poly * (mantissa - 1). */
        __m256 bVal = _mm256_add_ps(exp_f, _mm256_mul_ps(poly, _mm256_sub_ps(frac, one)));

        /* Special cases: log2(0) -> -127, log2(inf) -> 127, negative or NaN -> NaN. */
        bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(-127.0f), zero_mask);
        bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(127.0f), inf_mask);
        bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(NAN), invalid_mask);

        _mm256_store_ps(bPtr, bVal);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX2 */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_log2_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    const __m256i exp_mask = _mm256_set1_epi32(0x7f800000);
    const __m256i mant_mask = _mm256_set1_epi32(0x007fffff);
    const __m256i one_bits = _mm256_set1_epi32(0x3f800000);
    const __m256i exp_bias = _mm256_set1_epi32(127);
    const __m256 one = _mm256_set1_ps(1.0f);

    for (; number < eighthPoints; number++) {
        __m256 aVal = _mm256_loadu_ps(aPtr);
        /* Classify special inputs: zeros, negatives, infinities, NaNs. */
        __m256 zero_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_EQ_OQ);
        __m256 neg_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LT_OQ);
        __m256 inf_mask = _mm256_cmp_ps(aVal, _mm256_set1_ps(INFINITY), _CMP_EQ_OQ);
        __m256 nan_mask = _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q);
        __m256 invalid_mask = _mm256_or_ps(neg_mask, nan_mask);

        __m256i aVal_i = _mm256_castps_si256(aVal);

        /* Unbiased exponent: (bits >> 23) - 127. */
        __m256i exp_i = _mm256_srli_epi32(_mm256_and_si256(aVal_i, exp_mask), 23);
        exp_i = _mm256_sub_epi32(exp_i, exp_bias);
        __m256 exp_f = _mm256_cvtepi32_ps(exp_i);

        /* Rebuild the mantissa as a float in [1.0, 2.0). */
        __m256 frac = _mm256_castsi256_ps(
            _mm256_or_si256(_mm256_and_si256(aVal_i, mant_mask), one_bits));
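
        /* Polynomial term for log2(m)/(m - 1): illustrative atanh-series
           stand-in, using FMA for the Horner steps (the original evaluation
           is elided from this listing). */
        __m256 mp1 = _mm256_add_ps(frac, one);
        __m256 t = _mm256_div_ps(_mm256_sub_ps(frac, one), mp1);
        __m256 t2 = _mm256_mul_ps(t, t);
        __m256 s = _mm256_set1_ps(1.0f / 7.0f);
        s = _mm256_fmadd_ps(s, t2, _mm256_set1_ps(1.0f / 5.0f));
        s = _mm256_fmadd_ps(s, t2, _mm256_set1_ps(1.0f / 3.0f));
        s = _mm256_fmadd_ps(s, t2, one);
        __m256 poly = _mm256_div_ps(_mm256_mul_ps(_mm256_set1_ps(2.88539008f), s), mp1);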

        /* log2(x) = exponent + poly * (mantissa - 1), fused. */
        __m256 bVal = _mm256_fmadd_ps(poly, _mm256_sub_ps(frac, one), exp_f);

        /* Special cases: log2(0) -> -127, log2(inf) -> 127, negative or NaN -> NaN. */
        bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(-127.0f), zero_mask);
        bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(127.0f), inf_mask);
        bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(NAN), invalid_mask);

        _mm256_storeu_ps(bPtr, bVal);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void
volk_32f_log2_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    const __m256i exp_mask = _mm256_set1_epi32(0x7f800000);
    const __m256i mant_mask = _mm256_set1_epi32(0x007fffff);
    const __m256i one_bits = _mm256_set1_epi32(0x3f800000);
    const __m256i exp_bias = _mm256_set1_epi32(127);
    const __m256 one = _mm256_set1_ps(1.0f);

    for (; number < eighthPoints; number++) {
        __m256 aVal = _mm256_load_ps(aPtr);
        /* Classify special inputs: zeros, negatives, infinities, NaNs. */
        __m256 zero_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_EQ_OQ);
        __m256 neg_mask = _mm256_cmp_ps(aVal, _mm256_setzero_ps(), _CMP_LT_OQ);
        __m256 inf_mask = _mm256_cmp_ps(aVal, _mm256_set1_ps(INFINITY), _CMP_EQ_OQ);
        __m256 nan_mask = _mm256_cmp_ps(aVal, aVal, _CMP_UNORD_Q);
        __m256 invalid_mask = _mm256_or_ps(neg_mask, nan_mask);

        __m256i aVal_i = _mm256_castps_si256(aVal);

        /* Unbiased exponent: (bits >> 23) - 127. */
        __m256i exp_i = _mm256_srli_epi32(_mm256_and_si256(aVal_i, exp_mask), 23);
        exp_i = _mm256_sub_epi32(exp_i, exp_bias);
        __m256 exp_f = _mm256_cvtepi32_ps(exp_i);

        /* Rebuild the mantissa as a float in [1.0, 2.0). */
        __m256 frac = _mm256_castsi256_ps(
            _mm256_or_si256(_mm256_and_si256(aVal_i, mant_mask), one_bits));
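
        /* Polynomial term for log2(m)/(m - 1): illustrative atanh-series
           stand-in, using FMA for the Horner steps (the original evaluation
           is elided from this listing). */
        __m256 mp1 = _mm256_add_ps(frac, one);
        __m256 t = _mm256_div_ps(_mm256_sub_ps(frac, one), mp1);
        __m256 t2 = _mm256_mul_ps(t, t);
        __m256 s = _mm256_set1_ps(1.0f / 7.0f);
        s = _mm256_fmadd_ps(s, t2, _mm256_set1_ps(1.0f / 5.0f));
        s = _mm256_fmadd_ps(s, t2, _mm256_set1_ps(1.0f / 3.0f));
        s = _mm256_fmadd_ps(s, t2, one);
        __m256 poly = _mm256_div_ps(_mm256_mul_ps(_mm256_set1_ps(2.88539008f), s), mp1);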

        /* log2(x) = exponent + poly * (mantissa - 1), fused. */
        __m256 bVal = _mm256_fmadd_ps(poly, _mm256_sub_ps(frac, one), exp_f);

        /* Special cases: log2(0) -> -127, log2(inf) -> 127, negative or NaN -> NaN. */
        bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(-127.0f), zero_mask);
        bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(127.0f), inf_mask);
        bVal = _mm256_blendv_ps(bVal, _mm256_set1_ps(NAN), invalid_mask);

        _mm256_store_ps(bPtr, bVal);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */

#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

static inline void
volk_32f_log2_32f_u_avx512(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    const __m512i exp_mask = _mm512_set1_epi32(0x7f800000);
    const __m512i mant_mask = _mm512_set1_epi32(0x007fffff);
    const __m512i one_bits = _mm512_set1_epi32(0x3f800000);
    const __m512i exp_bias = _mm512_set1_epi32(127);
    const __m512 one = _mm512_set1_ps(1.0f);

    for (; number < sixteenthPoints; number++) {
        __m512 aVal = _mm512_loadu_ps(aPtr);
        /* Classify special inputs: zeros, negatives, infinities, NaNs. */
        __mmask16 zero_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_EQ_OQ);
        __mmask16 neg_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OQ);
        __mmask16 inf_mask =
            _mm512_cmp_ps_mask(aVal, _mm512_set1_ps(INFINITY), _CMP_EQ_OQ);
        __mmask16 nan_mask = _mm512_cmp_ps_mask(aVal, aVal, _CMP_UNORD_Q);
        __mmask16 invalid_mask = _kor_mask16(neg_mask, nan_mask);

        __m512i aVal_i = _mm512_castps_si512(aVal);

        /* Unbiased exponent: (bits >> 23) - 127. */
        __m512i exp_i = _mm512_srli_epi32(_mm512_and_si512(aVal_i, exp_mask), 23);
        exp_i = _mm512_sub_epi32(exp_i, exp_bias);
        __m512 exp_f = _mm512_cvtepi32_ps(exp_i);

        /* Rebuild the mantissa as a float in [1.0, 2.0). */
        __m512 frac = _mm512_castsi512_ps(
            _mm512_or_si512(_mm512_and_si512(aVal_i, mant_mask), one_bits));
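
        /* Polynomial term for log2(m)/(m - 1): illustrative atanh-series
           stand-in (the original evaluation is elided from this listing). */
        __m512 mp1 = _mm512_add_ps(frac, one);
        __m512 t = _mm512_div_ps(_mm512_sub_ps(frac, one), mp1);
        __m512 t2 = _mm512_mul_ps(t, t);
        __m512 s = _mm512_set1_ps(1.0f / 7.0f);
        s = _mm512_fmadd_ps(s, t2, _mm512_set1_ps(1.0f / 5.0f));
        s = _mm512_fmadd_ps(s, t2, _mm512_set1_ps(1.0f / 3.0f));
        s = _mm512_fmadd_ps(s, t2, one);
        __m512 poly = _mm512_div_ps(_mm512_mul_ps(_mm512_set1_ps(2.88539008f), s), mp1);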

        /* log2(x) = exponent + poly * (mantissa - 1), fused. */
        __m512 bVal = _mm512_fmadd_ps(poly, _mm512_sub_ps(frac, one), exp_f);

        /* Special cases: log2(0) -> -127, log2(inf) -> 127, negative or NaN -> NaN. */
        bVal = _mm512_mask_blend_ps(zero_mask, bVal, _mm512_set1_ps(-127.0f));
        bVal = _mm512_mask_blend_ps(inf_mask, bVal, _mm512_set1_ps(127.0f));
        bVal = _mm512_mask_blend_ps(invalid_mask, bVal, _mm512_set1_ps(NAN));

        _mm512_storeu_ps(bPtr, bVal);
        aPtr += 16;
        bPtr += 16;
    }

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */

#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

static inline void
volk_32f_log2_32f_a_avx512(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    const __m512i exp_mask = _mm512_set1_epi32(0x7f800000);
    const __m512i mant_mask = _mm512_set1_epi32(0x007fffff);
    const __m512i one_bits = _mm512_set1_epi32(0x3f800000);
    const __m512i exp_bias = _mm512_set1_epi32(127);
    const __m512 one = _mm512_set1_ps(1.0f);

    for (; number < sixteenthPoints; number++) {
        __m512 aVal = _mm512_load_ps(aPtr);
        /* Classify special inputs: zeros, negatives, infinities, NaNs. */
        __mmask16 zero_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_EQ_OQ);
        __mmask16 neg_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OQ);
        __mmask16 inf_mask =
            _mm512_cmp_ps_mask(aVal, _mm512_set1_ps(INFINITY), _CMP_EQ_OQ);
        __mmask16 nan_mask = _mm512_cmp_ps_mask(aVal, aVal, _CMP_UNORD_Q);
        __mmask16 invalid_mask = _kor_mask16(neg_mask, nan_mask);

        __m512i aVal_i = _mm512_castps_si512(aVal);

        /* Unbiased exponent: (bits >> 23) - 127. */
        __m512i exp_i = _mm512_srli_epi32(_mm512_and_si512(aVal_i, exp_mask), 23);
        exp_i = _mm512_sub_epi32(exp_i, exp_bias);
        __m512 exp_f = _mm512_cvtepi32_ps(exp_i);

        /* Rebuild the mantissa as a float in [1.0, 2.0). */
        __m512 frac = _mm512_castsi512_ps(
            _mm512_or_si512(_mm512_and_si512(aVal_i, mant_mask), one_bits));
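
        /* Polynomial term for log2(m)/(m - 1): illustrative atanh-series
           stand-in (the original evaluation is elided from this listing). */
        __m512 mp1 = _mm512_add_ps(frac, one);
        __m512 t = _mm512_div_ps(_mm512_sub_ps(frac, one), mp1);
        __m512 t2 = _mm512_mul_ps(t, t);
        __m512 s = _mm512_set1_ps(1.0f / 7.0f);
        s = _mm512_fmadd_ps(s, t2, _mm512_set1_ps(1.0f / 5.0f));
        s = _mm512_fmadd_ps(s, t2, _mm512_set1_ps(1.0f / 3.0f));
        s = _mm512_fmadd_ps(s, t2, one);
        __m512 poly = _mm512_div_ps(_mm512_mul_ps(_mm512_set1_ps(2.88539008f), s), mp1);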

        /* log2(x) = exponent + poly * (mantissa - 1), fused. */
        __m512 bVal = _mm512_fmadd_ps(poly, _mm512_sub_ps(frac, one), exp_f);

        /* Special cases: log2(0) -> -127, log2(inf) -> 127, negative or NaN -> NaN. */
        bVal = _mm512_mask_blend_ps(zero_mask, bVal, _mm512_set1_ps(-127.0f));
        bVal = _mm512_mask_blend_ps(inf_mask, bVal, _mm512_set1_ps(127.0f));
        bVal = _mm512_mask_blend_ps(invalid_mask, bVal, _mm512_set1_ps(NAN));

        _mm512_store_ps(bPtr, bVal);
        aPtr += 16;
        bPtr += 16;
    }

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */

#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
#include <immintrin.h>

static inline void
volk_32f_log2_32f_u_avx512dq(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    const __m512i exp_mask = _mm512_set1_epi32(0x7f800000);
    const __m512i mant_mask = _mm512_set1_epi32(0x007fffff);
    const __m512i one_bits = _mm512_set1_epi32(0x3f800000);
    const __m512i exp_bias = _mm512_set1_epi32(127);
    const __m512 one = _mm512_set1_ps(1.0f);

    for (; number < sixteenthPoints; number++) {
        __m512 aVal = _mm512_loadu_ps(aPtr);
        /* vfpclassps category bits: 0x81 = quiet or signaling NaN,
           0x06 = +0 or -0, 0x08 = +infinity. */
        __mmask16 nan_mask = _mm512_fpclass_ps_mask(aVal, 0x81);
        __mmask16 zero_mask = _mm512_fpclass_ps_mask(aVal, 0x06);
        __mmask16 inf_mask = _mm512_fpclass_ps_mask(aVal, 0x08);
        __mmask16 neg_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OQ);
        __mmask16 invalid_mask = _kor_mask16(nan_mask, neg_mask);

        __m512i aVal_i = _mm512_castps_si512(aVal);

        /* Unbiased exponent: (bits >> 23) - 127. */
        __m512i exp_i = _mm512_srli_epi32(_mm512_and_si512(aVal_i, exp_mask), 23);
        exp_i = _mm512_sub_epi32(exp_i, exp_bias);
        __m512 exp_f = _mm512_cvtepi32_ps(exp_i);

        /* Rebuild the mantissa as a float in [1.0, 2.0). */
        __m512 frac = _mm512_castsi512_ps(
            _mm512_or_si512(_mm512_and_si512(aVal_i, mant_mask), one_bits));
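
        /* Polynomial term for log2(m)/(m - 1): illustrative atanh-series
           stand-in (the original evaluation is elided from this listing). */
        __m512 mp1 = _mm512_add_ps(frac, one);
        __m512 t = _mm512_div_ps(_mm512_sub_ps(frac, one), mp1);
        __m512 t2 = _mm512_mul_ps(t, t);
        __m512 s = _mm512_set1_ps(1.0f / 7.0f);
        s = _mm512_fmadd_ps(s, t2, _mm512_set1_ps(1.0f / 5.0f));
        s = _mm512_fmadd_ps(s, t2, _mm512_set1_ps(1.0f / 3.0f));
        s = _mm512_fmadd_ps(s, t2, one);
        __m512 poly = _mm512_div_ps(_mm512_mul_ps(_mm512_set1_ps(2.88539008f), s), mp1);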

        /* log2(x) = exponent + poly * (mantissa - 1), fused. */
        __m512 bVal = _mm512_fmadd_ps(poly, _mm512_sub_ps(frac, one), exp_f);

        /* Special cases: log2(0) -> -127, log2(inf) -> 127, negative or NaN -> NaN. */
        bVal = _mm512_mask_blend_ps(zero_mask, bVal, _mm512_set1_ps(-127.0f));
        bVal = _mm512_mask_blend_ps(inf_mask, bVal, _mm512_set1_ps(127.0f));
        bVal = _mm512_mask_blend_ps(invalid_mask, bVal, _mm512_set1_ps(NAN));

        _mm512_storeu_ps(bPtr, bVal);
        aPtr += 16;
        bPtr += 16;
    }

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ */

#if LV_HAVE_AVX512F && LV_HAVE_AVX512DQ
#include <immintrin.h>

static inline void
volk_32f_log2_32f_a_avx512dq(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    const __m512i exp_mask = _mm512_set1_epi32(0x7f800000);
    const __m512i mant_mask = _mm512_set1_epi32(0x007fffff);
    const __m512i one_bits = _mm512_set1_epi32(0x3f800000);
    const __m512i exp_bias = _mm512_set1_epi32(127);
    const __m512 one = _mm512_set1_ps(1.0f);

    for (; number < sixteenthPoints; number++) {
        __m512 aVal = _mm512_load_ps(aPtr);
        /* vfpclassps category bits: 0x81 = quiet or signaling NaN,
           0x06 = +0 or -0, 0x08 = +infinity. */
        __mmask16 nan_mask = _mm512_fpclass_ps_mask(aVal, 0x81);
        __mmask16 zero_mask = _mm512_fpclass_ps_mask(aVal, 0x06);
        __mmask16 inf_mask = _mm512_fpclass_ps_mask(aVal, 0x08);
        __mmask16 neg_mask = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OQ);
        __mmask16 invalid_mask = _kor_mask16(nan_mask, neg_mask);

        __m512i aVal_i = _mm512_castps_si512(aVal);

        /* Unbiased exponent: (bits >> 23) - 127. */
        __m512i exp_i = _mm512_srli_epi32(_mm512_and_si512(aVal_i, exp_mask), 23);
        exp_i = _mm512_sub_epi32(exp_i, exp_bias);
        __m512 exp_f = _mm512_cvtepi32_ps(exp_i);

        /* Rebuild the mantissa as a float in [1.0, 2.0). */
        __m512 frac = _mm512_castsi512_ps(
            _mm512_or_si512(_mm512_and_si512(aVal_i, mant_mask), one_bits));
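
        /* Polynomial term for log2(m)/(m - 1): illustrative atanh-series
           stand-in (the original evaluation is elided from this listing). */
        __m512 mp1 = _mm512_add_ps(frac, one);
        __m512 t = _mm512_div_ps(_mm512_sub_ps(frac, one), mp1);
        __m512 t2 = _mm512_mul_ps(t, t);
        __m512 s = _mm512_set1_ps(1.0f / 7.0f);
        s = _mm512_fmadd_ps(s, t2, _mm512_set1_ps(1.0f / 5.0f));
        s = _mm512_fmadd_ps(s, t2, _mm512_set1_ps(1.0f / 3.0f));
        s = _mm512_fmadd_ps(s, t2, one);
        __m512 poly = _mm512_div_ps(_mm512_mul_ps(_mm512_set1_ps(2.88539008f), s), mp1);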

        /* log2(x) = exponent + poly * (mantissa - 1), fused. */
        __m512 bVal = _mm512_fmadd_ps(poly, _mm512_sub_ps(frac, one), exp_f);

        /* Special cases: log2(0) -> -127, log2(inf) -> 127, negative or NaN -> NaN. */
        bVal = _mm512_mask_blend_ps(zero_mask, bVal, _mm512_set1_ps(-127.0f));
        bVal = _mm512_mask_blend_ps(inf_mask, bVal, _mm512_set1_ps(127.0f));
        bVal = _mm512_mask_blend_ps(invalid_mask, bVal, _mm512_set1_ps(NAN));

        _mm512_store_ps(bPtr, bVal);
        aPtr += 16;
        bPtr += 16;
    }

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}
#endif /* LV_HAVE_AVX512F && LV_HAVE_AVX512DQ */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    const int32x4_t exp_mask = vdupq_n_s32(0x7f800000);
    const int32x4_t mant_mask = vdupq_n_s32(0x007fffff);
    const int32x4_t one_bits = vdupq_n_s32(0x3f800000);
    const int32x4_t exp_bias = vdupq_n_s32(127);
    const float32x4_t one = vdupq_n_f32(1.0f);
    const float32x4_t zero = vdupq_n_f32(0.0f);
    const float32x4_t inf_val = vdupq_n_f32(INFINITY);
    const float32x4_t nan_val = vdupq_n_f32(NAN);
    const float32x4_t neg_inf_out = vdupq_n_f32(-127.0f);
    const float32x4_t pos_inf_out = vdupq_n_f32(127.0f);

    for (number = 0; number < quarterPoints; ++number) {
        float32x4_t aVal = vld1q_f32(aPtr);
        /* Classify special inputs: zeros, negatives, infinities, NaNs. */
        uint32x4_t neg_mask = vcltq_f32(aVal, zero);
        uint32x4_t zero_mask = vceqq_f32(aVal, zero);
        uint32x4_t inf_mask = vceqq_f32(aVal, inf_val);
        uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aVal, aVal));
        uint32x4_t invalid_mask = vorrq_u32(neg_mask, nan_mask);

        int32x4_t aVal_i = vreinterpretq_s32_f32(aVal);

        /* Unbiased exponent: (bits >> 23) - 127. */
        int32x4_t exp_i = vshrq_n_s32(vandq_s32(aVal_i, exp_mask), 23);
        exp_i = vsubq_s32(exp_i, exp_bias);
        float32x4_t exp_f = vcvtq_f32_s32(exp_i);

        /* Rebuild the mantissa as a float in [1.0, 2.0). */
        int32x4_t frac_i = vorrq_s32(vandq_s32(aVal_i, mant_mask), one_bits);
        float32x4_t frac = vreinterpretq_f32_s32(frac_i);
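
        /* Polynomial term for log2(m)/(m - 1): illustrative atanh-series
           stand-in (the original evaluation is elided from this listing).
           ARMv7 NEON has no vector divide, so 1/(m + 1) uses a reciprocal
           estimate refined by two Newton steps. */
        float32x4_t mp1 = vaddq_f32(frac, one);
        float32x4_t r = vrecpeq_f32(mp1);
        r = vmulq_f32(r, vrecpsq_f32(mp1, r));
        r = vmulq_f32(r, vrecpsq_f32(mp1, r));
        float32x4_t t = vmulq_f32(vsubq_f32(frac, one), r);
        float32x4_t t2 = vmulq_f32(t, t);
        float32x4_t s = vdupq_n_f32(1.0f / 7.0f);
        s = vmlaq_f32(vdupq_n_f32(1.0f / 5.0f), s, t2);
        s = vmlaq_f32(vdupq_n_f32(1.0f / 3.0f), s, t2);
        s = vmlaq_f32(one, s, t2);
        float32x4_t poly = vmulq_f32(vmulq_f32(vdupq_n_f32(2.88539008f), s), r);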

        /* log2(x) = exponent + poly * (mantissa - 1). */
        float32x4_t bVal = vaddq_f32(exp_f, vmulq_f32(poly, vsubq_f32(frac, one)));

        /* Special cases: log2(0) -> -127, log2(inf) -> 127, negative or NaN -> NaN. */
        bVal = vbslq_f32(zero_mask, neg_inf_out, bVal);
        bVal = vbslq_f32(inf_mask, pos_inf_out, bVal);
        bVal = vbslq_f32(invalid_mask, nan_val, bVal);

        vst1q_f32(bPtr, bVal);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

static inline void
volk_32f_log2_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    const int32x4_t exp_mask = vdupq_n_s32(0x7f800000);
    const int32x4_t mant_mask = vdupq_n_s32(0x007fffff);
    const int32x4_t one_bits = vdupq_n_s32(0x3f800000);
    const int32x4_t exp_bias = vdupq_n_s32(127);
    const float32x4_t one = vdupq_n_f32(1.0f);
    const float32x4_t zero = vdupq_n_f32(0.0f);
    const float32x4_t inf_val = vdupq_n_f32(INFINITY);
    const float32x4_t nan_val = vdupq_n_f32(NAN);
    const float32x4_t neg_inf_out = vdupq_n_f32(-127.0f);
    const float32x4_t pos_inf_out = vdupq_n_f32(127.0f);

    for (number = 0; number < quarterPoints; ++number) {
        float32x4_t aVal = vld1q_f32(aPtr);
        /* Classify special inputs: zeros, negatives, infinities, NaNs. */
        uint32x4_t neg_mask = vcltq_f32(aVal, zero);
        uint32x4_t zero_mask = vceqq_f32(aVal, zero);
        uint32x4_t inf_mask = vceqq_f32(aVal, inf_val);
        uint32x4_t nan_mask = vmvnq_u32(vceqq_f32(aVal, aVal));
        uint32x4_t invalid_mask = vorrq_u32(neg_mask, nan_mask);

        int32x4_t aVal_i = vreinterpretq_s32_f32(aVal);

        /* Unbiased exponent: (bits >> 23) - 127. */
        int32x4_t exp_i = vshrq_n_s32(vandq_s32(aVal_i, exp_mask), 23);
        exp_i = vsubq_s32(exp_i, exp_bias);
        float32x4_t exp_f = vcvtq_f32_s32(exp_i);

        /* Rebuild the mantissa as a float in [1.0, 2.0). */
        int32x4_t frac_i = vorrq_s32(vandq_s32(aVal_i, mant_mask), one_bits);
        float32x4_t frac = vreinterpretq_f32_s32(frac_i);

        /* Polynomial term approximating log2(m)/(m - 1); the helper's
           definition is elided from this listing. */
        float32x4_t poly = _vlog2_poly_neonv8(frac);

        /* log2(x) = exponent + poly * (mantissa - 1), fused. */
        float32x4_t bVal = vfmaq_f32(exp_f, poly, vsubq_f32(frac, one));

        /* Special cases: log2(0) -> -127, log2(inf) -> 127, negative or NaN -> NaN. */
        bVal = vbslq_f32(zero_mask, neg_inf_out, bVal);
        bVal = vbslq_f32(inf_mask, pos_inf_out, bVal);
        bVal = vbslq_f32(invalid_mask, nan_val, bVal);

        vst1q_f32(bPtr, bVal);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = log2f(*aPtr++);
    }
}
#endif /* LV_HAVE_NEONV8 */

#endif /* INCLUDED_volk_32f_log2_32f_a_H */

#ifndef INCLUDED_volk_32f_log2_32f_u_H
#define INCLUDED_volk_32f_log2_32f_u_H

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void
volk_32f_log2_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    const vfloat32m2_t one = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
    const vint32m2_t one_bits = __riscv_vreinterpret_i32m2(one);
    const vint32m2_t mant_mask = __riscv_vmv_v_x_i32m2(0x7FFFFF, vlmax);
    const vint32m2_t exp_bias = __riscv_vmv_v_x_i32m2(127, vlmax);

    const vfloat32m2_t zero = __riscv_vfmv_v_f_f32m2(0.0f, vlmax);
    const vfloat32m2_t inf_val = __riscv_vfmv_v_f_f32m2(INFINITY, vlmax);
    const vfloat32m2_t nan_val = __riscv_vfmv_v_f_f32m2(NAN, vlmax);
    const vfloat32m2_t neg_inf_out = __riscv_vfmv_v_f_f32m2(-127.0f, vlmax);
    const vfloat32m2_t pos_inf_out = __riscv_vfmv_v_f_f32m2(127.0f, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
        /* Classify special inputs: zeros, negatives, infinities, NaNs. */
        vbool16_t zero_mask = __riscv_vmfeq(v, zero, vl);
        vbool16_t neg_mask = __riscv_vmflt(v, zero, vl);
        vbool16_t inf_mask = __riscv_vmfeq(v, inf_val, vl);
        vbool16_t nan_mask = __riscv_vmfne(v, v, vl);
        vbool16_t invalid_mask = __riscv_vmor(neg_mask, nan_mask, vl);

        /* Unbiased exponent of |v|: (bits >> 23) - 127. */
        vfloat32m2_t a = __riscv_vfabs(v, vl);
        vfloat32m2_t exp_f = __riscv_vfcvt_f(
            __riscv_vsub(
                __riscv_vsra(__riscv_vreinterpret_i32m2(a), 23, vl), exp_bias, vl),
            vl);

        /* Rebuild the mantissa as a float in [1.0, 2.0). */
        vfloat32m2_t frac = __riscv_vreinterpret_f32m2(__riscv_vor(
            __riscv_vand(__riscv_vreinterpret_i32m2(v), mant_mask, vl), one_bits, vl));
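
        /* Polynomial term for log2(m)/(m - 1): illustrative atanh-series
           stand-in (the original evaluation is elided from this listing). */
        vfloat32m2_t mp1 = __riscv_vfadd(frac, one, vl);
        vfloat32m2_t t = __riscv_vfdiv(__riscv_vfsub(frac, one, vl), mp1, vl);
        vfloat32m2_t t2 = __riscv_vfmul(t, t, vl);
        vfloat32m2_t s = __riscv_vfmv_v_f_f32m2(1.0f / 7.0f, vl);
        s = __riscv_vfmadd(s, t2, __riscv_vfmv_v_f_f32m2(1.0f / 5.0f, vl), vl);
        s = __riscv_vfmadd(s, t2, __riscv_vfmv_v_f_f32m2(1.0f / 3.0f, vl), vl);
        s = __riscv_vfmadd(s, t2, one, vl);
        vfloat32m2_t poly = __riscv_vfdiv(
            __riscv_vfmul(s, __riscv_vfmv_v_f_f32m2(2.88539008f, vl), vl), mp1, vl);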

        /* log2(x) = exponent + poly * (mantissa - 1), fused. */
        vfloat32m2_t result =
            __riscv_vfmacc(exp_f, poly, __riscv_vfsub(frac, one, vl), vl);

        /* Special cases: log2(0) -> -127, log2(inf) -> 127, negative or NaN -> NaN. */
        result = __riscv_vmerge(result, neg_inf_out, zero_mask, vl);
        result = __riscv_vmerge(result, pos_inf_out, inf_mask, vl);
        result = __riscv_vmerge(result, nan_val, invalid_mask, vl);

        __riscv_vse32(bVector, result, vl);
    }
}
#endif /* LV_HAVE_RVV */

#endif /* INCLUDED_volk_32f_log2_32f_u_H */